Default to dwarf version 4 on hppa64-hpux
[official-gcc.git] / gcc / tree-vect-loop-manip.c
blob4988c93fdb61507a26430651b416ae61b217793a
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
360 gimple_seq_add_stmt (seq, stmt);
362 return true;
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
382 return true;
384 return false;
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
400 It is known that:
402 NITERS * RGC->max_nscalars_per_iter * RGC->factor
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
407 VF * RGC->max_nscalars_per_iter * RGC->factor
409 might overflow before hitting a value above:
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks or zero
415 lengths for RGC.
417 Note: the cost of the code generated by this function is modeled
418 by vect_estimate_min_profitable_iters, so changes here may need
419 corresponding changes there. */
421 static tree
422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 gimple_seq *preheader_seq,
424 gimple_stmt_iterator loop_cond_gsi,
425 rgroup_controls *rgc, tree niters,
426 tree niters_skip, bool might_wrap_p)
428 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
429 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
430 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
432 tree ctrl_type = rgc->type;
433 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
434 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
435 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
436 tree length_limit = NULL_TREE;
437 /* For length, we need length_limit to ensure length in range. */
438 if (!use_masks_p)
439 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
441 /* Calculate the maximum number of item values that the rgroup
442 handles in total, the number that it handles for each iteration
443 of the vector loop, and the number that it should skip during the
444 first iteration of the vector loop. */
445 tree nitems_total = niters;
446 tree nitems_step = build_int_cst (iv_type, vf);
447 tree nitems_skip = niters_skip;
448 if (nitems_per_iter != 1)
450 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
451 these multiplications don't overflow. */
452 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
453 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
454 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
455 nitems_total, compare_factor);
456 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
457 nitems_step, iv_factor);
458 if (nitems_skip)
459 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
460 nitems_skip, compare_factor);
463 /* Create an induction variable that counts the number of items
464 processed. */
465 tree index_before_incr, index_after_incr;
466 gimple_stmt_iterator incr_gsi;
467 bool insert_after;
468 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
469 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
470 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
472 tree zero_index = build_int_cst (compare_type, 0);
473 tree test_index, test_limit, first_limit;
474 gimple_stmt_iterator *test_gsi;
475 if (might_wrap_p)
477 /* In principle the loop should stop iterating once the incremented
478 IV reaches a value greater than or equal to:
480 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
482 However, there's no guarantee that this addition doesn't overflow
483 the comparison type, or that the IV hits a value above it before
484 wrapping around. We therefore adjust the limit down by one
485 IV step:
487 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
488 -[infinite-prec] NITEMS_STEP
490 and compare the IV against this limit _before_ incrementing it.
491 Since the comparison type is unsigned, we actually want the
492 subtraction to saturate at zero:
494 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
495 -[sat] NITEMS_STEP
497 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
499 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
501 where the rightmost subtraction can be done directly in
502 COMPARE_TYPE. */
503 test_index = index_before_incr;
504 tree adjust = gimple_convert (preheader_seq, compare_type,
505 nitems_step);
506 if (nitems_skip)
507 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
508 adjust, nitems_skip);
509 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
510 nitems_total, adjust);
511 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
512 test_limit, adjust);
513 test_gsi = &incr_gsi;
515 /* Get a safe limit for the first iteration. */
516 if (nitems_skip)
518 /* The first vector iteration can handle at most NITEMS_STEP
519 items. NITEMS_STEP <= CONST_LIMIT, and adding
520 NITEMS_SKIP to that cannot overflow. */
521 tree const_limit = build_int_cst (compare_type,
522 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
523 * nitems_per_iter);
524 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
525 nitems_total, const_limit);
526 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
527 first_limit, nitems_skip);
529 else
530 /* For the first iteration it doesn't matter whether the IV hits
531 a value above NITEMS_TOTAL. That only matters for the latch
532 condition. */
533 first_limit = nitems_total;
535 else
537 /* Test the incremented IV, which will always hit a value above
538 the bound before wrapping. */
539 test_index = index_after_incr;
540 test_limit = nitems_total;
541 if (nitems_skip)
542 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
543 test_limit, nitems_skip);
544 test_gsi = &loop_cond_gsi;
546 first_limit = test_limit;
549 /* Convert the IV value to the comparison type (either a no-op or
550 a demotion). */
551 gimple_seq test_seq = NULL;
552 test_index = gimple_convert (&test_seq, compare_type, test_index);
553 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
555 /* Provide a definition of each control in the group. */
556 tree next_ctrl = NULL_TREE;
557 tree ctrl;
558 unsigned int i;
559 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
561 /* Previous controls will cover BIAS items. This control covers the
562 next batch. */
563 poly_uint64 bias = nitems_per_ctrl * i;
564 tree bias_tree = build_int_cst (compare_type, bias);
566 /* See whether the first iteration of the vector loop is known
567 to have a full control. */
568 poly_uint64 const_limit;
569 bool first_iteration_full
570 = (poly_int_tree_p (first_limit, &const_limit)
571 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
573 /* Rather than have a new IV that starts at BIAS and goes up to
574 TEST_LIMIT, prefer to use the same 0-based IV for each control
575 and adjust the bound down by BIAS. */
576 tree this_test_limit = test_limit;
577 if (i != 0)
579 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
580 compare_type, this_test_limit,
581 bias_tree);
582 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
583 compare_type, this_test_limit,
584 bias_tree);
587 /* Create the initial control. First include all items that
588 are within the loop limit. */
589 tree init_ctrl = NULL_TREE;
590 if (!first_iteration_full)
592 tree start, end;
593 if (first_limit == test_limit)
595 /* Use a natural test between zero (the initial IV value)
596 and the loop limit. The "else" block would be valid too,
597 but this choice can avoid the need to load BIAS_TREE into
598 a register. */
599 start = zero_index;
600 end = this_test_limit;
602 else
604 /* FIRST_LIMIT is the maximum number of items handled by the
605 first iteration of the vector loop. Test the portion
606 associated with this control. */
607 start = bias_tree;
608 end = first_limit;
611 if (use_masks_p)
612 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
613 start, end, "max_mask");
614 else
616 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
617 gimple_seq seq = vect_gen_len (init_ctrl, start,
618 end, length_limit);
619 gimple_seq_add_seq (preheader_seq, seq);
623 /* Now AND out the bits that are within the number of skipped
624 items. */
625 poly_uint64 const_skip;
626 if (nitems_skip
627 && !(poly_int_tree_p (nitems_skip, &const_skip)
628 && known_le (const_skip, bias)))
630 gcc_assert (use_masks_p);
631 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
632 bias_tree, nitems_skip);
633 if (init_ctrl)
634 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
635 init_ctrl, unskipped_mask);
636 else
637 init_ctrl = unskipped_mask;
640 if (!init_ctrl)
642 /* First iteration is full. */
643 if (use_masks_p)
644 init_ctrl = build_minus_one_cst (ctrl_type);
645 else
646 init_ctrl = length_limit;
649 /* Get the control value for the next iteration of the loop. */
650 if (use_masks_p)
652 gimple_seq stmts = NULL;
653 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
654 this_test_limit, "next_mask");
655 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
657 else
659 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
660 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
661 length_limit);
662 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
665 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
667 return next_ctrl;
670 /* Set up the iteration condition and rgroup controls for LOOP, given
671 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
672 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
673 the number of iterations of the original scalar loop that should be
674 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
675 for vect_set_loop_condition.
677 Insert the branch-back condition before LOOP_COND_GSI and return the
678 final gcond. */
680 static gcond *
681 vect_set_loop_condition_partial_vectors (class loop *loop,
682 loop_vec_info loop_vinfo, tree niters,
683 tree final_iv, bool niters_maybe_zero,
684 gimple_stmt_iterator loop_cond_gsi)
686 gimple_seq preheader_seq = NULL;
687 gimple_seq header_seq = NULL;
689 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
690 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
691 unsigned int compare_precision = TYPE_PRECISION (compare_type);
692 tree orig_niters = niters;
694 /* Type of the initial value of NITERS. */
695 tree ni_actual_type = TREE_TYPE (niters);
696 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
697 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
699 /* Convert NITERS to the same size as the compare. */
700 if (compare_precision > ni_actual_precision
701 && niters_maybe_zero)
703 /* We know that there is always at least one iteration, so if the
704 count is zero then it must have wrapped. Cope with this by
705 subtracting 1 before the conversion and adding 1 to the result. */
706 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
707 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
708 niters, build_minus_one_cst (ni_actual_type));
709 niters = gimple_convert (&preheader_seq, compare_type, niters);
710 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
711 niters, build_one_cst (compare_type));
713 else
714 niters = gimple_convert (&preheader_seq, compare_type, niters);
716 /* Iterate over all the rgroups and fill in their controls. We could use
717 the first control from any rgroup for the loop condition; here we
718 arbitrarily pick the last. */
719 tree test_ctrl = NULL_TREE;
720 rgroup_controls *rgc;
721 unsigned int i;
722 auto_vec<rgroup_controls> *controls = use_masks_p
723 ? &LOOP_VINFO_MASKS (loop_vinfo)
724 : &LOOP_VINFO_LENS (loop_vinfo);
725 FOR_EACH_VEC_ELT (*controls, i, rgc)
726 if (!rgc->controls.is_empty ())
728 /* First try using permutes. This adds a single vector
729 instruction to the loop for each mask, but needs no extra
730 loop invariants or IVs. */
731 unsigned int nmasks = i + 1;
732 if (use_masks_p && (nmasks & 1) == 0)
734 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
735 if (!half_rgc->controls.is_empty ()
736 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
737 continue;
740 /* See whether zero-based IV would ever generate all-false masks
741 or zero length before wrapping around. */
742 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
744 /* Set up all controls for this group. */
745 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
746 &preheader_seq,
747 loop_cond_gsi, rgc,
748 niters, niters_skip,
749 might_wrap_p);
752 /* Emit all accumulated statements. */
753 add_preheader_seq (loop, preheader_seq);
754 add_header_seq (loop, header_seq);
756 /* Get a boolean result that tells us whether to iterate. */
757 edge exit_edge = single_exit (loop);
758 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
759 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
760 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
761 NULL_TREE, NULL_TREE);
762 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
764 /* The loop iterates (NITERS - 1) / VF + 1 times.
765 Subtract one from this to get the latch count. */
766 tree step = build_int_cst (compare_type,
767 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
768 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
769 build_minus_one_cst (compare_type));
770 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
771 niters_minus_one, step);
773 if (final_iv)
775 gassign *assign = gimple_build_assign (final_iv, orig_niters);
776 gsi_insert_on_edge_immediate (single_exit (loop), assign);
779 return cond_stmt;
782 /* Like vect_set_loop_condition, but handle the case in which the vector
783 loop handles exactly VF scalars per iteration. */
785 static gcond *
786 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
787 tree final_iv, bool niters_maybe_zero,
788 gimple_stmt_iterator loop_cond_gsi)
790 tree indx_before_incr, indx_after_incr;
791 gcond *cond_stmt;
792 gcond *orig_cond;
793 edge pe = loop_preheader_edge (loop);
794 edge exit_edge = single_exit (loop);
795 gimple_stmt_iterator incr_gsi;
796 bool insert_after;
797 enum tree_code code;
798 tree niters_type = TREE_TYPE (niters);
800 orig_cond = get_loop_exit_condition (loop);
801 gcc_assert (orig_cond);
802 loop_cond_gsi = gsi_for_stmt (orig_cond);
804 tree init, limit;
805 if (!niters_maybe_zero && integer_onep (step))
807 /* In this case we can use a simple 0-based IV:
810 x = 0;
814 x += 1;
816 while (x < NITERS); */
817 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
818 init = build_zero_cst (niters_type);
819 limit = niters;
821 else
823 /* The following works for all values of NITERS except 0:
826 x = 0;
830 x += STEP;
832 while (x <= NITERS - STEP);
834 so that the loop continues to iterate if x + STEP - 1 < NITERS
835 but stops if x + STEP - 1 >= NITERS.
837 However, if NITERS is zero, x never hits a value above NITERS - STEP
838 before wrapping around. There are two obvious ways of dealing with
839 this:
841 - start at STEP - 1 and compare x before incrementing it
842 - start at -1 and compare x after incrementing it
844 The latter is simpler and is what we use. The loop in this case
845 looks like:
848 x = -1;
852 x += STEP;
854 while (x < NITERS - STEP);
856 In both cases the loop limit is NITERS - STEP. */
857 gimple_seq seq = NULL;
858 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
859 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
860 if (seq)
862 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
863 gcc_assert (!new_bb);
865 if (niters_maybe_zero)
867 /* Case C. */
868 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
869 init = build_all_ones_cst (niters_type);
871 else
873 /* Case B. */
874 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
875 init = build_zero_cst (niters_type);
879 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
880 create_iv (init, step, NULL_TREE, loop,
881 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
882 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
883 true, NULL_TREE, true,
884 GSI_SAME_STMT);
885 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
886 true, GSI_SAME_STMT);
888 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
889 NULL_TREE);
891 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
893 /* Record the number of latch iterations. */
894 if (limit == niters)
895 /* Case A: the loop iterates NITERS times. Subtract one to get the
896 latch count. */
897 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
898 build_int_cst (niters_type, 1));
899 else
900 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
901 Subtract one from this to get the latch count. */
902 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
903 limit, step);
905 if (final_iv)
907 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
908 indx_after_incr, init);
909 gsi_insert_on_edge_immediate (single_exit (loop), assign);
912 return cond_stmt;
915 /* If we're using fully-masked loops, make LOOP iterate:
917 N == (NITERS - 1) / STEP + 1
919 times. When NITERS is zero, this is equivalent to making the loop
920 execute (1 << M) / STEP times, where M is the precision of NITERS.
921 NITERS_MAYBE_ZERO is true if this last case might occur.
923 If we're not using fully-masked loops, make LOOP iterate:
925 N == (NITERS - STEP) / STEP + 1
927 times, where NITERS is known to be outside the range [1, STEP - 1].
928 This is equivalent to making the loop execute NITERS / STEP times
929 when NITERS is nonzero and (1 << M) / STEP times otherwise.
930 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
932 If FINAL_IV is nonnull, it is an SSA name that should be set to
933 N * STEP on exit from the loop.
935 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
937 void
938 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
939 tree niters, tree step, tree final_iv,
940 bool niters_maybe_zero)
942 gcond *cond_stmt;
943 gcond *orig_cond = get_loop_exit_condition (loop);
944 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
946 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
947 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
948 niters, final_iv,
949 niters_maybe_zero,
950 loop_cond_gsi);
951 else
952 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
953 niters_maybe_zero,
954 loop_cond_gsi);
956 /* Remove old loop exit test. */
957 stmt_vec_info orig_cond_info;
958 if (loop_vinfo
959 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
960 loop_vinfo->remove_stmt (orig_cond_info);
961 else
962 gsi_remove (&loop_cond_gsi, true);
964 if (dump_enabled_p ())
965 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
966 cond_stmt);
969 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
970 For all PHI arguments in FROM->dest and TO->dest from those
971 edges ensure that TO->dest PHI arguments have current_def
972 to that in from. */
974 static void
975 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
977 gimple_stmt_iterator gsi_from, gsi_to;
979 for (gsi_from = gsi_start_phis (from->dest),
980 gsi_to = gsi_start_phis (to->dest);
981 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
983 gimple *from_phi = gsi_stmt (gsi_from);
984 gimple *to_phi = gsi_stmt (gsi_to);
985 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
986 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
987 if (virtual_operand_p (from_arg))
989 gsi_next (&gsi_from);
990 continue;
992 if (virtual_operand_p (to_arg))
994 gsi_next (&gsi_to);
995 continue;
997 if (TREE_CODE (from_arg) != SSA_NAME)
998 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
999 else if (TREE_CODE (to_arg) == SSA_NAME
1000 && from_arg != to_arg)
1002 if (get_current_def (to_arg) == NULL_TREE)
1004 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1005 TREE_TYPE (get_current_def
1006 (from_arg))));
1007 set_current_def (to_arg, get_current_def (from_arg));
1010 gsi_next (&gsi_from);
1011 gsi_next (&gsi_to);
1014 gphi *from_phi = get_virtual_phi (from->dest);
1015 gphi *to_phi = get_virtual_phi (to->dest);
1016 if (from_phi)
1017 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1018 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1022 /* Given LOOP this function generates a new copy of it and puts it
1023 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1024 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1025 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1026 entry or exit of LOOP. */
1028 class loop *
1029 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1030 class loop *scalar_loop, edge e)
1032 class loop *new_loop;
1033 basic_block *new_bbs, *bbs, *pbbs;
1034 bool at_exit;
1035 bool was_imm_dom;
1036 basic_block exit_dest;
1037 edge exit, new_exit;
1038 bool duplicate_outer_loop = false;
1040 exit = single_exit (loop);
1041 at_exit = (e == exit);
1042 if (!at_exit && e != loop_preheader_edge (loop))
1043 return NULL;
1045 if (scalar_loop == NULL)
1046 scalar_loop = loop;
1048 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1049 pbbs = bbs + 1;
1050 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1051 /* Allow duplication of outer loops. */
1052 if (scalar_loop->inner)
1053 duplicate_outer_loop = true;
1054 /* Check whether duplication is possible. */
1055 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1057 free (bbs);
1058 return NULL;
1061 /* Generate new loop structure. */
1062 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1063 duplicate_subloops (scalar_loop, new_loop);
1065 exit_dest = exit->dest;
1066 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1067 exit_dest) == loop->header ?
1068 true : false);
1070 /* Also copy the pre-header, this avoids jumping through hoops to
1071 duplicate the loop entry PHI arguments. Create an empty
1072 pre-header unconditionally for this. */
1073 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1074 edge entry_e = single_pred_edge (preheader);
1075 bbs[0] = preheader;
1076 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1078 exit = single_exit (scalar_loop);
1079 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1080 &exit, 1, &new_exit, NULL,
1081 at_exit ? loop->latch : e->src, true);
1082 exit = single_exit (loop);
1083 basic_block new_preheader = new_bbs[0];
1085 /* Before installing PHI arguments make sure that the edges
1086 into them match that of the scalar loop we analyzed. This
1087 makes sure the SLP tree matches up between the main vectorized
1088 loop and the epilogue vectorized copies. */
1089 if (single_succ_edge (preheader)->dest_idx
1090 != single_succ_edge (new_bbs[0])->dest_idx)
1092 basic_block swap_bb = new_bbs[1];
1093 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1094 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1095 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1096 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1098 if (duplicate_outer_loop)
1100 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1101 if (loop_preheader_edge (scalar_loop)->dest_idx
1102 != loop_preheader_edge (new_inner_loop)->dest_idx)
1104 basic_block swap_bb = new_inner_loop->header;
1105 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1106 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1107 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1108 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1112 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1114 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1115 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1116 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1118 if (scalar_loop != loop)
1120 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1121 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1122 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1123 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1124 header) to have current_def set, so copy them over. */
1125 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1126 exit);
1127 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1129 EDGE_SUCC (loop->latch, 0));
1132 if (at_exit) /* Add the loop copy at exit. */
1134 if (scalar_loop != loop)
1136 gphi_iterator gsi;
1137 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1139 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1140 gsi_next (&gsi))
1142 gphi *phi = gsi.phi ();
1143 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1144 location_t orig_locus
1145 = gimple_phi_arg_location_from_edge (phi, e);
1147 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1150 redirect_edge_and_branch_force (e, new_preheader);
1151 flush_pending_stmts (e);
1152 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1153 if (was_imm_dom || duplicate_outer_loop)
1154 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1156 /* And remove the non-necessary forwarder again. Keep the other
1157 one so we have a proper pre-header for the loop at the exit edge. */
1158 redirect_edge_pred (single_succ_edge (preheader),
1159 single_pred (preheader));
1160 delete_basic_block (preheader);
1161 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1162 loop_preheader_edge (scalar_loop)->src);
1164 else /* Add the copy at entry. */
1166 if (scalar_loop != loop)
1168 /* Remove the non-necessary forwarder of scalar_loop again. */
1169 redirect_edge_pred (single_succ_edge (preheader),
1170 single_pred (preheader));
1171 delete_basic_block (preheader);
1172 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1173 loop_preheader_edge (scalar_loop)->src);
1174 preheader = split_edge (loop_preheader_edge (loop));
1175 entry_e = single_pred_edge (preheader);
1178 redirect_edge_and_branch_force (entry_e, new_preheader);
1179 flush_pending_stmts (entry_e);
1180 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1182 redirect_edge_and_branch_force (new_exit, preheader);
1183 flush_pending_stmts (new_exit);
1184 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1186 /* And remove the non-necessary forwarder again. Keep the other
1187 one so we have a proper pre-header for the loop at the exit edge. */
1188 redirect_edge_pred (single_succ_edge (new_preheader),
1189 single_pred (new_preheader));
1190 delete_basic_block (new_preheader);
1191 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1192 loop_preheader_edge (new_loop)->src);
1195 if (scalar_loop != loop)
1197 /* Update new_loop->header PHIs, so that on the preheader
1198 edge they are the ones from loop rather than scalar_loop. */
1199 gphi_iterator gsi_orig, gsi_new;
1200 edge orig_e = loop_preheader_edge (loop);
1201 edge new_e = loop_preheader_edge (new_loop);
1203 for (gsi_orig = gsi_start_phis (loop->header),
1204 gsi_new = gsi_start_phis (new_loop->header);
1205 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1206 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1208 gphi *orig_phi = gsi_orig.phi ();
1209 gphi *new_phi = gsi_new.phi ();
1210 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1211 location_t orig_locus
1212 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1214 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1218 free (new_bbs);
1219 free (bbs);
1221 checking_verify_dominators (CDI_DOMINATORS);
1223 return new_loop;
1227 /* Given the condition expression COND, put it as the last statement of
1228 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1229 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1230 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1231 new edge as irreducible if IRREDUCIBLE_P is true. */
1233 static edge
1234 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1235 basic_block guard_to, basic_block dom_bb,
1236 profile_probability probability, bool irreducible_p)
1238 gimple_stmt_iterator gsi;
1239 edge new_e, enter_e;
1240 gcond *cond_stmt;
1241 gimple_seq gimplify_stmt_list = NULL;
1243 enter_e = EDGE_SUCC (guard_bb, 0);
1244 enter_e->flags &= ~EDGE_FALLTHRU;
1245 enter_e->flags |= EDGE_FALSE_VALUE;
1246 gsi = gsi_last_bb (guard_bb);
1248 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1249 NULL_TREE);
1250 if (gimplify_stmt_list)
1251 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1253 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1254 gsi = gsi_last_bb (guard_bb);
1255 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1257 /* Add new edge to connect guard block to the merge/loop-exit block. */
1258 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1260 new_e->probability = probability;
1261 if (irreducible_p)
1262 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1264 enter_e->probability = probability.invert ();
1265 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1267 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1268 if (enter_e->dest->loop_father->header == enter_e->dest)
1269 split_edge (enter_e);
1271 return new_e;
1275 /* This function verifies that the following restrictions apply to LOOP:
1276 (1) it consists of exactly 2 basic blocks - header, and an empty latch
1277 for innermost loop and 5 basic blocks for outer-loop.
1278 (2) it is single entry, single exit
1279 (3) its exit condition is the last stmt in the header
1280 (4) E is the entry/exit edge of LOOP.
1283 bool
1284 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1286 edge exit_e = single_exit (loop);
1287 edge entry_e = loop_preheader_edge (loop);
1288 gcond *orig_cond = get_loop_exit_condition (loop);
1289 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1290 unsigned int num_bb = loop->inner? 5 : 2;
1292 /* All loops have an outer scope; the only case loop->outer is NULL is for
1293 the function itself. */
1294 if (!loop_outer (loop)
1295 || loop->num_nodes != num_bb
1296 || !empty_block_p (loop->latch)
1297 || !single_exit (loop)
1298 /* Verify that new loop exit condition can be trivially modified. */
1299 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1300 || (e != exit_e && e != entry_e))
1301 return false;
1303 return true;
1306 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1307 in the exit bb and rename all the uses after the loop. This simplifies
1308 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1309 (but normally loop closed SSA form doesn't require virtual PHIs to be
1310 in the same form). Doing this early simplifies the checking what
1311 uses should be renamed.
1313 If we create a new phi after the loop, return the definition that
1314 applies on entry to the loop, otherwise return null. */
1316 static tree
1317 create_lcssa_for_virtual_phi (class loop *loop)
1319 gphi_iterator gsi;
1320 edge exit_e = single_exit (loop);
1322 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1323 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1325 gphi *phi = gsi.phi ();
1326 for (gsi = gsi_start_phis (exit_e->dest);
1327 !gsi_end_p (gsi); gsi_next (&gsi))
1328 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1329 break;
1330 if (gsi_end_p (gsi))
1332 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1333 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1334 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1335 imm_use_iterator imm_iter;
1336 gimple *stmt;
1337 use_operand_p use_p;
1339 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1340 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1341 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1342 gimple_phi_set_result (new_phi, new_vop);
1343 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1344 if (stmt != new_phi
1345 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1346 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1347 SET_USE (use_p, new_vop);
1349 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1351 break;
1353 return NULL_TREE;
1356 /* Function vect_get_loop_location.
1358 Extract the location of the loop in the source code.
1359 If the loop is not well formed for vectorization, an estimated
1360 location is calculated.
1361 Return the loop location if succeed and NULL if not. */
1363 dump_user_location_t
1364 find_loop_location (class loop *loop)
1366 gimple *stmt = NULL;
1367 basic_block bb;
1368 gimple_stmt_iterator si;
1370 if (!loop)
1371 return dump_user_location_t ();
1373 stmt = get_loop_exit_condition (loop);
1375 if (stmt
1376 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1377 return stmt;
1379 /* If we got here the loop is probably not "well formed",
1380 try to estimate the loop location */
1382 if (!loop->header)
1383 return dump_user_location_t ();
1385 bb = loop->header;
1387 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1389 stmt = gsi_stmt (si);
1390 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1391 return stmt;
1394 return dump_user_location_t ();
1397 /* Return true if the phi described by STMT_INFO defines an IV of the
1398 loop to be vectorized. */
1400 static bool
1401 iv_phi_p (stmt_vec_info stmt_info)
1403 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1404 if (virtual_operand_p (PHI_RESULT (phi)))
1405 return false;
1407 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1408 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1409 return false;
1411 return true;
1414 /* Function vect_can_advance_ivs_p
1416 In case the number of iterations that LOOP iterates is unknown at compile
1417 time, an epilog loop will be generated, and the loop induction variables
1418 (IVs) will be "advanced" to the value they are supposed to take just before
1419 the epilog loop. Here we check that the access function of the loop IVs
1420 and the expression that represents the loop bound are simple enough.
1421 These restrictions will be relaxed in the future. */
1423 bool
1424 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1426 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1427 basic_block bb = loop->header;
1428 gphi_iterator gsi;
1430 /* Analyze phi functions of the loop header. */
1432 if (dump_enabled_p ())
1433 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1434 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1436 tree evolution_part;
1438 gphi *phi = gsi.phi ();
1439 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1440 if (dump_enabled_p ())
1441 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1442 phi_info->stmt);
1444 /* Skip virtual phi's. The data dependences that are associated with
1445 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1447 Skip reduction phis. */
1448 if (!iv_phi_p (phi_info))
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_NOTE, vect_location,
1452 "reduc or virtual phi. skip.\n");
1453 continue;
1456 /* Analyze the evolution function. */
1458 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1459 if (evolution_part == NULL_TREE)
1461 if (dump_enabled_p ())
1462 dump_printf (MSG_MISSED_OPTIMIZATION,
1463 "No access function or evolution.\n");
1464 return false;
1467 /* FORNOW: We do not transform initial conditions of IVs
1468 which evolution functions are not invariants in the loop. */
1470 if (!expr_invariant_in_loop_p (loop, evolution_part))
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 "evolution not invariant in loop.\n");
1475 return false;
1478 /* FORNOW: We do not transform initial conditions of IVs
1479 which evolution functions are a polynomial of degree >= 2. */
1481 if (tree_is_chrec (evolution_part))
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "evolution is chrec.\n");
1486 return false;
1490 return true;
1494 /* Function vect_update_ivs_after_vectorizer.
1496 "Advance" the induction variables of LOOP to the value they should take
1497 after the execution of LOOP. This is currently necessary because the
1498 vectorizer does not handle induction variables that are used after the
1499 loop. Such a situation occurs when the last iterations of LOOP are
1500 peeled, because:
1501 1. We introduced new uses after LOOP for IVs that were not originally used
1502 after LOOP: the IVs of LOOP are now used by an epilog loop.
1503 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1504 times, whereas the loop IVs should be bumped N times.
1506 Input:
1507 - LOOP - a loop that is going to be vectorized. The last few iterations
1508 of LOOP were peeled.
1509 - NITERS - the number of iterations that LOOP executes (before it is
1510 vectorized). i.e, the number of times the ivs should be bumped.
1511 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1512 coming out from LOOP on which there are uses of the LOOP ivs
1513 (this is the path from LOOP->exit to epilog_loop->preheader).
1515 The new definitions of the ivs are placed in LOOP->exit.
1516 The phi args associated with the edge UPDATE_E in the bb
1517 UPDATE_E->dest are updated accordingly.
1519 Assumption 1: Like the rest of the vectorizer, this function assumes
1520 a single loop exit that has a single predecessor.
1522 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1523 organized in the same order.
1525 Assumption 3: The access function of the ivs is simple enough (see
1526 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1528 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1529 coming out of LOOP on which the ivs of LOOP are used (this is the path
1530 that leads to the epilog loop; other paths skip the epilog loop). This
1531 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1532 needs to have its phis updated.
1535 static void
1536 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1537 tree niters, edge update_e)
1539 gphi_iterator gsi, gsi1;
1540 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1541 basic_block update_bb = update_e->dest;
1542 basic_block exit_bb = single_exit (loop)->dest;
1544 /* Make sure there exists a single-predecessor exit bb: */
1545 gcc_assert (single_pred_p (exit_bb));
1546 gcc_assert (single_succ_edge (exit_bb) == update_e);
1548 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1549 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1550 gsi_next (&gsi), gsi_next (&gsi1))
1552 tree init_expr;
1553 tree step_expr, off;
1554 tree type;
1555 tree var, ni, ni_name;
1556 gimple_stmt_iterator last_gsi;
1558 gphi *phi = gsi.phi ();
1559 gphi *phi1 = gsi1.phi ();
1560 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1561 if (dump_enabled_p ())
1562 dump_printf_loc (MSG_NOTE, vect_location,
1563 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1565 /* Skip reduction and virtual phis. */
1566 if (!iv_phi_p (phi_info))
1568 if (dump_enabled_p ())
1569 dump_printf_loc (MSG_NOTE, vect_location,
1570 "reduc or virtual phi. skip.\n");
1571 continue;
1574 type = TREE_TYPE (gimple_phi_result (phi));
1575 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1576 step_expr = unshare_expr (step_expr);
1578 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1579 of degree >= 2 or exponential. */
1580 gcc_assert (!tree_is_chrec (step_expr));
1582 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1584 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1585 fold_convert (TREE_TYPE (step_expr), niters),
1586 step_expr);
1587 if (POINTER_TYPE_P (type))
1588 ni = fold_build_pointer_plus (init_expr, off);
1589 else
1590 ni = fold_build2 (PLUS_EXPR, type,
1591 init_expr, fold_convert (type, off));
1593 var = create_tmp_var (type, "tmp");
1595 last_gsi = gsi_last_bb (exit_bb);
1596 gimple_seq new_stmts = NULL;
1597 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1598 /* Exit_bb shouldn't be empty. */
1599 if (!gsi_end_p (last_gsi))
1600 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1601 else
1602 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1604 /* Fix phi expressions in the successor bb. */
1605 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1609 /* Return a gimple value containing the misalignment (measured in vector
1610 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1611 it is away from a perfectly aligned address. Add any new statements
1612 to SEQ. */
1614 static tree
1615 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1617 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1618 stmt_vec_info stmt_info = dr_info->stmt;
1619 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1621 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1622 unsigned HOST_WIDE_INT target_align_c;
1623 tree target_align_minus_1;
1625 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1626 size_zero_node) < 0;
1627 tree offset = (negative
1628 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1629 : size_zero_node);
1630 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1631 stmt_info, seq,
1632 offset);
1633 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1634 if (target_align.is_constant (&target_align_c))
1635 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1636 else
1638 tree vla = build_int_cst (type, target_align);
1639 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1640 fold_build2 (MINUS_EXPR, type,
1641 build_int_cst (type, 0), vla));
1642 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1643 build_int_cst (type, 1));
1646 HOST_WIDE_INT elem_size
1647 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1648 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1650 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1651 tree int_start_addr = fold_convert (type, start_addr);
1652 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1653 target_align_minus_1);
1655 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1656 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1657 elem_size_log);
1659 return misalign_in_elems;
1662 /* Function vect_gen_prolog_loop_niters
1664 Generate the number of iterations which should be peeled as prolog for the
1665 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1666 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1667 As a result, after the execution of this loop, the data reference DR will
1668 refer to an aligned location. The following computation is generated:
1670 If the misalignment of DR is known at compile time:
1671 addr_mis = int mis = DR_MISALIGNMENT (dr);
1672 Else, compute address misalignment in bytes:
1673 addr_mis = addr & (target_align - 1)
1675 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1677 (elem_size = element type size; an element is the scalar element whose type
1678 is the inner type of the vectype)
1680 The computations will be emitted at the end of BB. We also compute and
1681 store upper bound (included) of the result in BOUND.
1683 When the step of the data-ref in the loop is not 1 (as in interleaved data
1684 and SLP), the number of iterations of the prolog must be divided by the step
1685 (which is equal to the size of interleaved group).
1687 The above formulas assume that VF == number of elements in the vector. This
1688 may not hold when there are multiple-types in the loop.
1689 In this case, for some data-references in the loop the VF does not represent
1690 the number of elements that fit in the vector. Therefore, instead of VF we
1691 use TYPE_VECTOR_SUBPARTS. */
1693 static tree
1694 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1695 basic_block bb, int *bound)
1697 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1698 tree var;
1699 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1700 gimple_seq stmts = NULL, new_stmts = NULL;
1701 tree iters, iters_name;
1702 stmt_vec_info stmt_info = dr_info->stmt;
1703 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1704 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1706 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1708 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1710 if (dump_enabled_p ())
1711 dump_printf_loc (MSG_NOTE, vect_location,
1712 "known peeling = %d.\n", npeel);
1714 iters = build_int_cst (niters_type, npeel);
1715 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1717 else
1719 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1720 tree type = TREE_TYPE (misalign_in_elems);
1721 HOST_WIDE_INT elem_size
1722 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1723 /* We only do prolog peeling if the target alignment is known at compile
1724 time. */
1725 poly_uint64 align_in_elems =
1726 exact_div (target_align, elem_size);
1727 tree align_in_elems_minus_1 =
1728 build_int_cst (type, align_in_elems - 1);
1729 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1731 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1732 & (align_in_elems - 1)). */
1733 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1734 size_zero_node) < 0;
1735 if (negative)
1736 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1737 align_in_elems_tree);
1738 else
1739 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1740 misalign_in_elems);
1741 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1742 iters = fold_convert (niters_type, iters);
1743 unsigned HOST_WIDE_INT align_in_elems_c;
1744 if (align_in_elems.is_constant (&align_in_elems_c))
1745 *bound = align_in_elems_c - 1;
1746 else
1747 *bound = -1;
1750 if (dump_enabled_p ())
1751 dump_printf_loc (MSG_NOTE, vect_location,
1752 "niters for prolog loop: %T\n", iters);
1754 var = create_tmp_var (niters_type, "prolog_loop_niters");
1755 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1757 if (new_stmts)
1758 gimple_seq_add_seq (&stmts, new_stmts);
1759 if (stmts)
1761 gcc_assert (single_succ_p (bb));
1762 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1763 if (gsi_end_p (gsi))
1764 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1765 else
1766 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1768 return iters_name;
1772 /* Function vect_update_init_of_dr
1774 If CODE is PLUS, the vector loop starts NITERS iterations after the
1775 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1776 iterations before the scalar one (using masking to skip inactive
1777 elements). This function updates the information recorded in DR to
1778 account for the difference. Specifically, it updates the OFFSET
1779 field of DR_INFO. */
1781 static void
1782 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1784 struct data_reference *dr = dr_info->dr;
1785 tree offset = dr_info->offset;
1786 if (!offset)
1787 offset = build_zero_cst (sizetype);
1789 niters = fold_build2 (MULT_EXPR, sizetype,
1790 fold_convert (sizetype, niters),
1791 fold_convert (sizetype, DR_STEP (dr)));
1792 offset = fold_build2 (code, sizetype,
1793 fold_convert (sizetype, offset), niters);
1794 dr_info->offset = offset;
1798 /* Function vect_update_inits_of_drs
1800 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1801 CODE and NITERS are as for vect_update_inits_of_dr. */
1803 void
1804 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1805 tree_code code)
1807 unsigned int i;
1808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1809 struct data_reference *dr;
1811 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1813 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1814 here, but since we might use these niters to update the epilogues niters
1815 and data references we can't insert them here as this definition might not
1816 always dominate its uses. */
1817 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1818 niters = fold_convert (sizetype, niters);
1820 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1823 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1824 vect_update_init_of_dr (dr_info, niters, code);
1828 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1829 by masking. This involves calculating the number of iterations to
1830 be peeled and then aligning all memory references appropriately. */
1832 void
1833 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1835 tree misalign_in_elems;
1836 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1838 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1840 /* From the information recorded in LOOP_VINFO get the number of iterations
1841 that need to be skipped via masking. */
1842 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1844 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1845 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1846 misalign_in_elems = build_int_cst (type, misalign);
1848 else
1850 gimple_seq seq1 = NULL, seq2 = NULL;
1851 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1852 misalign_in_elems = fold_convert (type, misalign_in_elems);
1853 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1854 &seq2, true, NULL_TREE);
1855 gimple_seq_add_seq (&seq1, seq2);
1856 if (seq1)
1858 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1859 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1860 gcc_assert (!new_bb);
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_NOTE, vect_location,
1866 "misalignment for fully-masked loop: %T\n",
1867 misalign_in_elems);
1869 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1871 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1874 /* This function builds ni_name = number of iterations. Statements
1875 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1876 it to TRUE if new ssa_var is generated. */
1878 tree
1879 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1881 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1882 if (TREE_CODE (ni) == INTEGER_CST)
1883 return ni;
1884 else
1886 tree ni_name, var;
1887 gimple_seq stmts = NULL;
1888 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1890 var = create_tmp_var (TREE_TYPE (ni), "niters");
1891 ni_name = force_gimple_operand (ni, &stmts, false, var);
1892 if (stmts)
1894 gsi_insert_seq_on_edge_immediate (pe, stmts);
1895 if (new_var_p != NULL)
1896 *new_var_p = true;
1899 return ni_name;
1903 /* Calculate the number of iterations above which vectorized loop will be
1904 preferred than scalar loop. NITERS_PROLOG is the number of iterations
1905 of prolog loop. If it's integer const, the integer number is also passed
1906 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1907 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1908 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1909 threshold below which the scalar (rather than vectorized) loop will be
1910 executed. This function stores the upper bound (inclusive) of the result
1911 in BOUND_SCALAR. */
1913 static tree
1914 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1915 int bound_prolog, poly_int64 bound_epilog, int th,
1916 poly_uint64 *bound_scalar,
1917 bool check_profitability)
1919 tree type = TREE_TYPE (niters_prolog);
1920 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1921 build_int_cst (type, bound_epilog));
1923 *bound_scalar = bound_prolog + bound_epilog;
1924 if (check_profitability)
1926 /* TH indicates the minimum niters of vectorized loop, while we
1927 compute the maximum niters of scalar loop. */
1928 th--;
1929 /* Peeling for constant times. */
1930 if (int_niters_prolog >= 0)
1932 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1933 return build_int_cst (type, *bound_scalar);
1935 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1936 and BOUND_EPILOG are inclusive upper bounds. */
1937 if (known_ge (th, bound_prolog + bound_epilog))
1939 *bound_scalar = th;
1940 return build_int_cst (type, th);
1942 /* Need to do runtime comparison. */
1943 else if (maybe_gt (th, bound_epilog))
1945 *bound_scalar = upper_bound (*bound_scalar, th);
1946 return fold_build2 (MAX_EXPR, type,
1947 build_int_cst (type, th), niters);
1950 return niters;
1953 /* NITERS is the number of times that the original scalar loop executes
1954 after peeling. Work out the maximum number of iterations N that can
1955 be handled by the vectorized form of the loop and then either:
1957 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1959 niters_vector = N
1961 b) set *STEP_VECTOR_PTR to one and generate:
1963 niters_vector = N / vf
1965 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1966 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1967 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1969 void
1970 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1971 tree *niters_vector_ptr, tree *step_vector_ptr,
1972 bool niters_no_overflow)
1974 tree ni_minus_gap, var;
1975 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1976 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1978 tree log_vf = NULL_TREE;
1980 /* If epilogue loop is required because of data accesses with gaps, we
1981 subtract one iteration from the total number of iterations here for
1982 correct calculation of RATIO. */
1983 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1985 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1986 build_one_cst (type));
1987 if (!is_gimple_val (ni_minus_gap))
1989 var = create_tmp_var (type, "ni_gap");
1990 gimple *stmts = NULL;
1991 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1992 true, var);
1993 gsi_insert_seq_on_edge_immediate (pe, stmts);
1996 else
1997 ni_minus_gap = niters;
1999 unsigned HOST_WIDE_INT const_vf;
2000 if (vf.is_constant (&const_vf)
2001 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2003 /* Create: niters >> log2(vf) */
2004 /* If it's known that niters == number of latch executions + 1 doesn't
2005 overflow, we can generate niters >> log2(vf); otherwise we generate
2006 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2007 will be at least one. */
2008 log_vf = build_int_cst (type, exact_log2 (const_vf));
2009 if (niters_no_overflow)
2010 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2011 else
2012 niters_vector
2013 = fold_build2 (PLUS_EXPR, type,
2014 fold_build2 (RSHIFT_EXPR, type,
2015 fold_build2 (MINUS_EXPR, type,
2016 ni_minus_gap,
2017 build_int_cst (type, vf)),
2018 log_vf),
2019 build_int_cst (type, 1));
2020 step_vector = build_one_cst (type);
2022 else
2024 niters_vector = ni_minus_gap;
2025 step_vector = build_int_cst (type, vf);
2028 if (!is_gimple_val (niters_vector))
2030 var = create_tmp_var (type, "bnd");
2031 gimple_seq stmts = NULL;
2032 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2033 gsi_insert_seq_on_edge_immediate (pe, stmts);
2034 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2035 we set range information to make niters analyzer's life easier.
2036 Note the number of latch iteration value can be TYPE_MAX_VALUE so
2037 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */
2038 if (stmts != NULL && log_vf)
2040 if (niters_no_overflow)
2041 set_range_info (niters_vector, VR_RANGE,
2042 wi::one (TYPE_PRECISION (type)),
2043 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2044 TYPE_SIGN (type)),
2045 exact_log2 (const_vf),
2046 TYPE_SIGN (type)));
2047 /* For VF == 1 the vector IV might also overflow so we cannot
2048 assert a minimum value of 1. */
2049 else if (const_vf > 1)
2050 set_range_info (niters_vector, VR_RANGE,
2051 wi::one (TYPE_PRECISION (type)),
2052 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2053 TYPE_SIGN (type))
2054 - (const_vf - 1),
2055 exact_log2 (const_vf), TYPE_SIGN (type))
2056 + 1);
2059 *niters_vector_ptr = niters_vector;
2060 *step_vector_ptr = step_vector;
2062 return;
2065 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2066 loop specified by LOOP_VINFO after vectorization, compute the number
2067 of iterations before vectorization (niters_vector * vf) and store it
2068 to NITERS_VECTOR_MULT_VF_PTR. */
2070 static void
2071 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2072 tree niters_vector,
2073 tree *niters_vector_mult_vf_ptr)
2075 /* We should be using a step_vector of VF if VF is variable. */
2076 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2077 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2078 tree type = TREE_TYPE (niters_vector);
2079 tree log_vf = build_int_cst (type, exact_log2 (vf));
2080 basic_block exit_bb = single_exit (loop)->dest;
2082 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2083 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2084 niters_vector, log_vf);
2085 if (!is_gimple_val (niters_vector_mult_vf))
2087 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2088 gimple_seq stmts = NULL;
2089 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2090 &stmts, true, var);
2091 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2092 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2094 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2097 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2098 this function searches for the corresponding lcssa phi node in exit
2099 bb of LOOP. If it is found, return the phi result; otherwise return
2100 NULL. */
2102 static tree
2103 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2104 gphi *lcssa_phi)
2106 gphi_iterator gsi;
2107 edge e = single_exit (loop);
2109 gcc_assert (single_pred_p (e->dest));
2110 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2112 gphi *phi = gsi.phi ();
2113 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2114 PHI_ARG_DEF (lcssa_phi, 0), 0))
2115 return PHI_RESULT (phi);
2117 return NULL_TREE;
2120 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2121 from SECOND/FIRST and puts it at the original loop's preheader/exit
2122 edge, the two loops are arranged as below:
2124 preheader_a:
2125 first_loop:
2126 header_a:
2127 i_1 = PHI<i_0, i_2>;
2129 i_2 = i_1 + 1;
2130 if (cond_a)
2131 goto latch_a;
2132 else
2133 goto between_bb;
2134 latch_a:
2135 goto header_a;
2137 between_bb:
2138 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2140 second_loop:
2141 header_b:
2142 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2143 or with i_2 if no LCSSA phi is created
2144 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2146 i_4 = i_3 + 1;
2147 if (cond_b)
2148 goto latch_b;
2149 else
2150 goto exit_bb;
2151 latch_b:
2152 goto header_b;
2154 exit_bb:
2156 This function creates loop closed SSA for the first loop; update the
2157 second loop's PHI nodes by replacing argument on incoming edge with the
2158 result of newly created lcssa PHI nodes. IF CREATE_LCSSA_FOR_IV_PHIS
2159 is false, Loop closed ssa phis will only be created for non-iv phis for
2160 the first loop.
2162 This function assumes exit bb of the first loop is preheader bb of the
2163 second loop, i.e, between_bb in the example code. With PHIs updated,
2164 the second loop will execute rest iterations of the first. */
2166 static void
2167 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2168 class loop *first, class loop *second,
2169 bool create_lcssa_for_iv_phis)
2171 gphi_iterator gsi_update, gsi_orig;
2172 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2174 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2175 edge second_preheader_e = loop_preheader_edge (second);
2176 basic_block between_bb = single_exit (first)->dest;
2178 gcc_assert (between_bb == second_preheader_e->src);
2179 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2180 /* Either the first loop or the second is the loop to be vectorized. */
2181 gcc_assert (loop == first || loop == second);
2183 for (gsi_orig = gsi_start_phis (first->header),
2184 gsi_update = gsi_start_phis (second->header);
2185 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2186 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2188 gphi *orig_phi = gsi_orig.phi ();
2189 gphi *update_phi = gsi_update.phi ();
2191 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2192 /* Generate lcssa PHI node for the first loop. */
2193 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2194 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2195 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2197 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2198 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2199 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2200 arg = new_res;
2203 /* Update PHI node in the second loop by replacing arg on the loop's
2204 incoming edge. */
2205 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2208 /* For epilogue peeling we have to make sure to copy all LC PHIs
2209 for correct vectorization of live stmts. */
2210 if (loop == first)
2212 basic_block orig_exit = single_exit (second)->dest;
2213 for (gsi_orig = gsi_start_phis (orig_exit);
2214 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2216 gphi *orig_phi = gsi_orig.phi ();
2217 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2218 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2219 continue;
2221 /* Already created in the above loop. */
2222 if (find_guard_arg (first, second, orig_phi))
2223 continue;
2225 tree new_res = copy_ssa_name (orig_arg);
2226 gphi *lcphi = create_phi_node (new_res, between_bb);
2227 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2232 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2233 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2234 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2235 appear like below:
2237 guard_bb:
2238 if (cond)
2239 goto merge_bb;
2240 else
2241 goto skip_loop;
2243 skip_loop:
2244 header_a:
2245 i_1 = PHI<i_0, i_2>;
2247 i_2 = i_1 + 1;
2248 if (cond_a)
2249 goto latch_a;
2250 else
2251 goto exit_a;
2252 latch_a:
2253 goto header_a;
2255 exit_a:
2256 i_5 = PHI<i_2>;
2258 merge_bb:
2259 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2261 update_loop:
2262 header_b:
2263 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2265 i_4 = i_3 + 1;
2266 if (cond_b)
2267 goto latch_b;
2268 else
2269 goto exit_bb;
2270 latch_b:
2271 goto header_b;
2273 exit_bb:
2275 This function creates PHI nodes at merge_bb and replaces the use of i_5
2276 in the update_loop's PHI node with the result of new PHI result. */
2278 static void
2279 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2280 class loop *update_loop,
2281 edge guard_edge, edge merge_edge)
2283 location_t merge_loc, guard_loc;
2284 edge orig_e = loop_preheader_edge (skip_loop);
2285 edge update_e = loop_preheader_edge (update_loop);
2286 gphi_iterator gsi_orig, gsi_update;
2288 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2289 gsi_update = gsi_start_phis (update_loop->header));
2290 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2291 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2293 gphi *orig_phi = gsi_orig.phi ();
2294 gphi *update_phi = gsi_update.phi ();
2296 /* Generate new phi node at merge bb of the guard. */
2297 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2298 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2300 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2301 args in NEW_PHI for these edges. */
2302 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2303 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2304 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2305 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2306 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2307 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2309 /* Update phi in UPDATE_PHI. */
2310 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2314 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2315 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2316 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2317 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2318 The CFG looks like:
2320 loop:
2321 header_a:
2322 i_1 = PHI<i_0, i_2>;
2324 i_2 = i_1 + 1;
2325 if (cond_a)
2326 goto latch_a;
2327 else
2328 goto exit_a;
2329 latch_a:
2330 goto header_a;
2332 exit_a:
2334 guard_bb:
2335 if (cond)
2336 goto merge_bb;
2337 else
2338 goto epilog_loop;
2340 ;; fall_through_bb
2342 epilog_loop:
2343 header_b:
2344 i_3 = PHI<i_2, i_4>;
2346 i_4 = i_3 + 1;
2347 if (cond_b)
2348 goto latch_b;
2349 else
2350 goto merge_bb;
2351 latch_b:
2352 goto header_b;
2354 merge_bb:
2355 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2357 exit_bb:
2358 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2360 For each name used out side EPILOG (i.e - for each name that has a lcssa
2361 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2362 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2363 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2364 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2365 in exit_bb will also be updated. */
2367 static void
2368 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2369 edge guard_edge, edge merge_edge)
2371 gphi_iterator gsi;
2372 basic_block merge_bb = guard_edge->dest;
2374 gcc_assert (single_succ_p (merge_bb));
2375 edge e = single_succ_edge (merge_bb);
2376 basic_block exit_bb = e->dest;
2377 gcc_assert (single_pred_p (exit_bb));
2378 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2380 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2382 gphi *update_phi = gsi.phi ();
2383 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2385 tree merge_arg = NULL_TREE;
2387 /* If the old argument is a SSA_NAME use its current_def. */
2388 if (TREE_CODE (old_arg) == SSA_NAME)
2389 merge_arg = get_current_def (old_arg);
2390 /* If it's a constant or doesn't have a current_def, just use the old
2391 argument. */
2392 if (!merge_arg)
2393 merge_arg = old_arg;
2395 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2396 /* If the var is live after loop but not a reduction, we simply
2397 use the old arg. */
2398 if (!guard_arg)
2399 guard_arg = old_arg;
2401 /* Create new phi node in MERGE_BB: */
2402 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2403 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2405 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2406 the two PHI args in merge_phi for these edges. */
2407 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2408 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2410 /* Update the original phi in exit_bb. */
2411 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2415 /* EPILOG loop is duplicated from the original loop for vectorizing,
2416 the arg of its loop closed ssa PHI needs to be updated. */
2418 static void
2419 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2421 gphi_iterator gsi;
2422 basic_block exit_bb = single_exit (epilog)->dest;
2424 gcc_assert (single_pred_p (exit_bb));
2425 edge e = EDGE_PRED (exit_bb, 0);
2426 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2427 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2430 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2431 iterate exactly CONST_NITERS times. Make a final decision about
2432 whether the epilogue loop should be used, returning true if so. */
2434 static bool
2435 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2436 unsigned HOST_WIDE_INT const_niters)
2438 /* Avoid wrap-around when computing const_niters - 1. Also reject
2439 using an epilogue loop for a single scalar iteration, even if
2440 we could in principle implement that using partial vectors. */
2441 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2442 if (const_niters <= gap_niters + 1)
2443 return false;
2445 /* Install the number of iterations. */
2446 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2447 tree niters_tree = build_int_cst (niters_type, const_niters);
2448 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2450 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2451 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2453 /* Decide what to do if the number of epilogue iterations is not
2454 a multiple of the epilogue loop's vectorization factor. */
2455 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2458 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2459 Return a value that equals:
2461 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2462 - SKIP_VALUE when the main loop is skipped. */
2464 tree
2465 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2466 tree skip_value)
2468 gcc_assert (loop_vinfo->main_loop_edge);
2470 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2471 basic_block bb = loop_vinfo->main_loop_edge->dest;
2472 gphi *new_phi = create_phi_node (phi_result, bb);
2473 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2474 UNKNOWN_LOCATION);
2475 add_phi_arg (new_phi, skip_value,
2476 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2477 return phi_result;
2480 /* Function vect_do_peeling.
2482 Input:
2483 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2485 preheader:
2486 LOOP:
2487 header_bb:
2488 loop_body
2489 if (exit_loop_cond) goto exit_bb
2490 else goto header_bb
2491 exit_bb:
2493 - NITERS: The number of iterations of the loop.
2494 - NITERSM1: The number of iterations of the loop's latch.
2495 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2496 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2497 CHECK_PROFITABILITY is true.
2498 Output:
2499 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2500 iterate after vectorization; see vect_set_loop_condition for details.
2501 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2502 should be set to the number of scalar iterations handled by the
2503 vector loop. The SSA name is only used on exit from the loop.
2505 This function peels prolog and epilog from the loop, adds guards skipping
2506 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2507 would look like:
2509 guard_bb_1:
2510 if (prefer_scalar_loop) goto merge_bb_1
2511 else goto guard_bb_2
2513 guard_bb_2:
2514 if (skip_prolog) goto merge_bb_2
2515 else goto prolog_preheader
2517 prolog_preheader:
2518 PROLOG:
2519 prolog_header_bb:
2520 prolog_body
2521 if (exit_prolog_cond) goto prolog_exit_bb
2522 else goto prolog_header_bb
2523 prolog_exit_bb:
2525 merge_bb_2:
2527 vector_preheader:
2528 VECTOR LOOP:
2529 vector_header_bb:
2530 vector_body
2531 if (exit_vector_cond) goto vector_exit_bb
2532 else goto vector_header_bb
2533 vector_exit_bb:
2535 guard_bb_3:
2536 if (skip_epilog) goto merge_bb_3
2537 else goto epilog_preheader
2539 merge_bb_1:
2541 epilog_preheader:
2542 EPILOG:
2543 epilog_header_bb:
2544 epilog_body
2545 if (exit_epilog_cond) goto merge_bb_3
2546 else goto epilog_header_bb
2548 merge_bb_3:
2550 Note this function peels prolog and epilog only if it's necessary,
2551 as well as guards.
2552 This function returns the epilogue loop if a decision was made to vectorize
2553 it, otherwise NULL.
2555 The analysis resulting in this epilogue loop's loop_vec_info was performed
2556 in the same vect_analyze_loop call as the main loop's. At that time
2557 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2558 vectorization factors than the main loop. This list is stored in the main
2559 loop's loop_vec_info in the 'epilogue_vinfos' member. Everytime we decide to
2560 vectorize the epilogue loop for a lower vectorization factor, the
2561 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2562 updated and linked to the epilogue loop. This is later used to vectorize
2563 the epilogue. The reason the loop_vec_info needs updating is that it was
2564 constructed based on the original main loop, and the epilogue loop is a
2565 copy of this loop, so all links pointing to statements in the original loop
2566 need updating. Furthermore, these loop_vec_infos share the
2567 data_reference's records, which will also need to be updated.
2569 TODO: Guard for prefer_scalar_loop should be emitted along with
2570 versioning conditions if loop versioning is needed. */
2573 class loop *
2574 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2575 tree *niters_vector, tree *step_vector,
2576 tree *niters_vector_mult_vf_var, int th,
2577 bool check_profitability, bool niters_no_overflow,
2578 tree *advance)
2580 edge e, guard_e;
2581 tree type = TREE_TYPE (niters), guard_cond;
2582 basic_block guard_bb, guard_to;
2583 profile_probability prob_prolog, prob_vector, prob_epilog;
2584 int estimated_vf;
2585 int prolog_peeling = 0;
2586 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2587 bool vect_epilogues_updated_niters = false;
2588 /* We currently do not support prolog peeling if the target alignment is not
2589 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2590 target alignment being constant. */
2591 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2592 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2593 return NULL;
2595 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2596 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2598 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2599 poly_uint64 bound_epilog = 0;
2600 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2601 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602 bound_epilog += vf - 1;
2603 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2604 bound_epilog += 1;
2605 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2606 poly_uint64 bound_scalar = bound_epilog;
2608 if (!prolog_peeling && !epilog_peeling)
2609 return NULL;
2611 /* Before doing any peeling make sure to reset debug binds outside of
2612 the loop refering to defs not in LC SSA. */
2613 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2614 for (unsigned i = 0; i < loop->num_nodes; ++i)
2616 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2617 imm_use_iterator ui;
2618 gimple *use_stmt;
2619 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2620 gsi_next (&gsi))
2622 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2623 if (gimple_debug_bind_p (use_stmt)
2624 && loop != gimple_bb (use_stmt)->loop_father
2625 && !flow_loop_nested_p (loop,
2626 gimple_bb (use_stmt)->loop_father))
2628 gimple_debug_bind_reset_value (use_stmt);
2629 update_stmt (use_stmt);
2632 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2633 gsi_next (&gsi))
2635 ssa_op_iter op_iter;
2636 def_operand_p def_p;
2637 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2638 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2639 if (gimple_debug_bind_p (use_stmt)
2640 && loop != gimple_bb (use_stmt)->loop_father
2641 && !flow_loop_nested_p (loop,
2642 gimple_bb (use_stmt)->loop_father))
2644 gimple_debug_bind_reset_value (use_stmt);
2645 update_stmt (use_stmt);
2650 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2651 estimated_vf = vect_vf_for_cost (loop_vinfo);
2652 if (estimated_vf == 2)
2653 estimated_vf = 3;
2654 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2655 .apply_scale (estimated_vf - 1, estimated_vf);
2657 class loop *prolog, *epilog = NULL;
2658 class loop *first_loop = loop;
2659 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2661 /* We might have a queued need to update virtual SSA form. As we
2662 delete the update SSA machinery below after doing a regular
2663 incremental SSA update during loop copying make sure we don't
2664 lose that fact.
2665 ??? Needing to update virtual SSA form by renaming is unfortunate
2666 but not all of the vectorizer code inserting new loads / stores
2667 properly assigns virtual operands to those statements. */
2668 update_ssa (TODO_update_ssa_only_virtuals);
2670 create_lcssa_for_virtual_phi (loop);
2672 /* If we're vectorizing an epilogue loop, the update_ssa above will
2673 have ensured that the virtual operand is in SSA form throughout the
2674 vectorized main loop. Normally it is possible to trace the updated
2675 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2676 back to scalar-stmt vuses, meaning that the effect of the SSA update
2677 remains local to the main loop. However, there are rare cases in
2678 which the vectorized loop has vdefs even when the original scalar
2679 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2680 introduces clobbers of the temporary vector array, which in turn
2681 needs new vdefs. If the scalar loop doesn't write to memory, these
2682 new vdefs will be the only ones in the vector loop.
2684 In that case, update_ssa will have added a new virtual phi to the
2685 main loop, which previously didn't need one. Ensure that we (locally)
2686 maintain LCSSA form for the virtual operand, just as we would have
2687 done if the virtual phi had existed from the outset. This makes it
2688 easier to duplicate the scalar epilogue loop below. */
2689 tree vop_to_rename = NULL_TREE;
2690 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2692 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2693 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2696 if (MAY_HAVE_DEBUG_BIND_STMTS)
2698 gcc_assert (!adjust_vec.exists ());
2699 adjust_vec.create (32);
2701 initialize_original_copy_tables ();
2703 /* Record the anchor bb at which the guard should be placed if the scalar
2704 loop might be preferred. */
2705 basic_block anchor = loop_preheader_edge (loop)->src;
2707 /* Generate the number of iterations for the prolog loop. We do this here
2708 so that we can also get the upper bound on the number of iterations. */
2709 tree niters_prolog;
2710 int bound_prolog = 0;
2711 if (prolog_peeling)
2712 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2713 &bound_prolog);
2714 else
2715 niters_prolog = build_int_cst (type, 0);
2717 loop_vec_info epilogue_vinfo = NULL;
2718 if (vect_epilogues)
2720 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2721 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2724 tree niters_vector_mult_vf = NULL_TREE;
2725 /* Saving NITERs before the loop, as this may be changed by prologue. */
2726 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2727 edge update_e = NULL, skip_e = NULL;
2728 unsigned int lowest_vf = constant_lower_bound (vf);
2729 /* If we know the number of scalar iterations for the main loop we should
2730 check whether after the main loop there are enough iterations left over
2731 for the epilogue. */
2732 if (vect_epilogues
2733 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2734 && prolog_peeling >= 0
2735 && known_eq (vf, lowest_vf))
2737 unsigned HOST_WIDE_INT eiters
2738 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2739 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2741 eiters -= prolog_peeling;
2742 eiters
2743 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2745 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2747 delete epilogue_vinfo;
2748 epilogue_vinfo = NULL;
2749 if (loop_vinfo->epilogue_vinfos.length () == 0)
2751 vect_epilogues = false;
2752 break;
2754 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2755 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2757 vect_epilogues_updated_niters = true;
2759 /* Prolog loop may be skipped. */
2760 bool skip_prolog = (prolog_peeling != 0);
2761 /* Skip this loop to epilog when there are not enough iterations to enter this
2762 vectorized loop. If true we should perform runtime checks on the NITERS
2763 to check whether we should skip the current vectorized loop. If we know
2764 the number of scalar iterations we may choose to add a runtime check if
2765 this number "maybe" smaller than the number of iterations required
2766 when we know the number of scalar iterations may potentially
2767 be smaller than the number of iterations required to enter this loop, for
2768 this we use the upper bounds on the prolog and epilog peeling. When we
2769 don't know the number of iterations and don't require versioning it is
2770 because we have asserted that there are enough scalar iterations to enter
2771 the main loop, so this skip is not necessary. When we are versioning then
2772 we only add such a skip if we have chosen to vectorize the epilogue. */
2773 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2774 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2775 bound_prolog + bound_epilog)
2776 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2777 || vect_epilogues));
2778 /* Epilog loop must be executed if the number of iterations for epilog
2779 loop is known at compile time, otherwise we need to add a check at
2780 the end of vector loop and skip to the end of epilog loop. */
2781 bool skip_epilog = (prolog_peeling < 0
2782 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2783 || !vf.is_constant ());
2784 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2785 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2786 skip_epilog = false;
2788 if (skip_vector)
2790 split_edge (loop_preheader_edge (loop));
2792 /* Due to the order in which we peel prolog and epilog, we first
2793 propagate probability to the whole loop. The purpose is to
2794 avoid adjusting probabilities of both prolog and vector loops
2795 separately. Note in this case, the probability of epilog loop
2796 needs to be scaled back later. */
2797 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2798 if (prob_vector.initialized_p ())
2800 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2801 scale_loop_profile (loop, prob_vector, 0);
2805 dump_user_location_t loop_loc = find_loop_location (loop);
2806 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2807 if (vect_epilogues)
2808 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2809 use the original scalar loop as remaining epilogue if necessary. */
2810 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2811 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2813 if (prolog_peeling)
2815 e = loop_preheader_edge (loop);
2816 if (!slpeel_can_duplicate_loop_p (loop, e))
2818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2819 "loop can't be duplicated to preheader edge.\n");
2820 gcc_unreachable ();
2822 /* Peel prolog and put it on preheader edge of loop. */
2823 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2824 if (!prolog)
2826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2827 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2828 gcc_unreachable ();
2830 prolog->force_vectorize = false;
2831 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2832 first_loop = prolog;
2833 reset_original_copy_tables ();
2835 /* Update the number of iterations for prolog loop. */
2836 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2837 vect_set_loop_condition (prolog, NULL, niters_prolog,
2838 step_prolog, NULL_TREE, false);
2840 /* Skip the prolog loop. */
2841 if (skip_prolog)
2843 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2844 niters_prolog, build_int_cst (type, 0));
2845 guard_bb = loop_preheader_edge (prolog)->src;
2846 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2847 guard_to = split_edge (loop_preheader_edge (loop));
2848 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2849 guard_to, guard_bb,
2850 prob_prolog.invert (),
2851 irred_flag);
2852 e = EDGE_PRED (guard_to, 0);
2853 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2854 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2856 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2857 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2860 /* Update init address of DRs. */
2861 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2862 /* Update niters for vector loop. */
2863 LOOP_VINFO_NITERS (loop_vinfo)
2864 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2865 LOOP_VINFO_NITERSM1 (loop_vinfo)
2866 = fold_build2 (MINUS_EXPR, type,
2867 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2868 bool new_var_p = false;
2869 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2870 /* It's guaranteed that vector loop bound before vectorization is at
2871 least VF, so set range information for newly generated var. */
2872 if (new_var_p)
2873 set_range_info (niters, VR_RANGE,
2874 wi::to_wide (build_int_cst (type, vf)),
2875 wi::to_wide (TYPE_MAX_VALUE (type)));
2877 /* Prolog iterates at most bound_prolog times, latch iterates at
2878 most bound_prolog - 1 times. */
2879 record_niter_bound (prolog, bound_prolog - 1, false, true);
2880 delete_update_ssa ();
2881 adjust_vec_debug_stmts ();
2882 scev_reset ();
2885 if (epilog_peeling)
2887 e = single_exit (loop);
2888 if (!slpeel_can_duplicate_loop_p (loop, e))
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2891 "loop can't be duplicated to exit edge.\n");
2892 gcc_unreachable ();
2894 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2895 said epilog then we should use a copy of the main loop as a starting
2896 point. This loop may have already had some preliminary transformations
2897 to allow for more optimal vectorization, for example if-conversion.
2898 If we are not vectorizing the epilog then we should use the scalar loop
2899 as the transformations mentioned above make less or no sense when not
2900 vectorizing. */
2901 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2902 if (vop_to_rename)
2904 /* Vectorizing the main loop can sometimes introduce a vdef to
2905 a loop that previously didn't have one; see the comment above
2906 the definition of VOP_TO_RENAME for details. The definition
2907 D that holds on E will then be different from the definition
2908 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2909 rename VOP_TO_RENAME to D when copying the loop.
2911 The virtual operand is in LCSSA form for the main loop,
2912 and no stmt between the main loop and E needs a vdef,
2913 so we know that D is provided by a phi rather than by a
2914 vdef on a normal gimple stmt. */
2915 basic_block vdef_bb = e->src;
2916 gphi *vphi;
2917 while (!(vphi = get_virtual_phi (vdef_bb)))
2918 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2919 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2920 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2922 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2923 if (!epilog)
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2926 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2927 gcc_unreachable ();
2929 epilog->force_vectorize = false;
2930 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2932 /* Scalar version loop may be preferred. In this case, add guard
2933 and skip to epilog. Note this only happens when the number of
2934 iterations of loop is unknown at compile time, otherwise this
2935 won't be vectorized. */
2936 if (skip_vector)
2938 /* Additional epilogue iteration is peeled if gap exists. */
2939 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2940 bound_prolog, bound_epilog,
2941 th, &bound_scalar,
2942 check_profitability);
2943 /* Build guard against NITERSM1 since NITERS may overflow. */
2944 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2945 guard_bb = anchor;
2946 guard_to = split_edge (loop_preheader_edge (epilog));
2947 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2948 guard_to, guard_bb,
2949 prob_vector.invert (),
2950 irred_flag);
2951 skip_e = guard_e;
2952 e = EDGE_PRED (guard_to, 0);
2953 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2954 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2956 /* Simply propagate profile info from guard_bb to guard_to which is
2957 a merge point of control flow. */
2958 guard_to->count = guard_bb->count;
2960 /* Scale probability of epilog loop back.
2961 FIXME: We should avoid scaling down and back up. Profile may
2962 get lost if we scale down to 0. */
2963 basic_block *bbs = get_loop_body (epilog);
2964 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2965 bbs[i]->count = bbs[i]->count.apply_scale
2966 (bbs[i]->count,
2967 bbs[i]->count.apply_probability
2968 (prob_vector));
2969 free (bbs);
2972 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2973 /* If loop is peeled for non-zero constant times, now niters refers to
2974 orig_niters - prolog_peeling, it won't overflow even the orig_niters
2975 overflows. */
2976 niters_no_overflow |= (prolog_peeling > 0);
2977 vect_gen_vector_loop_niters (loop_vinfo, niters,
2978 niters_vector, step_vector,
2979 niters_no_overflow);
2980 if (!integer_onep (*step_vector))
2982 /* On exit from the loop we will have an easy way of calcalating
2983 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2984 until then. */
2985 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2986 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2987 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2989 else
2990 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2991 &niters_vector_mult_vf);
2992 /* Update IVs of original loop as if they were advanced by
2993 niters_vector_mult_vf steps. */
2994 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2995 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2996 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2997 update_e);
2999 if (skip_epilog)
3001 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3002 niters, niters_vector_mult_vf);
3003 guard_bb = single_exit (loop)->dest;
3004 guard_to = split_edge (single_exit (epilog));
3005 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3006 skip_vector ? anchor : guard_bb,
3007 prob_epilog.invert (),
3008 irred_flag);
3009 if (vect_epilogues)
3010 epilogue_vinfo->skip_this_loop_edge = guard_e;
3011 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3012 single_exit (epilog));
3013 /* Only need to handle basic block before epilog loop if it's not
3014 the guard_bb, which is the case when skip_vector is true. */
3015 if (guard_bb != bb_before_epilog)
3017 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3019 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3021 scale_loop_profile (epilog, prob_epilog, 0);
3023 else
3024 slpeel_update_phi_nodes_for_lcssa (epilog);
3026 unsigned HOST_WIDE_INT bound;
3027 if (bound_scalar.is_constant (&bound))
3029 gcc_assert (bound != 0);
3030 /* -1 to convert loop iterations to latch iterations. */
3031 record_niter_bound (epilog, bound - 1, false, true);
3034 delete_update_ssa ();
3035 adjust_vec_debug_stmts ();
3036 scev_reset ();
3039 if (vect_epilogues)
3041 epilog->aux = epilogue_vinfo;
3042 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3044 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3046 /* We now must calculate the number of NITERS performed by the previous
3047 loop and EPILOGUE_NITERS to be performed by the epilogue. */
3048 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3049 niters_prolog, niters_vector_mult_vf);
3051 /* If skip_vector we may skip the previous loop, we insert a phi-node to
3052 determine whether we are coming from the previous vectorized loop
3053 using the update_e edge or the skip_vector basic block using the
3054 skip_e edge. */
3055 if (skip_vector)
3057 gcc_assert (update_e != NULL
3058 && skip_e != NULL
3059 && !vect_epilogues_updated_niters);
3060 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3061 update_e->dest);
3062 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3063 gimple *stmt = gimple_build_assign (new_ssa, niters);
3064 gimple_stmt_iterator gsi;
3065 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3066 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3068 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3069 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3071 else
3073 gsi = gsi_last_bb (update_e->src);
3074 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3077 niters = new_ssa;
3078 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3079 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3080 UNKNOWN_LOCATION);
3081 niters = PHI_RESULT (new_phi);
3082 epilogue_vinfo->main_loop_edge = update_e;
3083 epilogue_vinfo->skip_main_loop_edge = skip_e;
3086 /* Set ADVANCE to the number of iterations performed by the previous
3087 loop and its prologue. */
3088 *advance = niters;
3090 if (!vect_epilogues_updated_niters)
3092 /* Subtract the number of iterations performed by the vectorized loop
3093 from the number of total iterations. */
3094 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3095 before_loop_niters,
3096 niters);
3098 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3099 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3100 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3101 epilogue_niters,
3102 build_one_cst (TREE_TYPE (epilogue_niters)));
3104 /* Decide what to do if the number of epilogue iterations is not
3105 a multiple of the epilogue loop's vectorization factor.
3106 We should have rejected the loop during the analysis phase
3107 if this fails. */
3108 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3109 true))
3110 gcc_unreachable ();
3114 adjust_vec.release ();
3115 free_original_copy_tables ();
3117 return vect_epilogues ? epilog : NULL;
3120 /* Function vect_create_cond_for_niters_checks.
3122 Create a conditional expression that represents the run-time checks for
3123 loop's niter. The loop is guaranteed to terminate if the run-time
3124 checks hold.
3126 Input:
3127 COND_EXPR - input conditional expression. New conditions will be chained
3128 with logical AND operation. If it is NULL, then the function
3129 is used to return the number of alias checks.
3130 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3131 to be checked.
3133 Output:
3134 COND_EXPR - conditional expression.
3136 The returned COND_EXPR is the conditional expression to be used in the
3137 if statement that controls which version of the loop gets executed at
3138 runtime. */
3140 static void
3141 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3143 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3145 if (*cond_expr)
3146 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3147 *cond_expr, part_cond_expr);
3148 else
3149 *cond_expr = part_cond_expr;
3152 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3153 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3155 static void
3156 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3158 if (*cond_expr)
3159 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3160 *cond_expr, part_cond_expr);
3161 else
3162 *cond_expr = part_cond_expr;
3165 /* Function vect_create_cond_for_align_checks.
3167 Create a conditional expression that represents the alignment checks for
3168 all of data references (array element references) whose alignment must be
3169 checked at runtime.
3171 Input:
3172 COND_EXPR - input conditional expression. New conditions will be chained
3173 with logical AND operation.
3174 LOOP_VINFO - two fields of the loop information are used.
3175 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3176 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3178 Output:
3179 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3180 expression.
3181 The returned value is the conditional expression to be used in the if
3182 statement that controls which version of the loop gets executed at runtime.
3184 The algorithm makes two assumptions:
3185 1) The number of bytes "n" in a vector is a power of 2.
3186 2) An address "a" is aligned if a%n is zero and that this
3187 test can be done as a&(n-1) == 0. For example, for 16
3188 byte vectors the test is a&0xf == 0. */
3190 static void
3191 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3192 tree *cond_expr,
3193 gimple_seq *cond_expr_stmt_list)
3195 const vec<stmt_vec_info> &may_misalign_stmts
3196 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3197 stmt_vec_info stmt_info;
3198 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3199 tree mask_cst;
3200 unsigned int i;
3201 tree int_ptrsize_type;
3202 char tmp_name[20];
3203 tree or_tmp_name = NULL_TREE;
3204 tree and_tmp_name;
3205 gimple *and_stmt;
3206 tree ptrsize_zero;
3207 tree part_cond_expr;
3209 /* Check that mask is one less than a power of 2, i.e., mask is
3210 all zeros followed by all ones. */
3211 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3213 int_ptrsize_type = signed_type_for (ptr_type_node);
3215 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3216 of the first vector of the i'th data reference. */
3218 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3220 gimple_seq new_stmt_list = NULL;
3221 tree addr_base;
3222 tree addr_tmp_name;
3223 tree new_or_tmp_name;
3224 gimple *addr_stmt, *or_stmt;
3225 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3226 bool negative = tree_int_cst_compare
3227 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3228 tree offset = negative
3229 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3231 /* create: addr_tmp = (int)(address_of_first_vector) */
3232 addr_base =
3233 vect_create_addr_base_for_vector_ref (loop_vinfo,
3234 stmt_info, &new_stmt_list,
3235 offset);
3236 if (new_stmt_list != NULL)
3237 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3239 sprintf (tmp_name, "addr2int%d", i);
3240 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3241 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3242 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3244 /* The addresses are OR together. */
3246 if (or_tmp_name != NULL_TREE)
3248 /* create: or_tmp = or_tmp | addr_tmp */
3249 sprintf (tmp_name, "orptrs%d", i);
3250 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3251 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3252 or_tmp_name, addr_tmp_name);
3253 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3254 or_tmp_name = new_or_tmp_name;
3256 else
3257 or_tmp_name = addr_tmp_name;
3259 } /* end for i */
3261 mask_cst = build_int_cst (int_ptrsize_type, mask);
3263 /* create: and_tmp = or_tmp & mask */
3264 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3266 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3267 or_tmp_name, mask_cst);
3268 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3270 /* Make and_tmp the left operand of the conditional test against zero.
3271 if and_tmp has a nonzero bit then some address is unaligned. */
3272 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3273 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3274 and_tmp_name, ptrsize_zero);
3275 chain_cond_expr (cond_expr, part_cond_expr);
3278 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3279 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3280 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3281 and this new condition are true. Treat a null *COND_EXPR as "true". */
3283 static void
3284 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3286 const vec<vec_object_pair> &pairs
3287 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3288 unsigned int i;
3289 vec_object_pair *pair;
3290 FOR_EACH_VEC_ELT (pairs, i, pair)
3292 tree addr1 = build_fold_addr_expr (pair->first);
3293 tree addr2 = build_fold_addr_expr (pair->second);
3294 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3295 addr1, addr2);
3296 chain_cond_expr (cond_expr, part_cond_expr);
3300 /* Create an expression that is true when all lower-bound conditions for
3301 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3303 static void
3304 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3306 const vec<vec_lower_bound> &lower_bounds
3307 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3308 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3310 tree expr = lower_bounds[i].expr;
3311 tree type = unsigned_type_for (TREE_TYPE (expr));
3312 expr = fold_convert (type, expr);
3313 poly_uint64 bound = lower_bounds[i].min_value;
3314 if (!lower_bounds[i].unsigned_p)
3316 expr = fold_build2 (PLUS_EXPR, type, expr,
3317 build_int_cstu (type, bound - 1));
3318 bound += bound - 1;
3320 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3321 build_int_cstu (type, bound));
3322 chain_cond_expr (cond_expr, part_cond_expr);
3326 /* Function vect_create_cond_for_alias_checks.
3328 Create a conditional expression that represents the run-time checks for
3329 overlapping of address ranges represented by a list of data references
3330 relations passed as input.
3332 Input:
3333 COND_EXPR - input conditional expression. New conditions will be chained
3334 with logical AND operation. If it is NULL, then the function
3335 is used to return the number of alias checks.
3336 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3337 to be checked.
3339 Output:
3340 COND_EXPR - conditional expression.
3342 The returned COND_EXPR is the conditional expression to be used in the if
3343 statement that controls which version of the loop gets executed at runtime.
3346 void
3347 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3349 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3350 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3352 if (comp_alias_ddrs.is_empty ())
3353 return;
3355 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3356 &comp_alias_ddrs, cond_expr);
3357 if (dump_enabled_p ())
3358 dump_printf_loc (MSG_NOTE, vect_location,
3359 "created %u versioning for alias checks.\n",
3360 comp_alias_ddrs.length ());
3364 /* Function vect_loop_versioning.
3366 If the loop has data references that may or may not be aligned or/and
3367 has data reference relations whose independence was not proven then
3368 two versions of the loop need to be generated, one which is vectorized
3369 and one which isn't. A test is then generated to control which of the
3370 loops is executed. The test checks for the alignment of all of the
3371 data references that may or may not be aligned. An additional
3372 sequence of runtime tests is generated for each pairs of DDRs whose
3373 independence was not proven. The vectorized version of loop is
3374 executed only if both alias and alignment tests are passed.
3376 The test generated to check which version of loop is executed
3377 is modified to also check for profitability as indicated by the
3378 cost model threshold TH.
3380 The versioning precondition(s) are placed in *COND_EXPR and
3381 *COND_EXPR_STMT_LIST. */
3383 class loop *
3384 vect_loop_versioning (loop_vec_info loop_vinfo,
3385 gimple *loop_vectorized_call)
3387 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3388 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3389 basic_block condition_bb;
3390 gphi_iterator gsi;
3391 gimple_stmt_iterator cond_exp_gsi;
3392 basic_block merge_bb;
3393 basic_block new_exit_bb;
3394 edge new_exit_e, e;
3395 gphi *orig_phi, *new_phi;
3396 tree cond_expr = NULL_TREE;
3397 gimple_seq cond_expr_stmt_list = NULL;
3398 tree arg;
3399 profile_probability prob = profile_probability::likely ();
3400 gimple_seq gimplify_stmt_list = NULL;
3401 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3402 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3403 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3404 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3405 poly_uint64 versioning_threshold
3406 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3407 tree version_simd_if_cond
3408 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3409 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3411 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3412 && !ordered_p (th, versioning_threshold))
3413 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3414 build_int_cst (TREE_TYPE (scalar_loop_iters),
3415 th - 1));
3416 if (maybe_ne (versioning_threshold, 0U))
3418 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3419 build_int_cst (TREE_TYPE (scalar_loop_iters),
3420 versioning_threshold - 1));
3421 if (cond_expr)
3422 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3423 expr, cond_expr);
3424 else
3425 cond_expr = expr;
3428 if (version_niter)
3429 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3431 if (cond_expr)
3432 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3433 &cond_expr_stmt_list,
3434 is_gimple_condexpr, NULL_TREE);
3436 if (version_align)
3437 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3438 &cond_expr_stmt_list);
3440 if (version_alias)
3442 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3443 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3444 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3447 if (version_simd_if_cond)
3449 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3450 if (flag_checking)
3451 if (basic_block bb
3452 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3453 gcc_assert (bb != loop->header
3454 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3455 && (scalar_loop == NULL
3456 || (bb != scalar_loop->header
3457 && dominated_by_p (CDI_DOMINATORS,
3458 scalar_loop->header, bb))));
3459 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3460 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3461 version_simd_if_cond, zero);
3462 if (cond_expr)
3463 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3464 c, cond_expr);
3465 else
3466 cond_expr = c;
3467 if (dump_enabled_p ())
3468 dump_printf_loc (MSG_NOTE, vect_location,
3469 "created versioning for simd if condition check.\n");
3472 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3473 &gimplify_stmt_list,
3474 is_gimple_condexpr, NULL_TREE);
3475 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3477 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3478 invariant in. */
3479 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3480 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3481 !gsi_end_p (gsi); gsi_next (&gsi))
3483 gimple *stmt = gsi_stmt (gsi);
3484 update_stmt (stmt);
3485 ssa_op_iter iter;
3486 use_operand_p use_p;
3487 basic_block def_bb;
3488 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3489 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3490 && flow_bb_inside_loop_p (outermost, def_bb))
3491 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3494 /* Search for the outermost loop we can version. Avoid versioning of
3495 non-perfect nests but allow if-conversion versioned loops inside. */
3496 class loop *loop_to_version = loop;
3497 if (flow_loop_nested_p (outermost, loop))
3499 if (dump_enabled_p ())
3500 dump_printf_loc (MSG_NOTE, vect_location,
3501 "trying to apply versioning to outer loop %d\n",
3502 outermost->num);
3503 if (outermost->num == 0)
3504 outermost = superloop_at_depth (loop, 1);
3505 /* And avoid applying versioning on non-perfect nests. */
3506 while (loop_to_version != outermost
3507 && single_exit (loop_outer (loop_to_version))
3508 && (!loop_outer (loop_to_version)->inner->next
3509 || vect_loop_vectorized_call (loop_to_version))
3510 && (!loop_outer (loop_to_version)->inner->next
3511 || !loop_outer (loop_to_version)->inner->next->next))
3512 loop_to_version = loop_outer (loop_to_version);
3515 /* Apply versioning. If there is already a scalar version created by
3516 if-conversion re-use that. Note we cannot re-use the copy of
3517 an if-converted outer-loop when vectorizing the inner loop only. */
3518 gcond *cond;
3519 if ((!loop_to_version->inner || loop == loop_to_version)
3520 && loop_vectorized_call)
3522 gcc_assert (scalar_loop);
3523 condition_bb = gimple_bb (loop_vectorized_call);
3524 cond = as_a <gcond *> (last_stmt (condition_bb));
3525 gimple_cond_set_condition_from_tree (cond, cond_expr);
3526 update_stmt (cond);
3528 if (cond_expr_stmt_list)
3530 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3531 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3532 GSI_SAME_STMT);
3535 /* if-conversion uses profile_probability::always () for both paths,
3536 reset the paths probabilities appropriately. */
3537 edge te, fe;
3538 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3539 te->probability = prob;
3540 fe->probability = prob.invert ();
3541 /* We can scale loops counts immediately but have to postpone
3542 scaling the scalar loop because we re-use it during peeling. */
3543 scale_loop_frequencies (loop_to_version, te->probability);
3544 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3546 nloop = scalar_loop;
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "reusing %sloop version created by if conversion\n",
3550 loop_to_version != loop ? "outer " : "");
3552 else
3554 if (loop_to_version != loop
3555 && dump_enabled_p ())
3556 dump_printf_loc (MSG_NOTE, vect_location,
3557 "applying loop versioning to outer loop %d\n",
3558 loop_to_version->num);
3560 initialize_original_copy_tables ();
3561 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3562 prob, prob.invert (), prob, prob.invert (), true);
3563 gcc_assert (nloop);
3564 nloop = get_loop_copy (loop);
3566 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3567 reap those otherwise; they also refer to the original
3568 loops. */
3569 class loop *l = loop;
3570 while (gimple *call = vect_loop_vectorized_call (l))
3572 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3573 fold_loop_internal_call (call, boolean_false_node);
3574 l = loop_outer (l);
3576 free_original_copy_tables ();
3578 if (cond_expr_stmt_list)
3580 cond_exp_gsi = gsi_last_bb (condition_bb);
3581 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3582 GSI_SAME_STMT);
3585 /* Loop versioning violates an assumption we try to maintain during
3586 vectorization - that the loop exit block has a single predecessor.
3587 After versioning, the exit block of both loop versions is the same
3588 basic block (i.e. it has two predecessors). Just in order to simplify
3589 following transformations in the vectorizer, we fix this situation
3590 here by adding a new (empty) block on the exit-edge of the loop,
3591 with the proper loop-exit phis to maintain loop-closed-form.
3592 If loop versioning wasn't done from loop, but scalar_loop instead,
3593 merge_bb will have already just a single successor. */
3595 merge_bb = single_exit (loop_to_version)->dest;
3596 if (EDGE_COUNT (merge_bb->preds) >= 2)
3598 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3599 new_exit_bb = split_edge (single_exit (loop_to_version));
3600 new_exit_e = single_exit (loop_to_version);
3601 e = EDGE_SUCC (new_exit_bb, 0);
3603 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3604 gsi_next (&gsi))
3606 tree new_res;
3607 orig_phi = gsi.phi ();
3608 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3609 new_phi = create_phi_node (new_res, new_exit_bb);
3610 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3611 add_phi_arg (new_phi, arg, new_exit_e,
3612 gimple_phi_arg_location_from_edge (orig_phi, e));
3613 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3617 update_ssa (TODO_update_ssa);
3620 if (version_niter)
3622 /* The versioned loop could be infinite, we need to clear existing
3623 niter information which is copied from the original loop. */
3624 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3625 vect_free_loop_info_assumptions (nloop);
3628 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3629 && dump_enabled_p ())
3631 if (version_alias)
3632 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3633 vect_location,
3634 "loop versioned for vectorization because of "
3635 "possible aliasing\n");
3636 if (version_align)
3637 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3638 vect_location,
3639 "loop versioned for vectorization to enhance "
3640 "alignment\n");
3644 return nloop;