gcc/tree-vect-loop-manip.c (official-gcc.git)
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
260 the mask should have during the first iteration and NEXT_MASK is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265 tree next_mask)
267 gphi *phi = create_phi_node (mask, loop->header);
268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
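/* As an illustration of the selector built below: with 8-element vectors,
   OFFSET 0 gives the low-interleave permutation { 0, 8, 1, 9, 2, 10, 3, 11 },
   while OFFSET 1 gives the high-interleave permutation
   { 4, 12, 5, 13, 6, 14, 7, 15 }.  */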
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
324 rgroup_masks *src_rgm)
326 tree src_masktype = src_rgm->mask_type;
327 tree dest_masktype = dest_rgm->mask_type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
343 tree src = src_rgm->masks[i / 2];
344 tree dest = dest_rgm->masks[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
360 gimple_seq_add_stmt (seq, stmt);
362 return true;
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
376 tree src = src_rgm->masks[i / 2];
377 tree dest = dest_rgm->masks[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
382 return true;
384 return false;
387 /* Helper for vect_set_loop_condition_masked. Generate definitions for
388 all the masks in RGM and return a mask that is nonzero when the loop
389 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
390 Use LOOP_COND_GSI to insert code before the exit gcond.
392 RGM belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
400 It is known that:
402 NITERS * RGM->max_nscalars_per_iter
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
407 VF * RGM->max_nscalars_per_iter
409 might overflow before hitting a value above:
411 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGM. */
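/* As a concrete illustration of the quantities involved: with VF == 8,
   RGM->max_nscalars_per_iter == 2 and NITERS == 21, the rgroup handles
   42 scalar values in total and 16 per vector iteration, so the IV created
   below counts 0, 16, 32, ... and the masks for the final iteration cover
   only the remaining 10 values.  */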
416 static tree
417 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_masks *rgm, tree niters, tree niters_skip,
421 bool might_wrap_p)
423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
425 tree mask_type = rgm->mask_type;
426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
439 /* We checked before choosing to use a fully-masked loop that these
440 multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
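/* For instance, if NSCALARS_TOTAL == 10, NSCALARS_STEP == 16 and
   NSCALARS_SKIP == 3, then the adjustment is 13 and the MAX/MINUS pair
   below computes MAX (10, 13) - 13 == 0, saturating at zero instead of
   letting the unsigned subtraction wrap around.  */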
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
524 else
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
535 first_limit = test_limit;
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
544 /* Provide a definition of each mask in the group. */
545 tree next_mask = NULL_TREE;
546 tree mask;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
550 /* Previous masks will cover BIAS scalars. This mask covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_mask * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
556 /* See whether the first iteration of the vector loop is known
557 to have a full mask. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each mask
565 and adjust the bound down by BIAS. */
566 tree this_test_limit = test_limit;
567 if (i != 0)
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
577 /* Create the initial mask. First include all scalars that
578 are within the loop limit. */
579 tree init_mask = NULL_TREE;
580 if (!first_iteration_full)
582 tree start, end;
583 if (first_limit == test_limit)
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
592 else
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this mask. */
597 start = bias_tree;
598 end = first_limit;
601 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_mask, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
614 bias_tree, nscalars_skip);
615 if (init_mask)
616 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
617 init_mask, unskipped_mask);
618 else
619 init_mask = unskipped_mask;
622 if (!init_mask)
623 /* First iteration is full. */
624 init_mask = build_minus_one_cst (mask_type);
626 /* Get the mask value for the next iteration of the loop. */
627 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
631 vect_set_loop_mask (loop, mask, init_mask, next_mask);
633 return next_mask;
636 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
638 number of iterations of the original scalar loop that should be
639 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
640 as for vect_set_loop_condition.
642 Insert the branch-back condition before LOOP_COND_GSI and return the
643 final gcond. */
645 static gcond *
646 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647 tree niters, tree final_iv,
648 bool niters_maybe_zero,
649 gimple_stmt_iterator loop_cond_gsi)
651 gimple_seq preheader_seq = NULL;
652 gimple_seq header_seq = NULL;
654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655 unsigned int compare_precision = TYPE_PRECISION (compare_type);
656 tree orig_niters = niters;
658 /* Type of the initial value of NITERS. */
659 tree ni_actual_type = TREE_TYPE (niters);
660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
663 /* Convert NITERS to the same size as the compare. */
664 if (compare_precision > ni_actual_precision
665 && niters_maybe_zero)
667 /* We know that there is always at least one iteration, so if the
668 count is zero then it must have wrapped. Cope with this by
669 subtracting 1 before the conversion and adding 1 to the result. */
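/* For example, if the original counter is an unsigned 8-bit value and
   NITERS reads as 0, the loop actually runs 256 times: 0 - 1 == 255 in
   8 bits, widening preserves 255, and adding 1 back yields 256 in the
   wider COMPARE_TYPE.  */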
670 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672 niters, build_minus_one_cst (ni_actual_type));
673 niters = gimple_convert (&preheader_seq, compare_type, niters);
674 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675 niters, build_one_cst (compare_type));
677 else
678 niters = gimple_convert (&preheader_seq, compare_type, niters);
680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
682 /* Iterate over all the rgroups and fill in their masks. We could use
683 the first mask from any rgroup for the loop condition; here we
684 arbitrarily pick the last. */
685 tree test_mask = NULL_TREE;
686 rgroup_masks *rgm;
687 unsigned int i;
688 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689 FOR_EACH_VEC_ELT (*masks, i, rgm)
690 if (!rgm->masks.is_empty ())
692 /* First try using permutes. This adds a single vector
693 instruction to the loop for each mask, but needs no extra
694 loop invariants or IVs. */
695 unsigned int nmasks = i + 1;
696 if ((nmasks & 1) == 0)
698 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699 if (!half_rgm->masks.is_empty ()
700 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701 continue;
704 /* See whether zero-based IV would ever generate all-false masks
705 before wrapping around. */
706 bool might_wrap_p
707 = (iv_limit == -1
708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709 UNSIGNED)
710 > compare_precision));
712 /* Set up all masks for this group. */
713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714 &preheader_seq,
715 loop_cond_gsi, rgm,
716 niters, niters_skip,
717 might_wrap_p);
720 /* Emit all accumulated statements. */
721 add_preheader_seq (loop, preheader_seq);
722 add_header_seq (loop, header_seq);
724 /* Get a boolean result that tells us whether to iterate. */
725 edge exit_edge = single_exit (loop);
726 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729 NULL_TREE, NULL_TREE);
730 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
732 /* The loop iterates (NITERS - 1) / VF + 1 times.
733 Subtract one from this to get the latch count. */
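/* E.g. for NITERS == 10 and VF == 4 the vector loop runs
   (10 - 1) / 4 + 1 == 3 times and the latch count recorded below is
   (10 - 1) / 4 == 2.  */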
734 tree step = build_int_cst (compare_type,
735 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737 build_minus_one_cst (compare_type));
738 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739 niters_minus_one, step);
741 if (final_iv)
743 gassign *assign = gimple_build_assign (final_iv, orig_niters);
744 gsi_insert_on_edge_immediate (single_exit (loop), assign);
747 return cond_stmt;
750 /* Like vect_set_loop_condition, but handle the case in which there
751 are no loop masks. */
753 static gcond *
754 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755 tree step, tree final_iv,
756 bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
776 /* In this case we can use a simple 0-based IV:
779 x = 0;
783 x += 1;
785 while (x < NITERS); */
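/* E.g. for NITERS == 3 the IV is tested at the values 1, 2 and 3, so the
   loop body executes exactly 3 times.  */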
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
790 else
792 /* The following works for all values of NITERS except 0:
795 x = 0;
799 x += STEP;
801 while (x <= NITERS - STEP);
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
817 x = -1;
821 x += STEP;
823 while (x < NITERS - STEP);
825 In both cases the loop limit is NITERS - STEP. */
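/* For example, in case B with NITERS == 8 and STEP == 4 the limit is 4;
   the IV is tested at the values 4 and 8, so the loop body runs
   (8 - 4) / 4 + 1 == 2 times.  */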
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
834 if (niters_maybe_zero)
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
840 else
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
874 if (final_iv)
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
881 return cond_stmt;
884 /* If we're using fully-masked loops, make LOOP iterate:
886 N == (NITERS - 1) / STEP + 1
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
892 If we're not using fully-masked loops, make LOOP iterate:
894 N == (NITERS - STEP) / STEP + 1
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
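/* For instance, a fully-masked loop with NITERS == 10 and STEP == 4
   iterates (10 - 1) / 4 + 1 == 3 times, whereas an unmasked loop with
   NITERS == 8 and STEP == 4 iterates (8 - 4) / 4 + 1 == 2 times.  */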
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917 final_iv, niters_maybe_zero,
918 loop_cond_gsi);
919 else
920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921 final_iv, niters_maybe_zero,
922 loop_cond_gsi);
924 /* Remove old loop exit test. */
925 stmt_vec_info orig_cond_info;
926 if (loop_vinfo
927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928 loop_vinfo->remove_stmt (orig_cond_info);
929 else
930 gsi_remove (&loop_cond_gsi, true);
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934 cond_stmt);
937 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
938 For all PHI arguments in FROM->dest and TO->dest from those
 939      edges, ensure that the TO->dest PHI arguments have their current_def
 940      set to the corresponding current_def in FROM.  */
942 static void
943 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
945 gimple_stmt_iterator gsi_from, gsi_to;
947 for (gsi_from = gsi_start_phis (from->dest),
948 gsi_to = gsi_start_phis (to->dest);
949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
951 gimple *from_phi = gsi_stmt (gsi_from);
952 gimple *to_phi = gsi_stmt (gsi_to);
953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955 if (virtual_operand_p (from_arg))
957 gsi_next (&gsi_from);
958 continue;
960 if (virtual_operand_p (to_arg))
962 gsi_next (&gsi_to);
963 continue;
965 if (TREE_CODE (from_arg) != SSA_NAME)
966 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
967 else if (TREE_CODE (to_arg) == SSA_NAME
968 && from_arg != to_arg)
970 if (get_current_def (to_arg) == NULL_TREE)
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973 TREE_TYPE (get_current_def
974 (from_arg))));
975 set_current_def (to_arg, get_current_def (from_arg));
978 gsi_next (&gsi_from);
979 gsi_next (&gsi_to);
982 gphi *from_phi = get_virtual_phi (from->dest);
983 gphi *to_phi = get_virtual_phi (to->dest);
984 if (from_phi)
985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
990 /* Given LOOP this function generates a new copy of it and puts it
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994 entry or exit of LOOP. */
996 class loop *
997 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998 class loop *scalar_loop, edge e)
1000 class loop *new_loop;
1001 basic_block *new_bbs, *bbs, *pbbs;
1002 bool at_exit;
1003 bool was_imm_dom;
1004 basic_block exit_dest;
1005 edge exit, new_exit;
1006 bool duplicate_outer_loop = false;
1008 exit = single_exit (loop);
1009 at_exit = (e == exit);
1010 if (!at_exit && e != loop_preheader_edge (loop))
1011 return NULL;
1013 if (scalar_loop == NULL)
1014 scalar_loop = loop;
1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1017 pbbs = bbs + 1;
1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1019 /* Allow duplication of outer loops. */
1020 if (scalar_loop->inner)
1021 duplicate_outer_loop = true;
1022 /* Check whether duplication is possible. */
1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1025 free (bbs);
1026 return NULL;
1029 /* Generate new loop structure. */
1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031 duplicate_subloops (scalar_loop, new_loop);
1033 exit_dest = exit->dest;
1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035 exit_dest) == loop->header ?
1036 true : false);
1038 /* Also copy the pre-header, this avoids jumping through hoops to
1039 duplicate the loop entry PHI arguments. Create an empty
1040 pre-header unconditionally for this. */
1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1042 edge entry_e = single_pred_edge (preheader);
1043 bbs[0] = preheader;
1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1046 exit = single_exit (scalar_loop);
1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1048 &exit, 1, &new_exit, NULL,
1049 at_exit ? loop->latch : e->src, true);
1050 exit = single_exit (loop);
1051 basic_block new_preheader = new_bbs[0];
1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1055 if (scalar_loop != loop)
1057 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1058 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1059 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1060 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1061 header) to have current_def set, so copy them over. */
1062 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1063 exit);
1064 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1066 EDGE_SUCC (loop->latch, 0));
1069 if (at_exit) /* Add the loop copy at exit. */
1071 if (scalar_loop != loop)
1073 gphi_iterator gsi;
1074 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1076 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1077 gsi_next (&gsi))
1079 gphi *phi = gsi.phi ();
1080 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1081 location_t orig_locus
1082 = gimple_phi_arg_location_from_edge (phi, e);
1084 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1087 redirect_edge_and_branch_force (e, new_preheader);
1088 flush_pending_stmts (e);
1089 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1090 if (was_imm_dom || duplicate_outer_loop)
1091 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1093 /* And remove the non-necessary forwarder again. Keep the other
1094 one so we have a proper pre-header for the loop at the exit edge. */
1095 redirect_edge_pred (single_succ_edge (preheader),
1096 single_pred (preheader));
1097 delete_basic_block (preheader);
1098 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1099 loop_preheader_edge (scalar_loop)->src);
1101 else /* Add the copy at entry. */
1103 if (scalar_loop != loop)
1105 /* Remove the non-necessary forwarder of scalar_loop again. */
1106 redirect_edge_pred (single_succ_edge (preheader),
1107 single_pred (preheader));
1108 delete_basic_block (preheader);
1109 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1110 loop_preheader_edge (scalar_loop)->src);
1111 preheader = split_edge (loop_preheader_edge (loop));
1112 entry_e = single_pred_edge (preheader);
1115 redirect_edge_and_branch_force (entry_e, new_preheader);
1116 flush_pending_stmts (entry_e);
1117 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1119 redirect_edge_and_branch_force (new_exit, preheader);
1120 flush_pending_stmts (new_exit);
1121 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1123 /* And remove the non-necessary forwarder again. Keep the other
1124 one so we have a proper pre-header for the loop at the exit edge. */
1125 redirect_edge_pred (single_succ_edge (new_preheader),
1126 single_pred (new_preheader));
1127 delete_basic_block (new_preheader);
1128 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1129 loop_preheader_edge (new_loop)->src);
1132 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1133 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1134 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1136 if (scalar_loop != loop)
1138 /* Update new_loop->header PHIs, so that on the preheader
1139 edge they are the ones from loop rather than scalar_loop. */
1140 gphi_iterator gsi_orig, gsi_new;
1141 edge orig_e = loop_preheader_edge (loop);
1142 edge new_e = loop_preheader_edge (new_loop);
1144 for (gsi_orig = gsi_start_phis (loop->header),
1145 gsi_new = gsi_start_phis (new_loop->header);
1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1149 gphi *orig_phi = gsi_orig.phi ();
1150 gphi *new_phi = gsi_new.phi ();
1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152 location_t orig_locus
1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1159 free (new_bbs);
1160 free (bbs);
1162 checking_verify_dominators (CDI_DOMINATORS);
1164 return new_loop;
1168 /* Given the condition expression COND, put it as the last statement of
1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1172 new edge as irreducible if IRREDUCIBLE_P is true. */
1174 static edge
1175 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1176 basic_block guard_to, basic_block dom_bb,
1177 profile_probability probability, bool irreducible_p)
1179 gimple_stmt_iterator gsi;
1180 edge new_e, enter_e;
1181 gcond *cond_stmt;
1182 gimple_seq gimplify_stmt_list = NULL;
1184 enter_e = EDGE_SUCC (guard_bb, 0);
1185 enter_e->flags &= ~EDGE_FALLTHRU;
1186 enter_e->flags |= EDGE_FALSE_VALUE;
1187 gsi = gsi_last_bb (guard_bb);
1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190 NULL_TREE);
1191 if (gimplify_stmt_list)
1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1195 gsi = gsi_last_bb (guard_bb);
1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1198 /* Add new edge to connect guard block to the merge/loop-exit block. */
1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1201 new_e->probability = probability;
1202 if (irreducible_p)
1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1205 enter_e->probability = probability.invert ();
1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1209 if (enter_e->dest->loop_father->header == enter_e->dest)
1210 split_edge (enter_e);
1212 return new_e;
1216 /* This function verifies that the following restrictions apply to LOOP:
1217    (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1218        for an innermost loop, or of 5 basic blocks for an outer loop.
1219 (2) it is single entry, single exit
1220 (3) its exit condition is the last stmt in the header
1221 (4) E is the entry/exit edge of LOOP.
1224 bool
1225 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1227 edge exit_e = single_exit (loop);
1228 edge entry_e = loop_preheader_edge (loop);
1229 gcond *orig_cond = get_loop_exit_condition (loop);
1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1231 unsigned int num_bb = loop->inner? 5 : 2;
1233 /* All loops have an outer scope; the only case loop->outer is NULL is for
1234 the function itself. */
1235 if (!loop_outer (loop)
1236 || loop->num_nodes != num_bb
1237 || !empty_block_p (loop->latch)
1238 || !single_exit (loop)
1239 /* Verify that new loop exit condition can be trivially modified. */
1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241 || (e != exit_e && e != entry_e))
1242 return false;
1244 return true;
1247 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248 in the exit bb and rename all the uses after the loop. This simplifies
1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250 (but normally loop closed SSA form doesn't require virtual PHIs to be
1251    in the same form).  Doing this early simplifies checking which
1252    uses should be renamed.
1254 If we create a new phi after the loop, return the definition that
1255 applies on entry to the loop, otherwise return null. */
1257 static tree
1258 create_lcssa_for_virtual_phi (class loop *loop)
1260 gphi_iterator gsi;
1261 edge exit_e = single_exit (loop);
1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1266 gphi *phi = gsi.phi ();
1267 for (gsi = gsi_start_phis (exit_e->dest);
1268 !gsi_end_p (gsi); gsi_next (&gsi))
1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1270 break;
1271 if (gsi_end_p (gsi))
1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276 imm_use_iterator imm_iter;
1277 gimple *stmt;
1278 use_operand_p use_p;
1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1283 gimple_phi_set_result (new_phi, new_vop);
1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1285 if (stmt != new_phi
1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288 SET_USE (use_p, new_vop);
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1292 break;
1294 return NULL_TREE;
1297 /* Function vect_get_loop_location.
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
1302    Return the loop location on success and NULL if not.  */
1304 dump_user_location_t
1305 find_loop_location (class loop *loop)
1307 gimple *stmt = NULL;
1308 basic_block bb;
1309 gimple_stmt_iterator si;
1311 if (!loop)
1312 return dump_user_location_t ();
1314 stmt = get_loop_exit_condition (loop);
1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318 return stmt;
1320 /* If we got here the loop is probably not "well formed",
1321 try to estimate the loop location */
1323 if (!loop->header)
1324 return dump_user_location_t ();
1326 bb = loop->header;
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1330 stmt = gsi_stmt (si);
1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332 return stmt;
1335 return dump_user_location_t ();
1338 /* Return true if the phi described by STMT_INFO defines an IV of the
1339 loop to be vectorized. */
1341 static bool
1342 iv_phi_p (stmt_vec_info stmt_info)
1344 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345 if (virtual_operand_p (PHI_RESULT (phi)))
1346 return false;
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1352 return true;
1355 /* Function vect_can_advance_ivs_p
1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block bb = loop->header;
1369 gphi_iterator gsi;
1371 /* Analyze phi functions of the loop header. */
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1377 tree evolution_part;
1379 gphi *phi = gsi.phi ();
1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383 phi_info->stmt);
1385 /* Skip virtual phi's. The data dependences that are associated with
1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1388 Skip reduction phis. */
1389 if (!iv_phi_p (phi_info))
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
1394 continue;
1397 /* Analyze the evolution function. */
1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1400 if (evolution_part == NULL_TREE)
1402 if (dump_enabled_p ())
1403 dump_printf (MSG_MISSED_OPTIMIZATION,
1404 "No access function or evolution.\n");
1405 return false;
1408 /* FORNOW: We do not transform initial conditions of IVs
1409 which evolution functions are not invariants in the loop. */
1411 if (!expr_invariant_in_loop_p (loop, evolution_part))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution not invariant in loop.\n");
1416 return false;
1419 /* FORNOW: We do not transform initial conditions of IVs
1420 which evolution functions are a polynomial of degree >= 2. */
1422 if (tree_is_chrec (evolution_part))
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "evolution is chrec.\n");
1427 return false;
1431 return true;
1435 /* Function vect_update_ivs_after_vectorizer.
1437 "Advance" the induction variables of LOOP to the value they should take
1438 after the execution of LOOP. This is currently necessary because the
1439 vectorizer does not handle induction variables that are used after the
1440 loop. Such a situation occurs when the last iterations of LOOP are
1441 peeled, because:
1442 1. We introduced new uses after LOOP for IVs that were not originally used
1443 after LOOP: the IVs of LOOP are now used by an epilog loop.
1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445 times, whereas the loop IVs should be bumped N times.
1447 Input:
1448 - LOOP - a loop that is going to be vectorized. The last few iterations
1449 of LOOP were peeled.
1450 - NITERS - the number of iterations that LOOP executes (before it is
1451 vectorized). i.e, the number of times the ivs should be bumped.
1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453 coming out from LOOP on which there are uses of the LOOP ivs
1454 (this is the path from LOOP->exit to epilog_loop->preheader).
1456 The new definitions of the ivs are placed in LOOP->exit.
1457 The phi args associated with the edge UPDATE_E in the bb
1458 UPDATE_E->dest are updated accordingly.
1460 Assumption 1: Like the rest of the vectorizer, this function assumes
1461 a single loop exit that has a single predecessor.
1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464 organized in the same order.
1466 Assumption 3: The access function of the ivs is simple enough (see
1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1470 coming out of LOOP on which the ivs of LOOP are used (this is the path
1471 that leads to the epilog loop; other paths skip the epilog loop). This
1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473      needs to have its phis updated.  */
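/* For example, an IV with initial value INIT and step S has the value
   INIT + NITERS * S after NITERS scalar iterations, so with INIT == 3,
   S == 2 and NITERS == 10 the epilog loop must start from the value 23.  */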
1476 static void
1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 tree niters, edge update_e)
1480 gphi_iterator gsi, gsi1;
1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482 basic_block update_bb = update_e->dest;
1483 basic_block exit_bb = single_exit (loop)->dest;
1485 /* Make sure there exists a single-predecessor exit bb: */
1486 gcc_assert (single_pred_p (exit_bb));
1487 gcc_assert (single_succ_edge (exit_bb) == update_e);
1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491 gsi_next (&gsi), gsi_next (&gsi1))
1493 tree init_expr;
1494 tree step_expr, off;
1495 tree type;
1496 tree var, ni, ni_name;
1497 gimple_stmt_iterator last_gsi;
1499 gphi *phi = gsi.phi ();
1500 gphi *phi1 = gsi1.phi ();
1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_NOTE, vect_location,
1504 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1506 /* Skip reduction and virtual phis. */
1507 if (!iv_phi_p (phi_info))
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "reduc or virtual phi. skip.\n");
1512 continue;
1515 type = TREE_TYPE (gimple_phi_result (phi));
1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517 step_expr = unshare_expr (step_expr);
1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520 of degree >= 2 or exponential. */
1521 gcc_assert (!tree_is_chrec (step_expr));
1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 fold_convert (TREE_TYPE (step_expr), niters),
1527 step_expr);
1528 if (POINTER_TYPE_P (type))
1529 ni = fold_build_pointer_plus (init_expr, off);
1530 else
1531 ni = fold_build2 (PLUS_EXPR, type,
1532 init_expr, fold_convert (type, off));
1534 var = create_tmp_var (type, "tmp");
1536 last_gsi = gsi_last_bb (exit_bb);
1537 gimple_seq new_stmts = NULL;
1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539 /* Exit_bb shouldn't be empty. */
1540 if (!gsi_end_p (last_gsi))
1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542 else
1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1545 /* Fix phi expressions in the successor bb. */
1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1550 /* Return a gimple value containing the misalignment (measured in vector
1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552 it is away from a perfectly aligned address. Add any new statements
1553 to SEQ. */
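/* For example, with a 16-byte target alignment and 4-byte elements, an
   address ending in 0x8 gives misalign_in_bytes == (addr & 15) == 8 and
   therefore a misalignment of 8 >> 2 == 2 vector elements.  */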
1555 static tree
1556 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559 stmt_vec_info stmt_info = dr_info->stmt;
1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563 unsigned HOST_WIDE_INT target_align_c;
1564 tree target_align_minus_1;
1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 size_zero_node) < 0;
1568 tree offset = (negative
1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 : size_zero_node);
1571 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1572 stmt_info, seq,
1573 offset);
1574 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1575 if (target_align.is_constant (&target_align_c))
1576 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1577 else
1579 tree vla = build_int_cst (type, target_align);
1580 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1581 fold_build2 (MINUS_EXPR, type,
1582 build_int_cst (type, 0), vla));
1583 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1584 build_int_cst (type, 1));
1587 HOST_WIDE_INT elem_size
1588 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1589 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1591 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1592 tree int_start_addr = fold_convert (type, start_addr);
1593 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1594 target_align_minus_1);
1596 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1597 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1598 elem_size_log);
1600 return misalign_in_elems;
1603 /* Function vect_gen_prolog_loop_niters
1605 Generate the number of iterations which should be peeled as prolog for the
1606 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1607 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1608 As a result, after the execution of this loop, the data reference DR will
1609 refer to an aligned location. The following computation is generated:
1611 If the misalignment of DR is known at compile time:
1612 addr_mis = int mis = DR_MISALIGNMENT (dr);
1613 Else, compute address misalignment in bytes:
1614 addr_mis = addr & (target_align - 1)
1616 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1618 (elem_size = element type size; an element is the scalar element whose type
1619 is the inner type of the vectype)
1621 The computations will be emitted at the end of BB. We also compute and
1622 store upper bound (included) of the result in BOUND.
1624 When the step of the data-ref in the loop is not 1 (as in interleaved data
1625 and SLP), the number of iterations of the prolog must be divided by the step
1626 (which is equal to the size of interleaved group).
1628 The above formulas assume that VF == number of elements in the vector. This
1629 may not hold when there are multiple-types in the loop.
1630 In this case, for some data-references in the loop the VF does not represent
1631 the number of elements that fit in the vector. Therefore, instead of VF we
1632 use TYPE_VECTOR_SUBPARTS. */
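/* For example, with 4-byte elements, a 16-byte target alignment (so 4
   elements per vector), step 1 and an address that is 8 bytes past an
   alignment boundary: addr_mis is 2 elements, so the prolog must peel
   ((4 - 2) & 3) / 1 == 2 scalar iterations before DR becomes aligned.  */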
1634 static tree
1635 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1636 basic_block bb, int *bound)
1638 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1639 tree var;
1640 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1641 gimple_seq stmts = NULL, new_stmts = NULL;
1642 tree iters, iters_name;
1643 stmt_vec_info stmt_info = dr_info->stmt;
1644 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1645 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1647 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1649 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_NOTE, vect_location,
1653 "known peeling = %d.\n", npeel);
1655 iters = build_int_cst (niters_type, npeel);
1656 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1658 else
1660 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1661 tree type = TREE_TYPE (misalign_in_elems);
1662 HOST_WIDE_INT elem_size
1663 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1664 /* We only do prolog peeling if the target alignment is known at compile
1665 time. */
1666 poly_uint64 align_in_elems =
1667 exact_div (target_align, elem_size);
1668 tree align_in_elems_minus_1 =
1669 build_int_cst (type, align_in_elems - 1);
1670 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1672 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1673 & (align_in_elems - 1)). */
1674 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1675 size_zero_node) < 0;
1676 if (negative)
1677 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1678 align_in_elems_tree);
1679 else
1680 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1681 misalign_in_elems);
1682 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1683 iters = fold_convert (niters_type, iters);
1684 unsigned HOST_WIDE_INT align_in_elems_c;
1685 if (align_in_elems.is_constant (&align_in_elems_c))
1686 *bound = align_in_elems_c - 1;
1687 else
1688 *bound = -1;
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_NOTE, vect_location,
1693 "niters for prolog loop: %T\n", iters);
1695 var = create_tmp_var (niters_type, "prolog_loop_niters");
1696 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1698 if (new_stmts)
1699 gimple_seq_add_seq (&stmts, new_stmts);
1700 if (stmts)
1702 gcc_assert (single_succ_p (bb));
1703 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1704 if (gsi_end_p (gsi))
1705 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1706 else
1707 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1709 return iters_name;
1713 /* Function vect_update_init_of_dr
1715 If CODE is PLUS, the vector loop starts NITERS iterations after the
1716 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1717 iterations before the scalar one (using masking to skip inactive
1718 elements). This function updates the information recorded in DR to
1719 account for the difference. Specifically, it updates the OFFSET
1720 field of DR_INFO. */
1722 static void
1723 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1725 struct data_reference *dr = dr_info->dr;
1726 tree offset = dr_info->offset;
1727 if (!offset)
1728 offset = build_zero_cst (sizetype);
1730 niters = fold_build2 (MULT_EXPR, sizetype,
1731 fold_convert (sizetype, niters),
1732 fold_convert (sizetype, DR_STEP (dr)));
1733 offset = fold_build2 (code, sizetype,
1734 fold_convert (sizetype, offset), niters);
1735 dr_info->offset = offset;
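/* Illustrative sketch (not part of GCC): what the OFFSET update above amounts
   to in plain arithmetic.  Names are hypothetical; NITERS and STEP stand for
   the peeled iteration count and DR_STEP, and PLUS_P selects whether the
   vector loop starts after (PLUS_EXPR) or before (MINUS_EXPR) the scalar
   one.  */

static long
updated_dr_offset_sketch (long offset, long niters, long step, int plus_p)
{
  long delta = niters * step;
  return plus_p ? offset + delta : offset - delta;
}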
1739 /* Function vect_update_inits_of_drs
1741 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1742 CODE and NITERS are as for vect_update_init_of_dr. */
1744 void
1745 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1746 tree_code code)
1748 unsigned int i;
1749 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1750 struct data_reference *dr;
1752 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1754 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1755 here, but since we might use these niters to update the epilogues niters
1756 and data references we can't insert them here as this definition might not
1757 always dominate its uses. */
1758 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1759 niters = fold_convert (sizetype, niters);
1761 FOR_EACH_VEC_ELT (datarefs, i, dr)
1763 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1764 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1765 vect_update_init_of_dr (dr_info, niters, code);
1769 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770 by masking. This involves calculating the number of iterations to
1771 be peeled and then aligning all memory references appropriately. */
1773 void
1774 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1776 tree misalign_in_elems;
1777 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1779 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1781 /* From the information recorded in LOOP_VINFO get the number of iterations
1782 that need to be skipped via masking. */
1783 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1785 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787 misalign_in_elems = build_int_cst (type, misalign);
1789 else
1791 gimple_seq seq1 = NULL, seq2 = NULL;
1792 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793 misalign_in_elems = fold_convert (type, misalign_in_elems);
1794 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795 &seq2, true, NULL_TREE);
1796 gimple_seq_add_seq (&seq1, seq2);
1797 if (seq1)
1799 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801 gcc_assert (!new_bb);
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location,
1807 "misalignment for fully-masked loop: %T\n",
1808 misalign_in_elems);
1810 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1812 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
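/* Illustrative sketch (not part of GCC): for fully-masked peeling with a
   known peeling amount, the number of inactive leading elements is simply
   VF - PEELING_FOR_ALIGNMENT.  E.g. with VF == 8 and a peeling amount of 3,
   the first 8 - 3 = 5 lanes of the first vector iteration are masked off.
   The name and values are hypothetical.  */

static unsigned
masked_skip_niters_sketch (unsigned vf, unsigned peeling_for_alignment)
{
  return vf - peeling_for_alignment;
}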
1815 /* This function builds ni_name = number of iterations. Statements
1816 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1817 it to TRUE if new ssa_var is generated. */
1819 tree
1820 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1822 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1823 if (TREE_CODE (ni) == INTEGER_CST)
1824 return ni;
1825 else
1827 tree ni_name, var;
1828 gimple_seq stmts = NULL;
1829 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1831 var = create_tmp_var (TREE_TYPE (ni), "niters");
1832 ni_name = force_gimple_operand (ni, &stmts, false, var);
1833 if (stmts)
1835 gsi_insert_seq_on_edge_immediate (pe, stmts);
1836 if (new_var_p != NULL)
1837 *new_var_p = true;
1840 return ni_name;
1844 /* Calculate the number of iterations above which the vectorized loop will be
1845 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1846 of prolog loop. If it's integer const, the integer number is also passed
1847 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1848 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1849 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1850 threshold below which the scalar (rather than vectorized) loop will be
1851 executed. This function stores the upper bound (inclusive) of the result
1852 in BOUND_SCALAR. */
1854 static tree
1855 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856 int bound_prolog, poly_int64 bound_epilog, int th,
1857 poly_uint64 *bound_scalar,
1858 bool check_profitability)
1860 tree type = TREE_TYPE (niters_prolog);
1861 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862 build_int_cst (type, bound_epilog));
1864 *bound_scalar = bound_prolog + bound_epilog;
1865 if (check_profitability)
1867 /* TH indicates the minimum niters of vectorized loop, while we
1868 compute the maximum niters of scalar loop. */
1869 th--;
1870 /* Peeling a constant number of times. */
1871 if (int_niters_prolog >= 0)
1873 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1874 return build_int_cst (type, *bound_scalar);
1876 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1877 and BOUND_EPILOG are inclusive upper bounds. */
1878 if (known_ge (th, bound_prolog + bound_epilog))
1880 *bound_scalar = th;
1881 return build_int_cst (type, th);
1883 /* Need to do runtime comparison. */
1884 else if (maybe_gt (th, bound_epilog))
1886 *bound_scalar = upper_bound (*bound_scalar, th);
1887 return fold_build2 (MAX_EXPR, type,
1888 build_int_cst (type, th), niters);
1891 return niters;
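/* Illustrative sketch (not part of GCC): the profitability clamp above for
   the case where the prolog peels a known constant number of iterations.
   TH is the threshold after the decrement above, i.e. the largest iteration
   count for which the scalar loop is still preferred.  Names and values are
   hypothetical.  */

static unsigned
scalar_niters_bound_sketch (unsigned niters_prolog, unsigned bound_epilog,
			    unsigned th)
{
  unsigned bound = niters_prolog + bound_epilog;
  return bound > th ? bound : th;	/* upper_bound (bound, th)  */
}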
1894 /* NITERS is the number of times that the original scalar loop executes
1895 after peeling. Work out the maximum number of iterations N that can
1896 be handled by the vectorized form of the loop and then either:
1898 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1900 niters_vector = N
1902 b) set *STEP_VECTOR_PTR to one and generate:
1904 niters_vector = N / vf
1906 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1908 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1910 void
1911 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912 tree *niters_vector_ptr, tree *step_vector_ptr,
1913 bool niters_no_overflow)
1915 tree ni_minus_gap, var;
1916 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1918 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919 tree log_vf = NULL_TREE;
1921 /* If epilogue loop is required because of data accesses with gaps, we
1922 subtract one iteration from the total number of iterations here for
1923 correct calculation of RATIO. */
1924 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1926 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927 build_one_cst (type));
1928 if (!is_gimple_val (ni_minus_gap))
1930 var = create_tmp_var (type, "ni_gap");
1931 gimple *stmts = NULL;
1932 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1933 true, var);
1934 gsi_insert_seq_on_edge_immediate (pe, stmts);
1937 else
1938 ni_minus_gap = niters;
1940 unsigned HOST_WIDE_INT const_vf;
1941 if (vf.is_constant (&const_vf)
1942 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1944 /* Create: niters >> log2(vf) */
1945 /* If it's known that niters == number of latch executions + 1 doesn't
1946 overflow, we can generate niters >> log2(vf); otherwise we generate
1947 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1948 will be at least one. */
1949 log_vf = build_int_cst (type, exact_log2 (const_vf));
1950 if (niters_no_overflow)
1951 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1952 else
1953 niters_vector
1954 = fold_build2 (PLUS_EXPR, type,
1955 fold_build2 (RSHIFT_EXPR, type,
1956 fold_build2 (MINUS_EXPR, type,
1957 ni_minus_gap,
1958 build_int_cst (type, vf)),
1959 log_vf),
1960 build_int_cst (type, 1));
1961 step_vector = build_one_cst (type);
1963 else
1965 niters_vector = ni_minus_gap;
1966 step_vector = build_int_cst (type, vf);
1969 if (!is_gimple_val (niters_vector))
1971 var = create_tmp_var (type, "bnd");
1972 gimple_seq stmts = NULL;
1973 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1974 gsi_insert_seq_on_edge_immediate (pe, stmts);
1975 /* The peeling algorithm guarantees that the vector loop bound is at least
1976 ONE, so we set range information to make the niters analyzer's life easier. */
1977 if (stmts != NULL && log_vf)
1978 set_range_info (niters_vector, VR_RANGE,
1979 wi::to_wide (build_int_cst (type, 1)),
1980 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1981 TYPE_MAX_VALUE (type),
1982 log_vf)));
1984 *niters_vector_ptr = niters_vector;
1985 *step_vector_ptr = step_vector;
1987 return;
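/* Illustrative sketch (not part of GCC): the two ways of computing the
   vector iteration count above, assuming a constant power-of-two VF and
   hypothetical values.  When NITERS is known not to overflow we can use a
   plain shift; otherwise we rely on NITERS >= VF and compute
   ((NITERS - VF) >> log2 (VF)) + 1, which gives the same result without
   depending on the wrapped value.  */

static unsigned
vector_niters_sketch (unsigned niters, unsigned log2_vf, int no_overflow)
{
  unsigned vf = 1u << log2_vf;
  if (no_overflow)
    return niters >> log2_vf;
  return ((niters - vf) >> log2_vf) + 1;
}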
1990 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1991 loop specified by LOOP_VINFO after vectorization, compute the number
1992 of iterations before vectorization (niters_vector * vf) and store it
1993 to NITERS_VECTOR_MULT_VF_PTR. */
1995 static void
1996 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1997 tree niters_vector,
1998 tree *niters_vector_mult_vf_ptr)
2000 /* We should be using a step_vector of VF if VF is variable. */
2001 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2002 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2003 tree type = TREE_TYPE (niters_vector);
2004 tree log_vf = build_int_cst (type, exact_log2 (vf));
2005 basic_block exit_bb = single_exit (loop)->dest;
2007 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2008 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2009 niters_vector, log_vf);
2010 if (!is_gimple_val (niters_vector_mult_vf))
2012 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2013 gimple_seq stmts = NULL;
2014 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2015 &stmts, true, var);
2016 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2017 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2019 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
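/* Illustrative sketch (not part of GCC): converting the vector iteration
   count back to the number of scalar iterations it covers is the inverse
   shift for a constant power-of-two VF.  Names are hypothetical.  */

static unsigned
scalar_iters_covered_sketch (unsigned niters_vector, unsigned log2_vf)
{
  return niters_vector << log2_vf;
}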
2022 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2023 this function searches for the corresponding lcssa phi node in exit
2024 bb of LOOP. If it is found, return the phi result; otherwise return
2025 NULL. */
2027 static tree
2028 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2029 gphi *lcssa_phi)
2031 gphi_iterator gsi;
2032 edge e = single_exit (loop);
2034 gcc_assert (single_pred_p (e->dest));
2035 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2037 gphi *phi = gsi.phi ();
2038 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2039 PHI_ARG_DEF (lcssa_phi, 0), 0))
2040 return PHI_RESULT (phi);
2042 return NULL_TREE;
2045 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2046 from SECOND/FIRST and puts it at the original loop's preheader/exit
2047 edge, the two loops are arranged as below:
2049 preheader_a:
2050 first_loop:
2051 header_a:
2052 i_1 = PHI<i_0, i_2>;
2054 i_2 = i_1 + 1;
2055 if (cond_a)
2056 goto latch_a;
2057 else
2058 goto between_bb;
2059 latch_a:
2060 goto header_a;
2062 between_bb:
2063 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2065 second_loop:
2066 header_b:
2067 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2068 or with i_2 if no LCSSA phi is created
2069 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2071 i_4 = i_3 + 1;
2072 if (cond_b)
2073 goto latch_b;
2074 else
2075 goto exit_bb;
2076 latch_b:
2077 goto header_b;
2079 exit_bb:
2081 This function creates loop-closed SSA for the first loop and updates the
2082 second loop's PHI nodes by replacing the argument on the incoming edge with
2083 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2084 is false, loop-closed ssa phis will only be created for non-iv phis for
2085 the first loop.
2087 This function assumes the exit bb of the first loop is the preheader bb of
2088 the second loop, i.e., between_bb in the example code. With PHIs updated,
2089 the second loop will execute the remaining iterations of the first. */
2091 static void
2092 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2093 class loop *first, class loop *second,
2094 bool create_lcssa_for_iv_phis)
2096 gphi_iterator gsi_update, gsi_orig;
2097 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2099 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2100 edge second_preheader_e = loop_preheader_edge (second);
2101 basic_block between_bb = single_exit (first)->dest;
2103 gcc_assert (between_bb == second_preheader_e->src);
2104 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2105 /* Either the first loop or the second is the loop to be vectorized. */
2106 gcc_assert (loop == first || loop == second);
2108 for (gsi_orig = gsi_start_phis (first->header),
2109 gsi_update = gsi_start_phis (second->header);
2110 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2111 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2113 gphi *orig_phi = gsi_orig.phi ();
2114 gphi *update_phi = gsi_update.phi ();
2116 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2117 /* Generate lcssa PHI node for the first loop. */
2118 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2119 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2120 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2122 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2123 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2124 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2125 arg = new_res;
2128 /* Update PHI node in the second loop by replacing arg on the loop's
2129 incoming edge. */
2130 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2133 /* For epilogue peeling we have to make sure to copy all LC PHIs
2134 for correct vectorization of live stmts. */
2135 if (loop == first)
2137 basic_block orig_exit = single_exit (second)->dest;
2138 for (gsi_orig = gsi_start_phis (orig_exit);
2139 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2141 gphi *orig_phi = gsi_orig.phi ();
2142 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2143 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2144 continue;
2146 /* Already created in the above loop. */
2147 if (find_guard_arg (first, second, orig_phi))
2148 continue;
2150 tree new_res = copy_ssa_name (orig_arg);
2151 gphi *lcphi = create_phi_node (new_res, between_bb);
2152 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2157 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2158 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2159 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2160 appear like below:
2162 guard_bb:
2163 if (cond)
2164 goto merge_bb;
2165 else
2166 goto skip_loop;
2168 skip_loop:
2169 header_a:
2170 i_1 = PHI<i_0, i_2>;
2172 i_2 = i_1 + 1;
2173 if (cond_a)
2174 goto latch_a;
2175 else
2176 goto exit_a;
2177 latch_a:
2178 goto header_a;
2180 exit_a:
2181 i_5 = PHI<i_2>;
2183 merge_bb:
2184 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2186 update_loop:
2187 header_b:
2188 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2190 i_4 = i_3 + 1;
2191 if (cond_b)
2192 goto latch_b;
2193 else
2194 goto exit_bb;
2195 latch_b:
2196 goto header_b;
2198 exit_bb:
2200 This function creates PHI nodes at merge_bb and replaces the use of i_5
2201 in the update_loop's PHI node with the new PHI result. */
2203 static void
2204 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2205 class loop *update_loop,
2206 edge guard_edge, edge merge_edge)
2208 location_t merge_loc, guard_loc;
2209 edge orig_e = loop_preheader_edge (skip_loop);
2210 edge update_e = loop_preheader_edge (update_loop);
2211 gphi_iterator gsi_orig, gsi_update;
2213 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2214 gsi_update = gsi_start_phis (update_loop->header));
2215 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2216 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2218 gphi *orig_phi = gsi_orig.phi ();
2219 gphi *update_phi = gsi_update.phi ();
2221 /* Generate new phi node at merge bb of the guard. */
2222 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2223 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2225 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2226 args in NEW_PHI for these edges. */
2227 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2228 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2229 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2230 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2231 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2232 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2234 /* Update phi in UPDATE_PHI. */
2235 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2239 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2240 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2241 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2242 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2243 The CFG looks like:
2245 loop:
2246 header_a:
2247 i_1 = PHI<i_0, i_2>;
2249 i_2 = i_1 + 1;
2250 if (cond_a)
2251 goto latch_a;
2252 else
2253 goto exit_a;
2254 latch_a:
2255 goto header_a;
2257 exit_a:
2259 guard_bb:
2260 if (cond)
2261 goto merge_bb;
2262 else
2263 goto epilog_loop;
2265 ;; fall_through_bb
2267 epilog_loop:
2268 header_b:
2269 i_3 = PHI<i_2, i_4>;
2271 i_4 = i_3 + 1;
2272 if (cond_b)
2273 goto latch_b;
2274 else
2275 goto merge_bb;
2276 latch_b:
2277 goto header_b;
2279 merge_bb:
2280 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2282 exit_bb:
2283 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2285 For each name used outside EPILOG (i.e. for each name that has an lcssa
2286 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2287 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2288 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2289 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2290 in exit_bb will also be updated. */
2292 static void
2293 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2294 edge guard_edge, edge merge_edge)
2296 gphi_iterator gsi;
2297 basic_block merge_bb = guard_edge->dest;
2299 gcc_assert (single_succ_p (merge_bb));
2300 edge e = single_succ_edge (merge_bb);
2301 basic_block exit_bb = e->dest;
2302 gcc_assert (single_pred_p (exit_bb));
2303 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2305 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2307 gphi *update_phi = gsi.phi ();
2308 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2310 tree merge_arg = NULL_TREE;
2312 /* If the old argument is a SSA_NAME use its current_def. */
2313 if (TREE_CODE (old_arg) == SSA_NAME)
2314 merge_arg = get_current_def (old_arg);
2315 /* If it's a constant or doesn't have a current_def, just use the old
2316 argument. */
2317 if (!merge_arg)
2318 merge_arg = old_arg;
2320 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2321 /* If the var is live after loop but not a reduction, we simply
2322 use the old arg. */
2323 if (!guard_arg)
2324 guard_arg = old_arg;
2326 /* Create new phi node in MERGE_BB: */
2327 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2328 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2330 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2331 the two PHI args in merge_phi for these edges. */
2332 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2333 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2335 /* Update the original phi in exit_bb. */
2336 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2340 /* EPILOG loop is duplicated from the original loop for vectorizing,
2341 the arg of its loop closed ssa PHI needs to be updated. */
2343 static void
2344 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2346 gphi_iterator gsi;
2347 basic_block exit_bb = single_exit (epilog)->dest;
2349 gcc_assert (single_pred_p (exit_bb));
2350 edge e = EDGE_PRED (exit_bb, 0);
2351 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2352 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2355 /* Function vect_do_peeling.
2357 Input:
2358 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2360 preheader:
2361 LOOP:
2362 header_bb:
2363 loop_body
2364 if (exit_loop_cond) goto exit_bb
2365 else goto header_bb
2366 exit_bb:
2368 - NITERS: The number of iterations of the loop.
2369 - NITERSM1: The number of iterations of the loop's latch.
2370 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2371 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2372 CHECK_PROFITABILITY is true.
2373 Output:
2374 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2375 iterate after vectorization; see vect_set_loop_condition for details.
2376 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2377 should be set to the number of scalar iterations handled by the
2378 vector loop. The SSA name is only used on exit from the loop.
2380 This function peels prolog and epilog from the loop, adds guards skipping
2381 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2382 would look like:
2384 guard_bb_1:
2385 if (prefer_scalar_loop) goto merge_bb_1
2386 else goto guard_bb_2
2388 guard_bb_2:
2389 if (skip_prolog) goto merge_bb_2
2390 else goto prolog_preheader
2392 prolog_preheader:
2393 PROLOG:
2394 prolog_header_bb:
2395 prolog_body
2396 if (exit_prolog_cond) goto prolog_exit_bb
2397 else goto prolog_header_bb
2398 prolog_exit_bb:
2400 merge_bb_2:
2402 vector_preheader:
2403 VECTOR LOOP:
2404 vector_header_bb:
2405 vector_body
2406 if (exit_vector_cond) goto vector_exit_bb
2407 else goto vector_header_bb
2408 vector_exit_bb:
2410 guard_bb_3:
2411 if (skip_epilog) goto merge_bb_3
2412 else goto epilog_preheader
2414 merge_bb_1:
2416 epilog_preheader:
2417 EPILOG:
2418 epilog_header_bb:
2419 epilog_body
2420 if (exit_epilog_cond) goto merge_bb_3
2421 else goto epilog_header_bb
2423 merge_bb_3:
2425 Note this function peels the prolog and epilog, and adds the corresponding
2426 guards, only when necessary.
2427 This function returns the epilogue loop if a decision was made to vectorize
2428 it, otherwise NULL.
2430 The analysis resulting in this epilogue loop's loop_vec_info was performed
2431 in the same vect_analyze_loop call as the main loop's. At that time
2432 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2433 vectorization factors than the main loop. This list is stored in the main
2434 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2435 vectorize the epilogue loop for a lower vectorization factor, the
2436 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2437 updated and linked to the epilogue loop. This is later used to vectorize
2438 the epilogue. The reason the loop_vec_info needs updating is that it was
2439 constructed based on the original main loop, and the epilogue loop is a
2440 copy of this loop, so all links pointing to statements in the original loop
2441 need updating. Furthermore, these loop_vec_infos share the
2442 data_reference's records, which will also need to be updated.
2444 TODO: Guard for prefer_scalar_loop should be emitted along with
2445 versioning conditions if loop versioning is needed. */
2448 class loop *
2449 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2450 tree *niters_vector, tree *step_vector,
2451 tree *niters_vector_mult_vf_var, int th,
2452 bool check_profitability, bool niters_no_overflow,
2453 tree *advance)
2455 edge e, guard_e;
2456 tree type = TREE_TYPE (niters), guard_cond;
2457 basic_block guard_bb, guard_to;
2458 profile_probability prob_prolog, prob_vector, prob_epilog;
2459 int estimated_vf;
2460 int prolog_peeling = 0;
2461 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2462 /* We currently do not support prolog peeling if the target alignment is not
2463 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2464 target alignment being constant. */
2465 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2466 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2467 return NULL;
2469 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2470 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2472 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2473 poly_uint64 bound_epilog = 0;
2474 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2475 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2476 bound_epilog += vf - 1;
2477 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2478 bound_epilog += 1;
2479 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2480 poly_uint64 bound_scalar = bound_epilog;
2482 if (!prolog_peeling && !epilog_peeling)
2483 return NULL;
2485 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2486 estimated_vf = vect_vf_for_cost (loop_vinfo);
2487 if (estimated_vf == 2)
2488 estimated_vf = 3;
2489 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2490 .apply_scale (estimated_vf - 1, estimated_vf);
2492 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2493 class loop *first_loop = loop;
2494 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2496 /* We might have a queued need to update virtual SSA form. As we
2497 delete the update SSA machinery below after doing a regular
2498 incremental SSA update during loop copying make sure we don't
2499 lose that fact.
2500 ??? Needing to update virtual SSA form by renaming is unfortunate
2501 but not all of the vectorizer code inserting new loads / stores
2502 properly assigns virtual operands to those statements. */
2503 update_ssa (TODO_update_ssa_only_virtuals);
2505 create_lcssa_for_virtual_phi (loop);
2507 /* If we're vectorizing an epilogue loop, the update_ssa above will
2508 have ensured that the virtual operand is in SSA form throughout the
2509 vectorized main loop. Normally it is possible to trace the updated
2510 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2511 back to scalar-stmt vuses, meaning that the effect of the SSA update
2512 remains local to the main loop. However, there are rare cases in
2513 which the vectorized loop has vdefs even when the original scalar
2514 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2515 introduces clobbers of the temporary vector array, which in turn
2516 needs new vdefs. If the scalar loop doesn't write to memory, these
2517 new vdefs will be the only ones in the vector loop.
2519 In that case, update_ssa will have added a new virtual phi to the
2520 main loop, which previously didn't need one. Ensure that we (locally)
2521 maintain LCSSA form for the virtual operand, just as we would have
2522 done if the virtual phi had existed from the outset. This makes it
2523 easier to duplicate the scalar epilogue loop below. */
2524 tree vop_to_rename = NULL_TREE;
2525 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2527 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2528 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2531 if (MAY_HAVE_DEBUG_BIND_STMTS)
2533 gcc_assert (!adjust_vec.exists ());
2534 adjust_vec.create (32);
2536 initialize_original_copy_tables ();
2538 /* Record the anchor bb at which the guard should be placed if the scalar
2539 loop might be preferred. */
2540 basic_block anchor = loop_preheader_edge (loop)->src;
2542 /* Generate the number of iterations for the prolog loop. We do this here
2543 so that we can also get the upper bound on the number of iterations. */
2544 tree niters_prolog;
2545 int bound_prolog = 0;
2546 if (prolog_peeling)
2547 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2548 &bound_prolog);
2549 else
2550 niters_prolog = build_int_cst (type, 0);
2552 loop_vec_info epilogue_vinfo = NULL;
2553 if (vect_epilogues)
2555 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2556 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2559 tree niters_vector_mult_vf = NULL_TREE;
2560 /* Saving NITERs before the loop, as this may be changed by prologue. */
2561 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2562 edge update_e = NULL, skip_e = NULL;
2563 unsigned int lowest_vf = constant_lower_bound (vf);
2564 /* If we know the number of scalar iterations for the main loop we should
2565 check whether after the main loop there are enough iterations left over
2566 for the epilogue. */
2567 if (vect_epilogues
2568 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2569 && prolog_peeling >= 0
2570 && known_eq (vf, lowest_vf))
2572 unsigned HOST_WIDE_INT eiters
2573 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2574 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2576 eiters -= prolog_peeling;
2577 eiters
2578 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2580 unsigned int ratio;
2581 unsigned int epilogue_gaps
2582 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2583 while (!(constant_multiple_p
2584 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2585 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2586 && eiters >= lowest_vf / ratio + epilogue_gaps))
2588 delete epilogue_vinfo;
2589 epilogue_vinfo = NULL;
2590 if (loop_vinfo->epilogue_vinfos.length () == 0)
2592 vect_epilogues = false;
2593 break;
2595 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2596 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2597 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2600 /* Prolog loop may be skipped. */
2601 bool skip_prolog = (prolog_peeling != 0);
2602 /* Skip the vectorized loop and go straight to the epilog when there are not
2603 enough iterations to enter the vectorized loop; if so we perform runtime
2604 checks on NITERS to decide whether to skip it. If we know the number of
2605 scalar iterations, we only add such a runtime check when that number may
2606 potentially be smaller than the number of iterations required to enter
2607 the vectorized loop; for this we use the upper bounds on the prolog and
2608 epilog peeling.
2609 When we don't know the number of iterations and don't require versioning,
2610 it is because we have asserted that there are enough scalar iterations to
2611 enter the main loop, so this skip is not necessary. When we are versioning
2612 then we only add such a skip if we have chosen to vectorize the
2613 epilogue. */
2614 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2615 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2616 bound_prolog + bound_epilog)
2617 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2618 || vect_epilogues));
2619 /* Epilog loop must be executed if the number of iterations for epilog
2620 loop is known at compile time, otherwise we need to add a check at
2621 the end of vector loop and skip to the end of epilog loop. */
2622 bool skip_epilog = (prolog_peeling < 0
2623 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624 || !vf.is_constant ());
2625 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2626 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2627 skip_epilog = false;
2629 if (skip_vector)
2631 split_edge (loop_preheader_edge (loop));
2633 /* Due to the order in which we peel prolog and epilog, we first
2634 propagate probability to the whole loop. The purpose is to
2635 avoid adjusting probabilities of both prolog and vector loops
2636 separately. Note in this case, the probability of epilog loop
2637 needs to be scaled back later. */
2638 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2639 if (prob_vector.initialized_p ())
2641 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2642 scale_loop_profile (loop, prob_vector, 0);
2646 dump_user_location_t loop_loc = find_loop_location (loop);
2647 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2648 if (vect_epilogues)
2649 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2650 use the original scalar loop as remaining epilogue if necessary. */
2651 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2652 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2654 if (prolog_peeling)
2656 e = loop_preheader_edge (loop);
2657 if (!slpeel_can_duplicate_loop_p (loop, e))
2659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2660 "loop can't be duplicated to preheader edge.\n");
2661 gcc_unreachable ();
2663 /* Peel prolog and put it on preheader edge of loop. */
2664 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2665 if (!prolog)
2667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2668 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2669 gcc_unreachable ();
2671 prolog->force_vectorize = false;
2672 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2673 first_loop = prolog;
2674 reset_original_copy_tables ();
2676 /* Update the number of iterations for prolog loop. */
2677 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2678 vect_set_loop_condition (prolog, NULL, niters_prolog,
2679 step_prolog, NULL_TREE, false);
2681 /* Skip the prolog loop. */
2682 if (skip_prolog)
2684 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2685 niters_prolog, build_int_cst (type, 0));
2686 guard_bb = loop_preheader_edge (prolog)->src;
2687 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2688 guard_to = split_edge (loop_preheader_edge (loop));
2689 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2690 guard_to, guard_bb,
2691 prob_prolog.invert (),
2692 irred_flag);
2693 e = EDGE_PRED (guard_to, 0);
2694 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2695 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2697 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2698 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2701 /* Update init address of DRs. */
2702 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2703 /* Update niters for vector loop. */
2704 LOOP_VINFO_NITERS (loop_vinfo)
2705 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2706 LOOP_VINFO_NITERSM1 (loop_vinfo)
2707 = fold_build2 (MINUS_EXPR, type,
2708 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2709 bool new_var_p = false;
2710 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2711 /* It's guaranteed that vector loop bound before vectorization is at
2712 least VF, so set range information for newly generated var. */
2713 if (new_var_p)
2714 set_range_info (niters, VR_RANGE,
2715 wi::to_wide (build_int_cst (type, vf)),
2716 wi::to_wide (TYPE_MAX_VALUE (type)));
2718 /* Prolog iterates at most bound_prolog times, latch iterates at
2719 most bound_prolog - 1 times. */
2720 record_niter_bound (prolog, bound_prolog - 1, false, true);
2721 delete_update_ssa ();
2722 adjust_vec_debug_stmts ();
2723 scev_reset ();
2726 if (epilog_peeling)
2728 e = single_exit (loop);
2729 if (!slpeel_can_duplicate_loop_p (loop, e))
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2732 "loop can't be duplicated to exit edge.\n");
2733 gcc_unreachable ();
2735 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2736 said epilog then we should use a copy of the main loop as a starting
2737 point. This loop may have already had some preliminary transformations
2738 to allow for more optimal vectorization, for example if-conversion.
2739 If we are not vectorizing the epilog then we should use the scalar loop
2740 as the transformations mentioned above make less or no sense when not
2741 vectorizing. */
2742 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2743 if (vop_to_rename)
2745 /* Vectorizing the main loop can sometimes introduce a vdef to
2746 a loop that previously didn't have one; see the comment above
2747 the definition of VOP_TO_RENAME for details. The definition
2748 D that holds on E will then be different from the definition
2749 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2750 rename VOP_TO_RENAME to D when copying the loop.
2752 The virtual operand is in LCSSA form for the main loop,
2753 and no stmt between the main loop and E needs a vdef,
2754 so we know that D is provided by a phi rather than by a
2755 vdef on a normal gimple stmt. */
2756 basic_block vdef_bb = e->src;
2757 gphi *vphi;
2758 while (!(vphi = get_virtual_phi (vdef_bb)))
2759 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2760 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2761 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2763 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2764 if (!epilog)
2766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2767 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2768 gcc_unreachable ();
2770 epilog->force_vectorize = false;
2771 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2773 /* Scalar version loop may be preferred. In this case, add guard
2774 and skip to epilog. Note this only happens when the number of
2775 iterations of loop is unknown at compile time, otherwise this
2776 won't be vectorized. */
2777 if (skip_vector)
2779 /* Additional epilogue iteration is peeled if gap exists. */
2780 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2781 bound_prolog, bound_epilog,
2782 th, &bound_scalar,
2783 check_profitability);
2784 /* Build guard against NITERSM1 since NITERS may overflow. */
2785 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2786 guard_bb = anchor;
2787 guard_to = split_edge (loop_preheader_edge (epilog));
2788 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2789 guard_to, guard_bb,
2790 prob_vector.invert (),
2791 irred_flag);
2792 skip_e = guard_e;
2793 e = EDGE_PRED (guard_to, 0);
2794 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2795 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2797 /* Simply propagate profile info from guard_bb to guard_to which is
2798 a merge point of control flow. */
2799 guard_to->count = guard_bb->count;
2801 /* Scale probability of epilog loop back.
2802 FIXME: We should avoid scaling down and back up. Profile may
2803 get lost if we scale down to 0. */
2804 basic_block *bbs = get_loop_body (epilog);
2805 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2806 bbs[i]->count = bbs[i]->count.apply_scale
2807 (bbs[i]->count,
2808 bbs[i]->count.apply_probability
2809 (prob_vector));
2810 free (bbs);
2813 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2814 /* If the loop is peeled a non-zero constant number of times, niters now
2815 refers to orig_niters - prolog_peeling; it won't overflow even if
2816 orig_niters overflows. */
2817 niters_no_overflow |= (prolog_peeling > 0);
2818 vect_gen_vector_loop_niters (loop_vinfo, niters,
2819 niters_vector, step_vector,
2820 niters_no_overflow);
2821 if (!integer_onep (*step_vector))
2823 /* On exit from the loop we will have an easy way of calculating
2824 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2825 until then. */
2826 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2827 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2828 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2830 else
2831 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2832 &niters_vector_mult_vf);
2833 /* Update IVs of original loop as if they were advanced by
2834 niters_vector_mult_vf steps. */
2835 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2836 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2837 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2838 update_e);
2840 if (skip_epilog)
2842 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2843 niters, niters_vector_mult_vf);
2844 guard_bb = single_exit (loop)->dest;
2845 guard_to = split_edge (single_exit (epilog));
2846 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2847 skip_vector ? anchor : guard_bb,
2848 prob_epilog.invert (),
2849 irred_flag);
2850 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2851 single_exit (epilog));
2852 /* Only need to handle basic block before epilog loop if it's not
2853 the guard_bb, which is the case when skip_vector is true. */
2854 if (guard_bb != bb_before_epilog)
2856 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2858 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2860 scale_loop_profile (epilog, prob_epilog, 0);
2862 else
2863 slpeel_update_phi_nodes_for_lcssa (epilog);
2865 unsigned HOST_WIDE_INT bound;
2866 if (bound_scalar.is_constant (&bound))
2868 gcc_assert (bound != 0);
2869 /* -1 to convert loop iterations to latch iterations. */
2870 record_niter_bound (epilog, bound - 1, false, true);
2873 delete_update_ssa ();
2874 adjust_vec_debug_stmts ();
2875 scev_reset ();
2878 if (vect_epilogues)
2880 epilog->aux = epilogue_vinfo;
2881 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2883 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2885 /* We now must calculate NITERS, the number of iterations performed by the
2886 previous loop, and EPILOGUE_NITERS to be performed by the epilogue. */
2887 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2888 niters_prolog, niters_vector_mult_vf);
2890 /* If skip_vector we may skip the previous loop, so we insert a phi-node to
2891 determine whether we are coming from the previous vectorized loop
2892 using the update_e edge or the skip_vector basic block using the
2893 skip_e edge. */
2894 if (skip_vector)
2896 gcc_assert (update_e != NULL && skip_e != NULL);
2897 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2898 update_e->dest);
2899 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2900 gimple *stmt = gimple_build_assign (new_ssa, niters);
2901 gimple_stmt_iterator gsi;
2902 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2903 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2905 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2906 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2908 else
2910 gsi = gsi_last_bb (update_e->src);
2911 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2914 niters = new_ssa;
2915 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2916 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2917 UNKNOWN_LOCATION);
2918 niters = PHI_RESULT (new_phi);
2921 /* Subtract the number of iterations performed by the vectorized loop
2922 from the number of total iterations. */
2923 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2924 before_loop_niters,
2925 niters);
2927 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2928 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2929 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2930 epilogue_niters,
2931 build_one_cst (TREE_TYPE (epilogue_niters)));
2933 /* Set ADVANCE to the number of iterations performed by the previous
2934 loop and its prologue. */
2935 *advance = niters;
2937 /* Redo the peeling for niter analysis as the NITERs and alignment
2938 may have been updated to take the main loop into account. */
2939 determine_peel_for_niter (epilogue_vinfo);
2942 adjust_vec.release ();
2943 free_original_copy_tables ();
2945 return vect_epilogues ? epilog : NULL;
2948 /* Function vect_create_cond_for_niters_checks.
2950 Create a conditional expression that represents the run-time checks for
2951 loop's niter. The loop is guaranteed to terminate if the run-time
2952 checks hold.
2954 Input:
2955 COND_EXPR - input conditional expression. New conditions will be chained
2956 with logical AND operation. If it is NULL, the niters
2957 assumptions become the sole condition.
2958 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2959 assumptions to be checked.
2961 Output:
2962 COND_EXPR - conditional expression.
2964 The returned COND_EXPR is the conditional expression to be used in the
2965 if statement that controls which version of the loop gets executed at
2966 runtime. */
2968 static void
2969 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2971 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2973 if (*cond_expr)
2974 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2975 *cond_expr, part_cond_expr);
2976 else
2977 *cond_expr = part_cond_expr;
2980 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2981 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2983 static void
2984 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2986 if (*cond_expr)
2987 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2988 *cond_expr, part_cond_expr);
2989 else
2990 *cond_expr = part_cond_expr;
2993 /* Function vect_create_cond_for_align_checks.
2995 Create a conditional expression that represents the alignment checks for
2996 all of data references (array element references) whose alignment must be
2997 checked at runtime.
2999 Input:
3000 COND_EXPR - input conditional expression. New conditions will be chained
3001 with logical AND operation.
3002 LOOP_VINFO - two fields of the loop information are used.
3003 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3004 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3006 Output:
3007 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3008 expression.
3009 The returned value is the conditional expression to be used in the if
3010 statement that controls which version of the loop gets executed at runtime.
3012 The algorithm makes two assumptions:
3013 1) The number of bytes "n" in a vector is a power of 2.
3014 2) An address "a" is aligned if a%n is zero and that this
3015 test can be done as a&(n-1) == 0. For example, for 16
3016 byte vectors the test is a&0xf == 0. */
3018 static void
3019 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3020 tree *cond_expr,
3021 gimple_seq *cond_expr_stmt_list)
3023 vec<stmt_vec_info> may_misalign_stmts
3024 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3025 stmt_vec_info stmt_info;
3026 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3027 tree mask_cst;
3028 unsigned int i;
3029 tree int_ptrsize_type;
3030 char tmp_name[20];
3031 tree or_tmp_name = NULL_TREE;
3032 tree and_tmp_name;
3033 gimple *and_stmt;
3034 tree ptrsize_zero;
3035 tree part_cond_expr;
3037 /* Check that mask is one less than a power of 2, i.e., mask is
3038 all zeros followed by all ones. */
3039 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3041 int_ptrsize_type = signed_type_for (ptr_type_node);
3043 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3044 of the first vector of the i'th data reference. */
3046 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3048 gimple_seq new_stmt_list = NULL;
3049 tree addr_base;
3050 tree addr_tmp_name;
3051 tree new_or_tmp_name;
3052 gimple *addr_stmt, *or_stmt;
3053 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3054 bool negative = tree_int_cst_compare
3055 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3056 tree offset = negative
3057 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3059 /* create: addr_tmp = (int)(address_of_first_vector) */
3060 addr_base =
3061 vect_create_addr_base_for_vector_ref (loop_vinfo,
3062 stmt_info, &new_stmt_list,
3063 offset);
3064 if (new_stmt_list != NULL)
3065 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3067 sprintf (tmp_name, "addr2int%d", i);
3068 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3069 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3070 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3072 /* The addresses are ORed together. */
3074 if (or_tmp_name != NULL_TREE)
3076 /* create: or_tmp = or_tmp | addr_tmp */
3077 sprintf (tmp_name, "orptrs%d", i);
3078 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3079 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3080 or_tmp_name, addr_tmp_name);
3081 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3082 or_tmp_name = new_or_tmp_name;
3084 else
3085 or_tmp_name = addr_tmp_name;
3087 } /* end for i */
3089 mask_cst = build_int_cst (int_ptrsize_type, mask);
3091 /* create: and_tmp = or_tmp & mask */
3092 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3094 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3095 or_tmp_name, mask_cst);
3096 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3098 /* Make and_tmp the left operand of the conditional test against zero.
3099 If and_tmp has a nonzero bit then some address is unaligned. */
3100 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3101 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3102 and_tmp_name, ptrsize_zero);
3103 chain_cond_expr (cond_expr, part_cond_expr);
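/* Illustrative sketch (not part of GCC): the runtime alignment test built
   above, written as ordinary C for two data references.  The address bits
   are ORed together and tested against the mask, relying on the vector size
   being a power of two; the pointers and the mask value are hypothetical,
   and the cast assumes pointers fit in unsigned long.  */

static int
all_aligned_sketch (const void *a, const void *b, unsigned long mask)
{
  unsigned long bits = ((unsigned long) a | (unsigned long) b) & mask;
  return bits == 0;	/* True when both addresses are aligned.  */
}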
3106 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3107 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3108 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3109 and this new condition are true. Treat a null *COND_EXPR as "true". */
3111 static void
3112 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3114 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3115 unsigned int i;
3116 vec_object_pair *pair;
3117 FOR_EACH_VEC_ELT (pairs, i, pair)
3119 tree addr1 = build_fold_addr_expr (pair->first);
3120 tree addr2 = build_fold_addr_expr (pair->second);
3121 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3122 addr1, addr2);
3123 chain_cond_expr (cond_expr, part_cond_expr);
3127 /* Create an expression that is true when all lower-bound conditions for
3128 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3130 static void
3131 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3133 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3134 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3136 tree expr = lower_bounds[i].expr;
3137 tree type = unsigned_type_for (TREE_TYPE (expr));
3138 expr = fold_convert (type, expr);
3139 poly_uint64 bound = lower_bounds[i].min_value;
3140 if (!lower_bounds[i].unsigned_p)
3142 expr = fold_build2 (PLUS_EXPR, type, expr,
3143 build_int_cstu (type, bound - 1));
3144 bound += bound - 1;
3146 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3147 build_int_cstu (type, bound));
3148 chain_cond_expr (cond_expr, part_cond_expr);
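/* Illustrative sketch (not part of GCC): the biasing trick used above for
   signed expressions.  Adding MIN_VALUE - 1 and comparing against
   2 * MIN_VALUE - 1 as unsigned values checks |EXPR| >= MIN_VALUE with a
   single comparison: values in (-MIN_VALUE, MIN_VALUE) map onto
   [0, 2 * MIN_VALUE - 2] and fail, everything else passes.  The name and
   values are hypothetical; MIN_VALUE is assumed to be positive.  */

static int
lower_bound_check_sketch (long expr, unsigned long min_value)
{
  unsigned long biased = (unsigned long) expr + (min_value - 1);
  return biased >= 2 * min_value - 1;
}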
3152 /* Function vect_create_cond_for_alias_checks.
3154 Create a conditional expression that represents the run-time checks for
3155 overlapping of address ranges represented by a list of data references
3156 relations passed as input.
3158 Input:
3159 COND_EXPR - input conditional expression. New conditions will be chained
3160 with logical AND operation. If it is NULL, then the function
3161 is used to return the number of alias checks.
3162 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3163 to be checked.
3165 Output:
3166 COND_EXPR - conditional expression.
3168 The returned COND_EXPR is the conditional expression to be used in the if
3169 statement that controls which version of the loop gets executed at runtime.
3172 void
3173 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3175 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3176 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3178 if (comp_alias_ddrs.is_empty ())
3179 return;
3181 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3182 &comp_alias_ddrs, cond_expr);
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "created %u versioning for alias checks.\n",
3186 comp_alias_ddrs.length ());
3190 /* Function vect_loop_versioning.
3192 If the loop has data references that may or may not be aligned and/or
3193 has data reference relations whose independence was not proven then
3194 two versions of the loop need to be generated, one which is vectorized
3195 and one which isn't. A test is then generated to control which of the
3196 loops is executed. The test checks for the alignment of all of the
3197 data references that may or may not be aligned. An additional
3198 sequence of runtime tests is generated for each pair of DDRs whose
3199 independence was not proven. The vectorized version of the loop is
3200 executed only if both alias and alignment tests are passed.
3202 The test generated to check which version of the loop is executed
3203 is modified to also check for profitability as indicated by the
3204 cost model threshold TH.
3206 The versioning precondition(s) are placed in *COND_EXPR and
3207 *COND_EXPR_STMT_LIST. */
3209 class loop *
3210 vect_loop_versioning (loop_vec_info loop_vinfo,
3211 gimple *loop_vectorized_call)
3213 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3214 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3215 basic_block condition_bb;
3216 gphi_iterator gsi;
3217 gimple_stmt_iterator cond_exp_gsi;
3218 basic_block merge_bb;
3219 basic_block new_exit_bb;
3220 edge new_exit_e, e;
3221 gphi *orig_phi, *new_phi;
3222 tree cond_expr = NULL_TREE;
3223 gimple_seq cond_expr_stmt_list = NULL;
3224 tree arg;
3225 profile_probability prob = profile_probability::likely ();
3226 gimple_seq gimplify_stmt_list = NULL;
3227 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3228 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3229 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3230 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3231 poly_uint64 versioning_threshold
3232 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3233 tree version_simd_if_cond
3234 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3235 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3237 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3238 && !ordered_p (th, versioning_threshold))
3239 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3240 build_int_cst (TREE_TYPE (scalar_loop_iters),
3241 th - 1));
3242 if (maybe_ne (versioning_threshold, 0U))
3244 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3245 build_int_cst (TREE_TYPE (scalar_loop_iters),
3246 versioning_threshold - 1));
3247 if (cond_expr)
3248 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3249 expr, cond_expr);
3250 else
3251 cond_expr = expr;
3254 if (version_niter)
3255 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3257 if (cond_expr)
3258 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3259 &cond_expr_stmt_list,
3260 is_gimple_condexpr, NULL_TREE);
3262 if (version_align)
3263 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3264 &cond_expr_stmt_list);
3266 if (version_alias)
3268 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3269 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3270 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3273 if (version_simd_if_cond)
3275 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3276 if (flag_checking)
3277 if (basic_block bb
3278 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3279 gcc_assert (bb != loop->header
3280 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3281 && (scalar_loop == NULL
3282 || (bb != scalar_loop->header
3283 && dominated_by_p (CDI_DOMINATORS,
3284 scalar_loop->header, bb))));
3285 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3286 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3287 version_simd_if_cond, zero);
3288 if (cond_expr)
3289 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3290 c, cond_expr);
3291 else
3292 cond_expr = c;
3293 if (dump_enabled_p ())
3294 dump_printf_loc (MSG_NOTE, vect_location,
3295 "created versioning for simd if condition check.\n");
3298 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3299 &gimplify_stmt_list,
3300 is_gimple_condexpr, NULL_TREE);
3301 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3303 /* Compute the outermost loop in which cond_expr and
3304 cond_expr_stmt_list are invariant. */
3305 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3306 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3307 !gsi_end_p (gsi); gsi_next (&gsi))
3309 gimple *stmt = gsi_stmt (gsi);
3310 update_stmt (stmt);
3311 ssa_op_iter iter;
3312 use_operand_p use_p;
3313 basic_block def_bb;
3314 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3315 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3316 && flow_bb_inside_loop_p (outermost, def_bb))
3317 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3320 /* Search for the outermost loop we can version. Avoid versioning
3321 non-perfect nests, but allow if-conversion-versioned loops inside. */
3322 class loop *loop_to_version = loop;
3323 if (flow_loop_nested_p (outermost, loop))
3325 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_NOTE, vect_location,
3327 "trying to apply versioning to outer loop %d\n",
3328 outermost->num);
3329 if (outermost->num == 0)
3330 outermost = superloop_at_depth (loop, 1);
3331 /* And avoid applying versioning on non-perfect nests. */
3332 while (loop_to_version != outermost
3333 && single_exit (loop_outer (loop_to_version))
3334 && (!loop_outer (loop_to_version)->inner->next
3335 || vect_loop_vectorized_call (loop_to_version))
3336 && (!loop_outer (loop_to_version)->inner->next
3337 || !loop_outer (loop_to_version)->inner->next->next))
3338 loop_to_version = loop_outer (loop_to_version);
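/* E.g. for a perfect nest

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         a[i][j] = b[i][j] + c[i][j];

   whose runtime checks only use values invariant in the i-loop, the
   walk above moves LOOP_TO_VERSION out to the i-loop, so the guard is
   evaluated once for the whole nest instead of once per outer
   iteration (an illustration of the intent, not of the exact
   conditions involved).  */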
3341 /* Apply versioning. If there is already a scalar version created by
3342 if-conversion, re-use that. Note that we cannot re-use the copy of
3343 an if-converted outer loop when vectorizing only the inner loop. */
3344 gcond *cond;
3345 if ((!loop_to_version->inner || loop == loop_to_version)
3346 && loop_vectorized_call)
3348 gcc_assert (scalar_loop);
3349 condition_bb = gimple_bb (loop_vectorized_call);
3350 cond = as_a <gcond *> (last_stmt (condition_bb));
3351 gimple_cond_set_condition_from_tree (cond, cond_expr);
3352 update_stmt (cond);
3354 if (cond_expr_stmt_list)
3356 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3357 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3358 GSI_SAME_STMT);
3361 /* if-conversion uses profile_probability::always () for both paths,
3362 so reset the probabilities of both paths appropriately. */
3363 edge te, fe;
3364 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3365 te->probability = prob;
3366 fe->probability = prob.invert ();
3367 /* We can scale the counts of the versioned loop immediately but have
3368 to postpone scaling the scalar loop because we re-use it during peeling. */
3369 scale_loop_frequencies (loop_to_version, te->probability);
3370 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3372 nloop = scalar_loop;
3373 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE, vect_location,
3375 "reusing %sloop version created by if conversion\n",
3376 loop_to_version != loop ? "outer " : "");
3378 else
3380 if (loop_to_version != loop
3381 && dump_enabled_p ())
3382 dump_printf_loc (MSG_NOTE, vect_location,
3383 "applying loop versioning to outer loop %d\n",
3384 loop_to_version->num);
3386 initialize_original_copy_tables ();
3387 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3388 prob, prob.invert (), prob, prob.invert (), true);
3389 gcc_assert (nloop);
3390 nloop = get_loop_copy (loop);
3392 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3393 reap them otherwise, and they also refer to the original
3394 loops. */
3395 class loop *l = loop;
3396 while (gimple *call = vect_loop_vectorized_call (l))
3398 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3399 fold_loop_internal_call (call, boolean_false_node);
3400 l = loop_outer (l);
3402 free_original_copy_tables ();
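/* For illustration, an IFN_LOOP_VECTORIZED guard in GIMPLE looks
   roughly like

     _1 = .LOOP_VECTORIZED (3, 4);
     if (_1 != 0) goto <candidate loop>; else goto <fallback loop>;

   and folding the copied call to boolean_false_node makes the copy
   always take the fallback path, so later cleanup can delete the dead
   candidate (a conceptual sketch, not taken from an actual dump).  */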
3404 if (cond_expr_stmt_list)
3406 cond_exp_gsi = gsi_last_bb (condition_bb);
3407 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3408 GSI_SAME_STMT);
3411 /* Loop versioning violates an assumption we try to maintain during
3412 vectorization - that the loop exit block has a single predecessor.
3413 After versioning, the exit block of both loop versions is the same
3414 basic block (i.e. it has two predecessors). To simplify the
3415 following transformations in the vectorizer, we fix this situation
3416 here by adding a new (empty) block on the exit edge of the loop,
3417 with the proper loop-exit phis to maintain loop-closed form.
3418 If loop versioning wasn't done from loop, but from scalar_loop instead,
3419 merge_bb will already have just a single predecessor. */
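/* Schematically, the fix-up below turns

     to-be-vectorized loop exit ---+
                                   +--> merge_bb   (two predecessors)
     scalar loop exit -------------+

   into

     to-be-vectorized loop exit --> new_exit_bb --> merge_bb

   with new PHIs in NEW_EXIT_BB carrying the loop-closed SSA values
   (a sketch of the CFG shape only).  */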
3421 merge_bb = single_exit (loop_to_version)->dest;
3422 if (EDGE_COUNT (merge_bb->preds) >= 2)
3424 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3425 new_exit_bb = split_edge (single_exit (loop_to_version));
3426 new_exit_e = single_exit (loop_to_version);
3427 e = EDGE_SUCC (new_exit_bb, 0);
3429 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3430 gsi_next (&gsi))
3432 tree new_res;
3433 orig_phi = gsi.phi ();
3434 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3435 new_phi = create_phi_node (new_res, new_exit_bb);
3436 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3437 add_phi_arg (new_phi, arg, new_exit_e,
3438 gimple_phi_arg_location_from_edge (orig_phi, e));
3439 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3443 update_ssa (TODO_update_ssa);
3446 if (version_niter)
3448 /* The versioned loop could be infinite; we need to clear the existing
3449 niter information, which was copied from the original loop. */
3450 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3451 vect_free_loop_info_assumptions (nloop);
3452 /* And set the constraint LOOP_C_INFINITE for the niter analyzer. */
3453 loop_constraint_set (loop, LOOP_C_INFINITE);
3456 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3457 && dump_enabled_p ())
3459 if (version_alias)
3460 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3461 vect_location,
3462 "loop versioned for vectorization because of "
3463 "possible aliasing\n");
3464 if (version_align)
3465 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3466 vect_location,
3467 "loop versioned for vectorization to enhance "
3468 "alignment\n");
3472 return nloop;
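/* Usage note (illustrative): the remarks above are what a user sees
   through the opt-info machinery, e.g. compiling with

     gcc -O3 -fopt-info-vec-optimized foo.c

   may print "loop versioned for vectorization because of possible
   aliasing" for a loop that reaches this point with VERSION_ALIAS
   set.  */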