1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
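/* For example, with an 8-element vector, OFFSET 0 asks for the low
interleave selector { 0, 8, 1, 9, 2, 10, 3, 11 } and OFFSET 1 for the
high selector { 4, 12, 5, 13, 6, 14, 7, 15 }; only the first three index
pairs are pushed explicitly, the stepped encoding extends each series to
the full vector length.  */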
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
360 gimple_seq_add_stmt (seq, stmt);
362 return true;
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
382 return true;
384 return false;
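/* For example, in the interleaving case a single source mask
<a,b,c,d,e,f,g,h> gives dest[2j] = <a,a,b,b,c,c,d,d> (low interleave of the
mask with itself) and dest[2j+1] = <e,e,f,f,g,g,h,h> (high interleave), so
each source element ends up covering two destination positions.  */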
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
400 It is known that:
402 NITERS * RGC->max_nscalars_per_iter
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
407 VF * RGC->max_nscalars_per_iter
409 might overflow before hitting a value above:
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGC. */
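/* Worked example, for illustration only: with a single rgroup, VF = 4,
RGC->max_nscalars_per_iter = 1, NITERS = 10 and no NITERS_SKIP, the IV takes
the values 0, 4, 8, ... and the WHILE_ULT comparisons produce the masks
{1,1,1,1}, {1,1,1,1}, {1,1,0,0} for the three vector iterations.  With
NITERS_SKIP = 2 the limit becomes 12 and the first mask is instead
{0,0,1,1}.  */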
416 static tree
417 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_controls *rgc, tree niters,
421 tree niters_skip, bool might_wrap_p)
423 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
425 tree ctrl_type = rgc->type;
426 unsigned int nscalars_per_iter = rgc->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
439 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
440 these multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
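/* Numeric check of the reassociation above: with NSCALARS_TOTAL = 100,
NSCALARS_SKIP = 2 and NSCALARS_STEP = 8 both forms give 94; with
NSCALARS_TOTAL = 3 both saturate to 0.  The equality relies on
NSCALARS_SKIP < NSCALARS_STEP.  */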
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
524 else
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
535 first_limit = test_limit;
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
544 /* Provide a definition of each control in the group. */
545 tree next_ctrl = NULL_TREE;
546 tree ctrl;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
550 /* Previous controls will cover BIAS scalars. This control covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_ctrl * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
556 /* See whether the first iteration of the vector loop is known
557 to have a full control. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_ctrl));
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each control
565 and adjust the bound down by BIAS. */
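/* For example, with two 4-element controls and TEST_LIMIT = 6, control 0
keeps the limit 6 while control 1 uses MAX (6, 4) - 4 = 2, so its WHILE_ULT
test enables only the first two elements, giving 6 active scalars in
total.  */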
566 tree this_test_limit = test_limit;
567 if (i != 0)
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
577 /* Create the initial control. First include all scalars that
578 are within the loop limit. */
579 tree init_ctrl = NULL_TREE;
580 if (!first_iteration_full)
582 tree start, end;
583 if (first_limit == test_limit)
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
592 else
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this control. */
597 start = bias_tree;
598 end = first_limit;
601 init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_ctrl, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
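/* For example, with BIAS 0, NSCALARS_SKIP = 3 and a 4-element control,
vect_gen_while_not produces {0,0,0,1}, clearing the three skipped elements
from the initial control.  */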
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
614 bias_tree, nscalars_skip);
615 if (init_ctrl)
616 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
617 init_ctrl, unskipped_mask);
618 else
619 init_ctrl = unskipped_mask;
622 if (!init_ctrl)
623 /* First iteration is full. */
624 init_ctrl = build_minus_one_cst (ctrl_type);
626 /* Get the control value for the next iteration of the loop. */
627 next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
631 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
633 return next_ctrl;
636 /* Set up the iteration condition and rgroup controls for LOOP, given
637 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
638 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
639 the number of iterations of the original scalar loop that should be
640 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
641 for vect_set_loop_condition.
643 Insert the branch-back condition before LOOP_COND_GSI and return the
644 final gcond. */
646 static gcond *
647 vect_set_loop_condition_partial_vectors (class loop *loop,
648 loop_vec_info loop_vinfo, tree niters,
649 tree final_iv, bool niters_maybe_zero,
650 gimple_stmt_iterator loop_cond_gsi)
652 gimple_seq preheader_seq = NULL;
653 gimple_seq header_seq = NULL;
655 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
656 unsigned int compare_precision = TYPE_PRECISION (compare_type);
657 tree orig_niters = niters;
659 /* Type of the initial value of NITERS. */
660 tree ni_actual_type = TREE_TYPE (niters);
661 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
662 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
664 /* Convert NITERS to the same size as the compare. */
665 if (compare_precision > ni_actual_precision
666 && niters_maybe_zero)
668 /* We know that there is always at least one iteration, so if the
669 count is zero then it must have wrapped. Cope with this by
670 subtracting 1 before the conversion and adding 1 to the result. */
671 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
672 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
673 niters, build_minus_one_cst (ni_actual_type));
674 niters = gimple_convert (&preheader_seq, compare_type, niters);
675 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
676 niters, build_one_cst (compare_type));
678 else
679 niters = gimple_convert (&preheader_seq, compare_type, niters);
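/* For example, if NITERS is an 8-bit value and the scalar loop executed 256
times, NITERS reads as 0: subtracting 1 gives 255, widening to the compare
type keeps 255, and adding 1 back yields the intended 256.  */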
681 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
683 /* Iterate over all the rgroups and fill in their controls. We could use
684 the first control from any rgroup for the loop condition; here we
685 arbitrarily pick the last. */
686 tree test_ctrl = NULL_TREE;
687 rgroup_controls *rgc;
688 unsigned int i;
689 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
690 FOR_EACH_VEC_ELT (*masks, i, rgc)
691 if (!rgc->controls.is_empty ())
693 /* First try using permutes. This adds a single vector
694 instruction to the loop for each mask, but needs no extra
695 loop invariants or IVs. */
696 unsigned int nmasks = i + 1;
697 if ((nmasks & 1) == 0)
699 rgroup_controls *half_rgc = &(*masks)[nmasks / 2 - 1];
700 if (!half_rgc->controls.is_empty ()
701 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
702 continue;
705 /* See whether zero-based IV would ever generate all-false masks
706 before wrapping around. */
707 bool might_wrap_p
708 = (iv_limit == -1
709 || (wi::min_precision (iv_limit * rgc->max_nscalars_per_iter,
710 UNSIGNED)
711 > compare_precision));
713 /* Set up all controls for this group. */
714 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
715 &preheader_seq,
716 loop_cond_gsi, rgc,
717 niters, niters_skip,
718 might_wrap_p);
721 /* Emit all accumulated statements. */
722 add_preheader_seq (loop, preheader_seq);
723 add_header_seq (loop, header_seq);
725 /* Get a boolean result that tells us whether to iterate. */
726 edge exit_edge = single_exit (loop);
727 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
728 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
729 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
730 NULL_TREE, NULL_TREE);
731 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
733 /* The loop iterates (NITERS - 1) / VF + 1 times.
734 Subtract one from this to get the latch count. */
735 tree step = build_int_cst (compare_type,
736 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
737 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
738 build_minus_one_cst (compare_type));
739 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
740 niters_minus_one, step);
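/* For example, NITERS = 10 and VF = 4 give nb_iterations = (10 - 1) / 4 = 2
latch iterations, i.e. the vector body executes 3 times.  */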
742 if (final_iv)
744 gassign *assign = gimple_build_assign (final_iv, orig_niters);
745 gsi_insert_on_edge_immediate (single_exit (loop), assign);
748 return cond_stmt;
751 /* Like vect_set_loop_condition, but handle the case in which the vector
752 loop handles exactly VF scalars per iteration. */
754 static gcond *
755 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
756 tree final_iv, bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
776 /* In this case we can use a simple 0-based IV:
778 A:
779 x = 0;
780 do
781 {
782 ...
783 x += 1;
784 }
785 while (x < NITERS); */
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
790 else
792 /* The following works for all values of NITERS except 0:
794 B:
795 x = 0;
796 do
797 {
798 ...
799 x += STEP;
800 }
801 while (x <= NITERS - STEP);
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
816 C:
817 x = -1;
818 do
819 {
820 ...
821 x += STEP;
822 }
823 while (x < NITERS - STEP);
825 In both cases the loop limit is NITERS - STEP. */
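/* For example, in case B with NITERS = 12 and STEP = 4 the limit is 8 and x
takes the values 4, 8, 12 after the increments, so the body runs three times
and the loop exits once x exceeds 8.  Case C behaves the same except that x
starts at -1 and is tested with '<'.  */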
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
834 if (niters_maybe_zero)
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
840 else
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
874 if (final_iv)
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
881 return cond_stmt;
884 /* If we're using fully-masked loops, make LOOP iterate:
886 N == (NITERS - 1) / STEP + 1
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
892 If we're not using fully-masked loops, make LOOP iterate:
894 N == (NITERS - STEP) / STEP + 1
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
915 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
917 niters, final_iv,
918 niters_maybe_zero,
919 loop_cond_gsi);
920 else
921 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
922 niters_maybe_zero,
923 loop_cond_gsi);
925 /* Remove old loop exit test. */
926 stmt_vec_info orig_cond_info;
927 if (loop_vinfo
928 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
929 loop_vinfo->remove_stmt (orig_cond_info);
930 else
931 gsi_remove (&loop_cond_gsi, true);
933 if (dump_enabled_p ())
934 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
935 cond_stmt);
938 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
939 For all PHI arguments in FROM->dest and TO->dest from those
940 edges ensure that each TO->dest PHI argument has its current_def
941 set to the current_def of the corresponding FROM->dest argument. */
943 static void
944 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
946 gimple_stmt_iterator gsi_from, gsi_to;
948 for (gsi_from = gsi_start_phis (from->dest),
949 gsi_to = gsi_start_phis (to->dest);
950 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
952 gimple *from_phi = gsi_stmt (gsi_from);
953 gimple *to_phi = gsi_stmt (gsi_to);
954 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
955 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
956 if (virtual_operand_p (from_arg))
958 gsi_next (&gsi_from);
959 continue;
961 if (virtual_operand_p (to_arg))
963 gsi_next (&gsi_to);
964 continue;
966 if (TREE_CODE (from_arg) != SSA_NAME)
967 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
968 else if (TREE_CODE (to_arg) == SSA_NAME
969 && from_arg != to_arg)
971 if (get_current_def (to_arg) == NULL_TREE)
973 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
974 TREE_TYPE (get_current_def
975 (from_arg))));
976 set_current_def (to_arg, get_current_def (from_arg));
979 gsi_next (&gsi_from);
980 gsi_next (&gsi_to);
983 gphi *from_phi = get_virtual_phi (from->dest);
984 gphi *to_phi = get_virtual_phi (to->dest);
985 if (from_phi)
986 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
987 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
991 /* Given LOOP this function generates a new copy of it and puts it
992 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
993 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
994 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
995 entry or exit of LOOP. */
997 class loop *
998 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
999 class loop *scalar_loop, edge e)
1001 class loop *new_loop;
1002 basic_block *new_bbs, *bbs, *pbbs;
1003 bool at_exit;
1004 bool was_imm_dom;
1005 basic_block exit_dest;
1006 edge exit, new_exit;
1007 bool duplicate_outer_loop = false;
1009 exit = single_exit (loop);
1010 at_exit = (e == exit);
1011 if (!at_exit && e != loop_preheader_edge (loop))
1012 return NULL;
1014 if (scalar_loop == NULL)
1015 scalar_loop = loop;
1017 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1018 pbbs = bbs + 1;
1019 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1020 /* Allow duplication of outer loops. */
1021 if (scalar_loop->inner)
1022 duplicate_outer_loop = true;
1023 /* Check whether duplication is possible. */
1024 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1026 free (bbs);
1027 return NULL;
1030 /* Generate new loop structure. */
1031 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1032 duplicate_subloops (scalar_loop, new_loop);
1034 exit_dest = exit->dest;
1035 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1036 exit_dest) == loop->header ?
1037 true : false);
1039 /* Also copy the pre-header, this avoids jumping through hoops to
1040 duplicate the loop entry PHI arguments. Create an empty
1041 pre-header unconditionally for this. */
1042 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1043 edge entry_e = single_pred_edge (preheader);
1044 bbs[0] = preheader;
1045 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1047 exit = single_exit (scalar_loop);
1048 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1049 &exit, 1, &new_exit, NULL,
1050 at_exit ? loop->latch : e->src, true);
1051 exit = single_exit (loop);
1052 basic_block new_preheader = new_bbs[0];
1054 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1056 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1057 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1058 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1060 if (scalar_loop != loop)
1062 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1063 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1064 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1065 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1066 header) to have current_def set, so copy them over. */
1067 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1068 exit);
1069 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch, 0),
1071 EDGE_SUCC (loop->latch, 0));
1074 if (at_exit) /* Add the loop copy at exit. */
1076 if (scalar_loop != loop)
1078 gphi_iterator gsi;
1079 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1081 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1082 gsi_next (&gsi))
1084 gphi *phi = gsi.phi ();
1085 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1086 location_t orig_locus
1087 = gimple_phi_arg_location_from_edge (phi, e);
1089 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1092 redirect_edge_and_branch_force (e, new_preheader);
1093 flush_pending_stmts (e);
1094 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1095 if (was_imm_dom || duplicate_outer_loop)
1096 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1098 /* And remove the non-necessary forwarder again. Keep the other
1099 one so we have a proper pre-header for the loop at the exit edge. */
1100 redirect_edge_pred (single_succ_edge (preheader),
1101 single_pred (preheader));
1102 delete_basic_block (preheader);
1103 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104 loop_preheader_edge (scalar_loop)->src);
1106 else /* Add the copy at entry. */
1108 if (scalar_loop != loop)
1110 /* Remove the non-necessary forwarder of scalar_loop again. */
1111 redirect_edge_pred (single_succ_edge (preheader),
1112 single_pred (preheader));
1113 delete_basic_block (preheader);
1114 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1115 loop_preheader_edge (scalar_loop)->src);
1116 preheader = split_edge (loop_preheader_edge (loop));
1117 entry_e = single_pred_edge (preheader);
1120 redirect_edge_and_branch_force (entry_e, new_preheader);
1121 flush_pending_stmts (entry_e);
1122 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1124 redirect_edge_and_branch_force (new_exit, preheader);
1125 flush_pending_stmts (new_exit);
1126 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1128 /* And remove the non-necessary forwarder again. Keep the other
1129 one so we have a proper pre-header for the loop at the exit edge. */
1130 redirect_edge_pred (single_succ_edge (new_preheader),
1131 single_pred (new_preheader));
1132 delete_basic_block (new_preheader);
1133 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1134 loop_preheader_edge (new_loop)->src);
1137 if (scalar_loop != loop)
1139 /* Update new_loop->header PHIs, so that on the preheader
1140 edge they are the ones from loop rather than scalar_loop. */
1141 gphi_iterator gsi_orig, gsi_new;
1142 edge orig_e = loop_preheader_edge (loop);
1143 edge new_e = loop_preheader_edge (new_loop);
1145 for (gsi_orig = gsi_start_phis (loop->header),
1146 gsi_new = gsi_start_phis (new_loop->header);
1147 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1148 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1150 gphi *orig_phi = gsi_orig.phi ();
1151 gphi *new_phi = gsi_new.phi ();
1152 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1153 location_t orig_locus
1154 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1156 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1160 free (new_bbs);
1161 free (bbs);
1163 checking_verify_dominators (CDI_DOMINATORS);
1165 return new_loop;
1169 /* Given the condition expression COND, put it as the last statement of
1170 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1171 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1172 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1173 new edge as irreducible if IRREDUCIBLE_P is true. */
1175 static edge
1176 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1177 basic_block guard_to, basic_block dom_bb,
1178 profile_probability probability, bool irreducible_p)
1180 gimple_stmt_iterator gsi;
1181 edge new_e, enter_e;
1182 gcond *cond_stmt;
1183 gimple_seq gimplify_stmt_list = NULL;
1185 enter_e = EDGE_SUCC (guard_bb, 0);
1186 enter_e->flags &= ~EDGE_FALLTHRU;
1187 enter_e->flags |= EDGE_FALSE_VALUE;
1188 gsi = gsi_last_bb (guard_bb);
1190 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1191 NULL_TREE);
1192 if (gimplify_stmt_list)
1193 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1195 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1196 gsi = gsi_last_bb (guard_bb);
1197 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1199 /* Add new edge to connect guard block to the merge/loop-exit block. */
1200 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1202 new_e->probability = probability;
1203 if (irreducible_p)
1204 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1206 enter_e->probability = probability.invert ();
1207 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1209 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1210 if (enter_e->dest->loop_father->header == enter_e->dest)
1211 split_edge (enter_e);
1213 return new_e;
1217 /* This function verifies that the following restrictions apply to LOOP:
1218 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1219 for an innermost loop, or of 5 basic blocks for an outer loop.
1220 (2) it is single entry, single exit
1221 (3) its exit condition is the last stmt in the header
1222 (4) E is the entry/exit edge of LOOP. */
1225 bool
1226 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1228 edge exit_e = single_exit (loop);
1229 edge entry_e = loop_preheader_edge (loop);
1230 gcond *orig_cond = get_loop_exit_condition (loop);
1231 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1232 unsigned int num_bb = loop->inner? 5 : 2;
1234 /* All loops have an outer scope; the only case loop->outer is NULL is for
1235 the function itself. */
1236 if (!loop_outer (loop)
1237 || loop->num_nodes != num_bb
1238 || !empty_block_p (loop->latch)
1239 || !single_exit (loop)
1240 /* Verify that new loop exit condition can be trivially modified. */
1241 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1242 || (e != exit_e && e != entry_e))
1243 return false;
1245 return true;
1248 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1249 in the exit bb and rename all the uses after the loop. This simplifies
1250 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1251 (but normally loop closed SSA form doesn't require virtual PHIs to be
1252 in the same form). Doing this early simplifies the checking what
1253 uses should be renamed.
1255 If we create a new phi after the loop, return the definition that
1256 applies on entry to the loop, otherwise return null. */
1258 static tree
1259 create_lcssa_for_virtual_phi (class loop *loop)
1261 gphi_iterator gsi;
1262 edge exit_e = single_exit (loop);
1264 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1265 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1267 gphi *phi = gsi.phi ();
1268 for (gsi = gsi_start_phis (exit_e->dest);
1269 !gsi_end_p (gsi); gsi_next (&gsi))
1270 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1271 break;
1272 if (gsi_end_p (gsi))
1274 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1275 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1276 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1277 imm_use_iterator imm_iter;
1278 gimple *stmt;
1279 use_operand_p use_p;
1281 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1282 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1283 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1284 gimple_phi_set_result (new_phi, new_vop);
1285 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1286 if (stmt != new_phi
1287 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1288 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1289 SET_USE (use_p, new_vop);
1291 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1293 break;
1295 return NULL_TREE;
1298 /* Function vect_get_loop_location.
1300 Extract the location of the loop in the source code.
1301 If the loop is not well formed for vectorization, an estimated
1302 location is calculated.
1303 Return the loop location on success and an empty location if not. */
1305 dump_user_location_t
1306 find_loop_location (class loop *loop)
1308 gimple *stmt = NULL;
1309 basic_block bb;
1310 gimple_stmt_iterator si;
1312 if (!loop)
1313 return dump_user_location_t ();
1315 stmt = get_loop_exit_condition (loop);
1317 if (stmt
1318 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1319 return stmt;
1321 /* If we got here the loop is probably not "well formed",
1322 try to estimate the loop location */
1324 if (!loop->header)
1325 return dump_user_location_t ();
1327 bb = loop->header;
1329 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1331 stmt = gsi_stmt (si);
1332 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1333 return stmt;
1336 return dump_user_location_t ();
1339 /* Return true if the phi described by STMT_INFO defines an IV of the
1340 loop to be vectorized. */
1342 static bool
1343 iv_phi_p (stmt_vec_info stmt_info)
1345 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1346 if (virtual_operand_p (PHI_RESULT (phi)))
1347 return false;
1349 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1350 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1351 return false;
1353 return true;
1356 /* Function vect_can_advance_ivs_p
1358 In case the number of iterations that LOOP iterates is unknown at compile
1359 time, an epilog loop will be generated, and the loop induction variables
1360 (IVs) will be "advanced" to the value they are supposed to take just before
1361 the epilog loop. Here we check that the access function of the loop IVs
1362 and the expression that represents the loop bound are simple enough.
1363 These restrictions will be relaxed in the future. */
1365 bool
1366 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1368 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1369 basic_block bb = loop->header;
1370 gphi_iterator gsi;
1372 /* Analyze phi functions of the loop header. */
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1376 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1378 tree evolution_part;
1380 gphi *phi = gsi.phi ();
1381 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1382 if (dump_enabled_p ())
1383 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1384 phi_info->stmt);
1386 /* Skip virtual phi's. The data dependences that are associated with
1387 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1389 Skip reduction phis. */
1390 if (!iv_phi_p (phi_info))
1392 if (dump_enabled_p ())
1393 dump_printf_loc (MSG_NOTE, vect_location,
1394 "reduc or virtual phi. skip.\n");
1395 continue;
1398 /* Analyze the evolution function. */
1400 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1401 if (evolution_part == NULL_TREE)
1403 if (dump_enabled_p ())
1404 dump_printf (MSG_MISSED_OPTIMIZATION,
1405 "No access function or evolution.\n");
1406 return false;
1409 /* FORNOW: We do not transform initial conditions of IVs
1410 whose evolution functions are not invariant in the loop. */
1412 if (!expr_invariant_in_loop_p (loop, evolution_part))
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1416 "evolution not invariant in loop.\n");
1417 return false;
1420 /* FORNOW: We do not transform initial conditions of IVs
1421 whose evolution functions are a polynomial of degree >= 2. */
1423 if (tree_is_chrec (evolution_part))
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1427 "evolution is chrec.\n");
1428 return false;
1432 return true;
1436 /* Function vect_update_ivs_after_vectorizer.
1438 "Advance" the induction variables of LOOP to the value they should take
1439 after the execution of LOOP. This is currently necessary because the
1440 vectorizer does not handle induction variables that are used after the
1441 loop. Such a situation occurs when the last iterations of LOOP are
1442 peeled, because:
1443 1. We introduced new uses after LOOP for IVs that were not originally used
1444 after LOOP: the IVs of LOOP are now used by an epilog loop.
1445 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1446 times, whereas the loop IVs should be bumped N times.
1448 Input:
1449 - LOOP - a loop that is going to be vectorized. The last few iterations
1450 of LOOP were peeled.
1451 - NITERS - the number of iterations that LOOP executes (before it is
1452 vectorized). i.e, the number of times the ivs should be bumped.
1453 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1454 coming out from LOOP on which there are uses of the LOOP ivs
1455 (this is the path from LOOP->exit to epilog_loop->preheader).
1457 The new definitions of the ivs are placed in LOOP->exit.
1458 The phi args associated with the edge UPDATE_E in the bb
1459 UPDATE_E->dest are updated accordingly.
1461 Assumption 1: Like the rest of the vectorizer, this function assumes
1462 a single loop exit that has a single predecessor.
1464 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1465 organized in the same order.
1467 Assumption 3: The access function of the ivs is simple enough (see
1468 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1470 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1471 coming out of LOOP on which the ivs of LOOP are used (this is the path
1472 that leads to the epilog loop; other paths skip the epilog loop). This
1473 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1474 needs to have its phis updated. */
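/* For example, an IV with initial value 0 and step 2 in a loop with
NITERS = 10 is advanced to 0 + 10 * 2 = 20; that value is used for the
corresponding PHI argument on UPDATE_E.  */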
1477 static void
1478 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1479 tree niters, edge update_e)
1481 gphi_iterator gsi, gsi1;
1482 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1483 basic_block update_bb = update_e->dest;
1484 basic_block exit_bb = single_exit (loop)->dest;
1486 /* Make sure there exists a single-predecessor exit bb: */
1487 gcc_assert (single_pred_p (exit_bb));
1488 gcc_assert (single_succ_edge (exit_bb) == update_e);
1490 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1491 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1492 gsi_next (&gsi), gsi_next (&gsi1))
1494 tree init_expr;
1495 tree step_expr, off;
1496 tree type;
1497 tree var, ni, ni_name;
1498 gimple_stmt_iterator last_gsi;
1500 gphi *phi = gsi.phi ();
1501 gphi *phi1 = gsi1.phi ();
1502 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1503 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_NOTE, vect_location,
1505 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1507 /* Skip reduction and virtual phis. */
1508 if (!iv_phi_p (phi_info))
1510 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_NOTE, vect_location,
1512 "reduc or virtual phi. skip.\n");
1513 continue;
1516 type = TREE_TYPE (gimple_phi_result (phi));
1517 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1518 step_expr = unshare_expr (step_expr);
1520 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1521 of degree >= 2 or exponential. */
1522 gcc_assert (!tree_is_chrec (step_expr));
1524 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1526 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1527 fold_convert (TREE_TYPE (step_expr), niters),
1528 step_expr);
1529 if (POINTER_TYPE_P (type))
1530 ni = fold_build_pointer_plus (init_expr, off);
1531 else
1532 ni = fold_build2 (PLUS_EXPR, type,
1533 init_expr, fold_convert (type, off));
1535 var = create_tmp_var (type, "tmp");
1537 last_gsi = gsi_last_bb (exit_bb);
1538 gimple_seq new_stmts = NULL;
1539 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1540 /* Exit_bb shouldn't be empty. */
1541 if (!gsi_end_p (last_gsi))
1542 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1543 else
1544 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1546 /* Fix phi expressions in the successor bb. */
1547 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1551 /* Return a gimple value containing the misalignment (measured in vector
1552 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1553 it is away from a perfectly aligned address. Add any new statements
1554 to SEQ. */
1556 static tree
1557 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1559 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1560 stmt_vec_info stmt_info = dr_info->stmt;
1561 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1563 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1564 unsigned HOST_WIDE_INT target_align_c;
1565 tree target_align_minus_1;
1567 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1568 size_zero_node) < 0;
1569 tree offset = (negative
1570 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1571 : size_zero_node);
1572 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1573 stmt_info, seq,
1574 offset);
1575 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1576 if (target_align.is_constant (&target_align_c))
1577 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1578 else
1580 tree vla = build_int_cst (type, target_align);
1581 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1582 fold_build2 (MINUS_EXPR, type,
1583 build_int_cst (type, 0), vla));
1584 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1585 build_int_cst (type, 1));
1588 HOST_WIDE_INT elem_size
1589 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1590 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1592 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1593 tree int_start_addr = fold_convert (type, start_addr);
1594 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1595 target_align_minus_1);
1597 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1598 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1599 elem_size_log);
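/* For example, an address ending in 0x1c with a 32-byte target alignment
and 4-byte elements gives misalign_in_bytes = 0x1c & 31 = 28 and
misalign_in_elems = 28 >> 2 = 7.  */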
1601 return misalign_in_elems;
1604 /* Function vect_gen_prolog_loop_niters
1606 Generate the number of iterations which should be peeled as prolog for the
1607 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1608 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1609 As a result, after the execution of this loop, the data reference DR will
1610 refer to an aligned location. The following computation is generated:
1612 If the misalignment of DR is known at compile time:
1613 addr_mis = int mis = DR_MISALIGNMENT (dr);
1614 Else, compute address misalignment in bytes:
1615 addr_mis = addr & (target_align - 1)
1617 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1619 (elem_size = element type size; an element is the scalar element whose type
1620 is the inner type of the vectype)
1622 The computations will be emitted at the end of BB. We also compute and
1623 store upper bound (included) of the result in BOUND.
1625 When the step of the data-ref in the loop is not 1 (as in interleaved data
1626 and SLP), the number of iterations of the prolog must be divided by the step
1627 (which is equal to the size of interleaved group).
1629 The above formulas assume that VF == number of elements in the vector. This
1630 may not hold when there are multiple types in the loop.
1631 In this case, for some data-references in the loop the VF does not represent
1632 the number of elements that fit in the vector. Therefore, instead of VF we
1633 use TYPE_VECTOR_SUBPARTS. */
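/* For example, assuming a 16-byte target alignment, VF = 4, 4-byte elements
and a step of 1, a DR that starts 8 bytes past an aligned address has
addr_mis/elem_size = 2 and prolog_niters = (4 - 2) & 3 = 2; peeling two
scalar iterations brings DR to an aligned address.  An already-aligned DR
gives 0.  */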
1635 static tree
1636 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1637 basic_block bb, int *bound)
1639 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1640 tree var;
1641 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1642 gimple_seq stmts = NULL, new_stmts = NULL;
1643 tree iters, iters_name;
1644 stmt_vec_info stmt_info = dr_info->stmt;
1645 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1646 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1648 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1650 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_NOTE, vect_location,
1654 "known peeling = %d.\n", npeel);
1656 iters = build_int_cst (niters_type, npeel);
1657 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1659 else
1661 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1662 tree type = TREE_TYPE (misalign_in_elems);
1663 HOST_WIDE_INT elem_size
1664 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1665 /* We only do prolog peeling if the target alignment is known at compile
1666 time. */
1667 poly_uint64 align_in_elems =
1668 exact_div (target_align, elem_size);
1669 tree align_in_elems_minus_1 =
1670 build_int_cst (type, align_in_elems - 1);
1671 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1673 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1674 & (align_in_elems - 1)). */
1675 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1676 size_zero_node) < 0;
1677 if (negative)
1678 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1679 align_in_elems_tree);
1680 else
1681 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1682 misalign_in_elems);
1683 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1684 iters = fold_convert (niters_type, iters);
1685 unsigned HOST_WIDE_INT align_in_elems_c;
1686 if (align_in_elems.is_constant (&align_in_elems_c))
1687 *bound = align_in_elems_c - 1;
1688 else
1689 *bound = -1;
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_NOTE, vect_location,
1694 "niters for prolog loop: %T\n", iters);
1696 var = create_tmp_var (niters_type, "prolog_loop_niters");
1697 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1699 if (new_stmts)
1700 gimple_seq_add_seq (&stmts, new_stmts);
1701 if (stmts)
1703 gcc_assert (single_succ_p (bb));
1704 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1705 if (gsi_end_p (gsi))
1706 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1707 else
1708 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1710 return iters_name;
1714 /* Function vect_update_init_of_dr
1716 If CODE is PLUS, the vector loop starts NITERS iterations after the
1717 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1718 iterations before the scalar one (using masking to skip inactive
1719 elements). This function updates the information recorded in DR to
1720 account for the difference. Specifically, it updates the OFFSET
1721 field of DR_INFO. */
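/* For example (illustrative values only): if NITERS is 2 and DR_STEP is
   4 bytes, a PLUS CODE advances the recorded offset by 2 * 4 = 8 bytes,
   while a MINUS CODE moves it back by the same amount.  */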
1723 static void
1724 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1726 struct data_reference *dr = dr_info->dr;
1727 tree offset = dr_info->offset;
1728 if (!offset)
1729 offset = build_zero_cst (sizetype);
1731 niters = fold_build2 (MULT_EXPR, sizetype,
1732 fold_convert (sizetype, niters),
1733 fold_convert (sizetype, DR_STEP (dr)));
1734 offset = fold_build2 (code, sizetype,
1735 fold_convert (sizetype, offset), niters);
1736 dr_info->offset = offset;
1740 /* Function vect_update_inits_of_drs
1742 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1743 CODE and NITERS are as for vect_update_init_of_dr. */
1745 void
1746 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1747 tree_code code)
1749 unsigned int i;
1750 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1751 struct data_reference *dr;
1753 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1755 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1756 here, but since we might use these niters to update the epilogues niters
1757 and data references we can't insert them here as this definition might not
1758 always dominate its uses. */
1759 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1760 niters = fold_convert (sizetype, niters);
1762 FOR_EACH_VEC_ELT (datarefs, i, dr)
1764 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1765 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1766 vect_update_init_of_dr (dr_info, niters, code);
1770 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1771 by masking. This involves calculating the number of iterations to
1772 be peeled and then aligning all memory references appropriately. */
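/* For example (illustrative values only): with VF = 4 and a known
   peeling-for-alignment of 3, the misalignment is 4 - 3 = 1 element, so
   LOOP_VINFO_MASK_SKIP_NITERS is set to 1, the data references are moved
   back by one iteration, and the first loop mask disables that single
   inactive element.  */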
1774 void
1775 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1777 tree misalign_in_elems;
1778 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1780 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1782 /* From the information recorded in LOOP_VINFO get the number of iterations
1783 that need to be skipped via masking. */
1784 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1786 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1787 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1788 misalign_in_elems = build_int_cst (type, misalign);
1790 else
1792 gimple_seq seq1 = NULL, seq2 = NULL;
1793 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1794 misalign_in_elems = fold_convert (type, misalign_in_elems);
1795 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1796 &seq2, true, NULL_TREE);
1797 gimple_seq_add_seq (&seq1, seq2);
1798 if (seq1)
1800 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1801 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1802 gcc_assert (!new_bb);
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location,
1808 "misalignment for fully-masked loop: %T\n",
1809 misalign_in_elems);
1811 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1813 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1816 /* This function builds ni_name = number of iterations. Statements
1817 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1818 it to TRUE if new ssa_var is generated. */
1820 tree
1821 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1823 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1824 if (TREE_CODE (ni) == INTEGER_CST)
1825 return ni;
1826 else
1828 tree ni_name, var;
1829 gimple_seq stmts = NULL;
1830 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1832 var = create_tmp_var (TREE_TYPE (ni), "niters");
1833 ni_name = force_gimple_operand (ni, &stmts, false, var);
1834 if (stmts)
1836 gsi_insert_seq_on_edge_immediate (pe, stmts);
1837 if (new_var_p != NULL)
1838 *new_var_p = true;
1841 return ni_name;
1845 /* Calculate the number of iterations above which the vectorized loop will be
1846 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1847 of prolog loop. If it's integer const, the integer number is also passed
1848 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1849 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1850 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1851 threshold below which the scalar (rather than vectorized) loop will be
1852 executed. This function stores the upper bound (inclusive) of the result
1853 in BOUND_SCALAR. */
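/* For example (illustrative values only): with CHECK_PROFITABILITY, a
   constant prolog of INT_NITERS_PROLOG = 2, BOUND_EPILOG = 4 and a
   threshold TH = 10, the scalar loop runs at most
   max (2 + 4, 10 - 1) = 9 iterations, so 9 is returned and stored in
   *BOUND_SCALAR.  */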
1855 static tree
1856 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1857 int bound_prolog, poly_int64 bound_epilog, int th,
1858 poly_uint64 *bound_scalar,
1859 bool check_profitability)
1861 tree type = TREE_TYPE (niters_prolog);
1862 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1863 build_int_cst (type, bound_epilog));
1865 *bound_scalar = bound_prolog + bound_epilog;
1866 if (check_profitability)
1868 /* TH indicates the minimum niters of vectorized loop, while we
1869 compute the maximum niters of scalar loop. */
1870 th--;
1871 /* Peeling for constant times. */
1872 if (int_niters_prolog >= 0)
1874 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1875 return build_int_cst (type, *bound_scalar);
1877 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1878 and BOUND_EPILOG are inclusive upper bounds. */
1879 if (known_ge (th, bound_prolog + bound_epilog))
1881 *bound_scalar = th;
1882 return build_int_cst (type, th);
1884 /* Need to do runtime comparison. */
1885 else if (maybe_gt (th, bound_epilog))
1887 *bound_scalar = upper_bound (*bound_scalar, th);
1888 return fold_build2 (MAX_EXPR, type,
1889 build_int_cst (type, th), niters);
1892 return niters;
1895 /* NITERS is the number of times that the original scalar loop executes
1896 after peeling. Work out the maximum number of iterations N that can
1897 be handled by the vectorized form of the loop and then either:
1899 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1901 niters_vector = N
1903 b) set *STEP_VECTOR_PTR to one and generate:
1905 niters_vector = N / vf
1907 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1908 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1909 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
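/* For example (illustrative values only): with NITERS = 17, a constant
   VF = 4, no partial vectors, no peeling for gaps and NITERS_NO_OVERFLOW
   true, case (b) applies: niters_vector = 17 >> 2 = 4 and step_vector = 1.
   With a non-constant VF or partial vectors, case (a) is used instead:
   niters_vector = 17 and step_vector = VF.  */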
1911 void
1912 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1913 tree *niters_vector_ptr, tree *step_vector_ptr,
1914 bool niters_no_overflow)
1916 tree ni_minus_gap, var;
1917 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1918 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1919 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1920 tree log_vf = NULL_TREE;
1922 /* If epilogue loop is required because of data accesses with gaps, we
1923 subtract one iteration from the total number of iterations here for
1924 correct calculation of RATIO. */
1925 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1927 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1928 build_one_cst (type));
1929 if (!is_gimple_val (ni_minus_gap))
1931 var = create_tmp_var (type, "ni_gap");
1932 gimple *stmts = NULL;
1933 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1934 true, var);
1935 gsi_insert_seq_on_edge_immediate (pe, stmts);
1938 else
1939 ni_minus_gap = niters;
1941 unsigned HOST_WIDE_INT const_vf;
1942 if (vf.is_constant (&const_vf)
1943 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1945 /* Create: niters >> log2(vf) */
1946 /* If it's known that niters == number of latch executions + 1 doesn't
1947 overflow, we can generate niters >> log2(vf); otherwise we generate
1948 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1949 will be at least one. */
1950 log_vf = build_int_cst (type, exact_log2 (const_vf));
1951 if (niters_no_overflow)
1952 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1953 else
1954 niters_vector
1955 = fold_build2 (PLUS_EXPR, type,
1956 fold_build2 (RSHIFT_EXPR, type,
1957 fold_build2 (MINUS_EXPR, type,
1958 ni_minus_gap,
1959 build_int_cst (type, vf)),
1960 log_vf),
1961 build_int_cst (type, 1));
1962 step_vector = build_one_cst (type);
1964 else
1966 niters_vector = ni_minus_gap;
1967 step_vector = build_int_cst (type, vf);
1970 if (!is_gimple_val (niters_vector))
1972 var = create_tmp_var (type, "bnd");
1973 gimple_seq stmts = NULL;
1974 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1975 gsi_insert_seq_on_edge_immediate (pe, stmts);
1976 /* The peeling algorithm guarantees that the vector loop bound is at least ONE,
1977 so we set range information to make the niters analyzer's life easier. */
1978 if (stmts != NULL && log_vf)
1979 set_range_info (niters_vector, VR_RANGE,
1980 wi::to_wide (build_int_cst (type, 1)),
1981 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1982 TYPE_MAX_VALUE (type),
1983 log_vf)));
1985 *niters_vector_ptr = niters_vector;
1986 *step_vector_ptr = step_vector;
1988 return;
1991 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1992 loop specified by LOOP_VINFO after vectorization, compute the number
1993 of iterations before vectorization (niters_vector * vf) and store it
1994 to NITERS_VECTOR_MULT_VF_PTR. */
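/* For example (illustrative values only): with niters_vector = 4 and a
   constant VF = 4, the value computed here is 4 << log2 (4) = 16 scalar
   iterations covered by the vector loop.  */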
1996 static void
1997 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1998 tree niters_vector,
1999 tree *niters_vector_mult_vf_ptr)
2001 /* We should be using a step_vector of VF if VF is variable. */
2002 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2003 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2004 tree type = TREE_TYPE (niters_vector);
2005 tree log_vf = build_int_cst (type, exact_log2 (vf));
2006 basic_block exit_bb = single_exit (loop)->dest;
2008 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2009 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2010 niters_vector, log_vf);
2011 if (!is_gimple_val (niters_vector_mult_vf))
2013 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2014 gimple_seq stmts = NULL;
2015 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2016 &stmts, true, var);
2017 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2018 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2020 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2023 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2024 this function searches for the corresponding lcssa phi node in exit
2025 bb of LOOP. If it is found, return the phi result; otherwise return
2026 NULL. */
2028 static tree
2029 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2030 gphi *lcssa_phi)
2032 gphi_iterator gsi;
2033 edge e = single_exit (loop);
2035 gcc_assert (single_pred_p (e->dest));
2036 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2038 gphi *phi = gsi.phi ();
2039 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2040 PHI_ARG_DEF (lcssa_phi, 0), 0))
2041 return PHI_RESULT (phi);
2043 return NULL_TREE;
2046 /* After function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2047 from SECOND/FIRST and puts the copy at the original loop's preheader/exit
2048 edge, the two loops are arranged as below:
2050 preheader_a:
2051 first_loop:
2052 header_a:
2053 i_1 = PHI<i_0, i_2>;
2055 i_2 = i_1 + 1;
2056 if (cond_a)
2057 goto latch_a;
2058 else
2059 goto between_bb;
2060 latch_a:
2061 goto header_a;
2063 between_bb:
2064 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2066 second_loop:
2067 header_b:
2068 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2069 or with i_2 if no LCSSA phi is created
2070 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2072 i_4 = i_3 + 1;
2073 if (cond_b)
2074 goto latch_b;
2075 else
2076 goto exit_bb;
2077 latch_b:
2078 goto header_b;
2080 exit_bb:
2082 This function creates loop-closed SSA form for the first loop and updates
2083 the second loop's PHI nodes by replacing the argument on the incoming edge
2084 with the result of the newly created lcssa PHI nodes. If
2085 CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed ssa phis will only be
2086 created for non-iv phis of the first loop.
2088 This function assumes the exit bb of the first loop is the preheader bb of
2089 the second loop, i.e., between_bb in the example code. With PHIs updated,
2090 the second loop will execute the remaining iterations of the first. */
2092 static void
2093 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2094 class loop *first, class loop *second,
2095 bool create_lcssa_for_iv_phis)
2097 gphi_iterator gsi_update, gsi_orig;
2098 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2100 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2101 edge second_preheader_e = loop_preheader_edge (second);
2102 basic_block between_bb = single_exit (first)->dest;
2104 gcc_assert (between_bb == second_preheader_e->src);
2105 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2106 /* Either the first loop or the second is the loop to be vectorized. */
2107 gcc_assert (loop == first || loop == second);
2109 for (gsi_orig = gsi_start_phis (first->header),
2110 gsi_update = gsi_start_phis (second->header);
2111 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2112 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2114 gphi *orig_phi = gsi_orig.phi ();
2115 gphi *update_phi = gsi_update.phi ();
2117 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2118 /* Generate lcssa PHI node for the first loop. */
2119 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2120 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2121 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2123 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2124 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2125 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2126 arg = new_res;
2129 /* Update PHI node in the second loop by replacing arg on the loop's
2130 incoming edge. */
2131 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2134 /* For epilogue peeling we have to make sure to copy all LC PHIs
2135 for correct vectorization of live stmts. */
2136 if (loop == first)
2138 basic_block orig_exit = single_exit (second)->dest;
2139 for (gsi_orig = gsi_start_phis (orig_exit);
2140 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2142 gphi *orig_phi = gsi_orig.phi ();
2143 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2144 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2145 continue;
2147 /* Already created in the above loop. */
2148 if (find_guard_arg (first, second, orig_phi))
2149 continue;
2151 tree new_res = copy_ssa_name (orig_arg);
2152 gphi *lcphi = create_phi_node (new_res, between_bb);
2153 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2158 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2159 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2160 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2161 appear like below:
2163 guard_bb:
2164 if (cond)
2165 goto merge_bb;
2166 else
2167 goto skip_loop;
2169 skip_loop:
2170 header_a:
2171 i_1 = PHI<i_0, i_2>;
2173 i_2 = i_1 + 1;
2174 if (cond_a)
2175 goto latch_a;
2176 else
2177 goto exit_a;
2178 latch_a:
2179 goto header_a;
2181 exit_a:
2182 i_5 = PHI<i_2>;
2184 merge_bb:
2185 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2187 update_loop:
2188 header_b:
2189 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2191 i_4 = i_3 + 1;
2192 if (cond_b)
2193 goto latch_b;
2194 else
2195 goto exit_bb;
2196 latch_b:
2197 goto header_b;
2199 exit_bb:
2201 This function creates PHI nodes at merge_bb and replaces the use of i_5
2202 in the update_loop's PHI node with the result of the new PHI. */
2204 static void
2205 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2206 class loop *update_loop,
2207 edge guard_edge, edge merge_edge)
2209 location_t merge_loc, guard_loc;
2210 edge orig_e = loop_preheader_edge (skip_loop);
2211 edge update_e = loop_preheader_edge (update_loop);
2212 gphi_iterator gsi_orig, gsi_update;
2214 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2215 gsi_update = gsi_start_phis (update_loop->header));
2216 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2217 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2219 gphi *orig_phi = gsi_orig.phi ();
2220 gphi *update_phi = gsi_update.phi ();
2222 /* Generate new phi node at merge bb of the guard. */
2223 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2224 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2226 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2227 args in NEW_PHI for these edges. */
2228 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2229 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2230 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2231 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2232 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2233 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2235 /* Update phi in UPDATE_PHI. */
2236 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2240 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2241 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2242 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2243 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2244 The CFG looks like:
2246 loop:
2247 header_a:
2248 i_1 = PHI<i_0, i_2>;
2250 i_2 = i_1 + 1;
2251 if (cond_a)
2252 goto latch_a;
2253 else
2254 goto exit_a;
2255 latch_a:
2256 goto header_a;
2258 exit_a:
2260 guard_bb:
2261 if (cond)
2262 goto merge_bb;
2263 else
2264 goto epilog_loop;
2266 ;; fall_through_bb
2268 epilog_loop:
2269 header_b:
2270 i_3 = PHI<i_2, i_4>;
2272 i_4 = i_3 + 1;
2273 if (cond_b)
2274 goto latch_b;
2275 else
2276 goto merge_bb;
2277 latch_b:
2278 goto header_b;
2280 merge_bb:
2281 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2283 exit_bb:
2284 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2286 For each name used outside EPILOG (i.e., for each name that has an lcssa
2287 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2288 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2289 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2290 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2291 in exit_bb will also be updated. */
2293 static void
2294 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2295 edge guard_edge, edge merge_edge)
2297 gphi_iterator gsi;
2298 basic_block merge_bb = guard_edge->dest;
2300 gcc_assert (single_succ_p (merge_bb));
2301 edge e = single_succ_edge (merge_bb);
2302 basic_block exit_bb = e->dest;
2303 gcc_assert (single_pred_p (exit_bb));
2304 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2306 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2308 gphi *update_phi = gsi.phi ();
2309 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2311 tree merge_arg = NULL_TREE;
2313 /* If the old argument is a SSA_NAME use its current_def. */
2314 if (TREE_CODE (old_arg) == SSA_NAME)
2315 merge_arg = get_current_def (old_arg);
2316 /* If it's a constant or doesn't have a current_def, just use the old
2317 argument. */
2318 if (!merge_arg)
2319 merge_arg = old_arg;
2321 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2322 /* If the var is live after loop but not a reduction, we simply
2323 use the old arg. */
2324 if (!guard_arg)
2325 guard_arg = old_arg;
2327 /* Create new phi node in MERGE_BB: */
2328 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2329 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2331 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2332 the two PHI args in merge_phi for these edges. */
2333 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2334 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2336 /* Update the original phi in exit_bb. */
2337 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2341 /* EPILOG loop is duplicated from the original loop for vectorizing,
2342 the arg of its loop closed ssa PHI needs to be updated. */
2344 static void
2345 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2347 gphi_iterator gsi;
2348 basic_block exit_bb = single_exit (epilog)->dest;
2350 gcc_assert (single_pred_p (exit_bb));
2351 edge e = EDGE_PRED (exit_bb, 0);
2352 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2353 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2356 /* Function vect_do_peeling.
2358 Input:
2359 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2361 preheader:
2362 LOOP:
2363 header_bb:
2364 loop_body
2365 if (exit_loop_cond) goto exit_bb
2366 else goto header_bb
2367 exit_bb:
2369 - NITERS: The number of iterations of the loop.
2370 - NITERSM1: The number of iterations of the loop's latch.
2371 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2372 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2373 CHECK_PROFITABILITY is true.
2374 Output:
2375 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2376 iterate after vectorization; see vect_set_loop_condition for details.
2377 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2378 should be set to the number of scalar iterations handled by the
2379 vector loop. The SSA name is only used on exit from the loop.
2381 This function peels prolog and epilog from the loop, adds guards skipping
2382 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2383 would look like:
2385 guard_bb_1:
2386 if (prefer_scalar_loop) goto merge_bb_1
2387 else goto guard_bb_2
2389 guard_bb_2:
2390 if (skip_prolog) goto merge_bb_2
2391 else goto prolog_preheader
2393 prolog_preheader:
2394 PROLOG:
2395 prolog_header_bb:
2396 prolog_body
2397 if (exit_prolog_cond) goto prolog_exit_bb
2398 else goto prolog_header_bb
2399 prolog_exit_bb:
2401 merge_bb_2:
2403 vector_preheader:
2404 VECTOR LOOP:
2405 vector_header_bb:
2406 vector_body
2407 if (exit_vector_cond) goto vector_exit_bb
2408 else goto vector_header_bb
2409 vector_exit_bb:
2411 guard_bb_3:
2412 if (skip_epilog) goto merge_bb_3
2413 else goto epilog_preheader
2415 merge_bb_1:
2417 epilog_preheader:
2418 EPILOG:
2419 epilog_header_bb:
2420 epilog_body
2421 if (exit_epilog_cond) goto merge_bb_3
2422 else goto epilog_header_bb
2424 merge_bb_3:
2426 Note this function peels the prolog and epilog, and adds the corresponding
2427 guards, only when necessary.
2428 This function returns the epilogue loop if a decision was made to vectorize
2429 it, otherwise NULL.
2431 The analysis resulting in this epilogue loop's loop_vec_info was performed
2432 in the same vect_analyze_loop call as the main loop's. At that time
2433 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2434 vectorization factors than the main loop. This list is stored in the main
2435 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2436 vectorize the epilogue loop for a lower vectorization factor, the
2437 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2438 updated and linked to the epilogue loop. This is later used to vectorize
2439 the epilogue. The reason the loop_vec_info needs updating is that it was
2440 constructed based on the original main loop, and the epilogue loop is a
2441 copy of this loop, so all links pointing to statements in the original loop
2442 need updating. Furthermore, these loop_vec_infos share the
2443 data_reference's records, which will also need to be updated.
2445 TODO: Guard for prefer_scalar_loop should be emitted along with
2446 versioning conditions if loop versioning is needed. */
2449 class loop *
2450 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2451 tree *niters_vector, tree *step_vector,
2452 tree *niters_vector_mult_vf_var, int th,
2453 bool check_profitability, bool niters_no_overflow,
2454 tree *advance)
2456 edge e, guard_e;
2457 tree type = TREE_TYPE (niters), guard_cond;
2458 basic_block guard_bb, guard_to;
2459 profile_probability prob_prolog, prob_vector, prob_epilog;
2460 int estimated_vf;
2461 int prolog_peeling = 0;
2462 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2463 /* We currently do not support prolog peeling if the target alignment is not
2464 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2465 target alignment being constant. */
2466 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2467 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2468 return NULL;
2470 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2471 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2473 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2474 poly_uint64 bound_epilog = 0;
2475 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2476 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2477 bound_epilog += vf - 1;
2478 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2479 bound_epilog += 1;
2480 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2481 poly_uint64 bound_scalar = bound_epilog;
2483 if (!prolog_peeling && !epilog_peeling)
2484 return NULL;
2486 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2487 estimated_vf = vect_vf_for_cost (loop_vinfo);
2488 if (estimated_vf == 2)
2489 estimated_vf = 3;
2490 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2491 .apply_scale (estimated_vf - 1, estimated_vf);
2493 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2494 class loop *first_loop = loop;
2495 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2497 /* We might have a queued need to update virtual SSA form. As we
2498 delete the update SSA machinery below after doing a regular
2499 incremental SSA update during loop copying make sure we don't
2500 lose that fact.
2501 ??? Needing to update virtual SSA form by renaming is unfortunate
2502 but not all of the vectorizer code inserting new loads / stores
2503 properly assigns virtual operands to those statements. */
2504 update_ssa (TODO_update_ssa_only_virtuals);
2506 create_lcssa_for_virtual_phi (loop);
2508 /* If we're vectorizing an epilogue loop, the update_ssa above will
2509 have ensured that the virtual operand is in SSA form throughout the
2510 vectorized main loop. Normally it is possible to trace the updated
2511 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2512 back to scalar-stmt vuses, meaning that the effect of the SSA update
2513 remains local to the main loop. However, there are rare cases in
2514 which the vectorized loop has vdefs even when the original scalar
2515 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2516 introduces clobbers of the temporary vector array, which in turn
2517 needs new vdefs. If the scalar loop doesn't write to memory, these
2518 new vdefs will be the only ones in the vector loop.
2520 In that case, update_ssa will have added a new virtual phi to the
2521 main loop, which previously didn't need one. Ensure that we (locally)
2522 maintain LCSSA form for the virtual operand, just as we would have
2523 done if the virtual phi had existed from the outset. This makes it
2524 easier to duplicate the scalar epilogue loop below. */
2525 tree vop_to_rename = NULL_TREE;
2526 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2528 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2529 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2532 if (MAY_HAVE_DEBUG_BIND_STMTS)
2534 gcc_assert (!adjust_vec.exists ());
2535 adjust_vec.create (32);
2537 initialize_original_copy_tables ();
2539 /* Record the anchor bb at which the guard should be placed if the scalar
2540 loop might be preferred. */
2541 basic_block anchor = loop_preheader_edge (loop)->src;
2543 /* Generate the number of iterations for the prolog loop. We do this here
2544 so that we can also get the upper bound on the number of iterations. */
2545 tree niters_prolog;
2546 int bound_prolog = 0;
2547 if (prolog_peeling)
2548 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2549 &bound_prolog);
2550 else
2551 niters_prolog = build_int_cst (type, 0);
2553 loop_vec_info epilogue_vinfo = NULL;
2554 if (vect_epilogues)
2556 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2557 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2560 tree niters_vector_mult_vf = NULL_TREE;
2561 /* Saving NITERs before the loop, as this may be changed by prologue. */
2562 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2563 edge update_e = NULL, skip_e = NULL;
2564 unsigned int lowest_vf = constant_lower_bound (vf);
2565 /* If we know the number of scalar iterations for the main loop we should
2566 check whether after the main loop there are enough iterations left over
2567 for the epilogue. */
2568 if (vect_epilogues
2569 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2570 && prolog_peeling >= 0
2571 && known_eq (vf, lowest_vf))
2573 unsigned HOST_WIDE_INT eiters
2574 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2575 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2577 eiters -= prolog_peeling;
2578 eiters
2579 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2581 unsigned int ratio;
2582 unsigned int epilogue_gaps
2583 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2584 while (!(constant_multiple_p
2585 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2586 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2587 && eiters >= lowest_vf / ratio + epilogue_gaps))
2589 delete epilogue_vinfo;
2590 epilogue_vinfo = NULL;
2591 if (loop_vinfo->epilogue_vinfos.length () == 0)
2593 vect_epilogues = false;
2594 break;
2596 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2597 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2598 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2601 /* Prolog loop may be skipped. */
2602 bool skip_prolog = (prolog_peeling != 0);
2603 /* Skip the vectorized loop and go to the epilog when there are not enough
2604 iterations to enter the vectorized loop; if so we should perform runtime
2605 checks on NITERS to decide whether to skip the current vectorized loop.
2606 When we know the number of scalar iterations, we only add such a skip
2607 when that number may be smaller than the number of iterations required
2608 to enter this loop; for this we use the upper bounds on the prolog and
2609 epilog peeling. When we don't know the number of iterations and don't
2610 require versioning it is because we have asserted that there are enough
2611 scalar iterations to enter the main loop, so this skip is not necessary.
2612 When we are versioning then we only add such a skip if we have chosen
2613 to vectorize the epilogue. */
2615 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2616 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2617 bound_prolog + bound_epilog)
2618 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2619 || vect_epilogues));
2620 /* Epilog loop must be executed if the number of iterations for epilog
2621 loop is known at compile time, otherwise we need to add a check at
2622 the end of vector loop and skip to the end of epilog loop. */
2623 bool skip_epilog = (prolog_peeling < 0
2624 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2625 || !vf.is_constant ());
2626 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2627 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2628 skip_epilog = false;
2630 if (skip_vector)
2632 split_edge (loop_preheader_edge (loop));
2634 /* Due to the order in which we peel prolog and epilog, we first
2635 propagate probability to the whole loop. The purpose is to
2636 avoid adjusting probabilities of both prolog and vector loops
2637 separately. Note in this case, the probability of epilog loop
2638 needs to be scaled back later. */
2639 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2640 if (prob_vector.initialized_p ())
2642 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2643 scale_loop_profile (loop, prob_vector, 0);
2647 dump_user_location_t loop_loc = find_loop_location (loop);
2648 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2649 if (vect_epilogues)
2650 /* Make sure to set the epilogue's scalar loop, such that we can
2651 use the original scalar loop as remaining epilogue if necessary. */
2652 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2653 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2655 if (prolog_peeling)
2657 e = loop_preheader_edge (loop);
2658 if (!slpeel_can_duplicate_loop_p (loop, e))
2660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2661 "loop can't be duplicated to preheader edge.\n");
2662 gcc_unreachable ();
2664 /* Peel prolog and put it on preheader edge of loop. */
2665 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2666 if (!prolog)
2668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2669 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2670 gcc_unreachable ();
2672 prolog->force_vectorize = false;
2673 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2674 first_loop = prolog;
2675 reset_original_copy_tables ();
2677 /* Update the number of iterations for prolog loop. */
2678 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2679 vect_set_loop_condition (prolog, NULL, niters_prolog,
2680 step_prolog, NULL_TREE, false);
2682 /* Skip the prolog loop. */
2683 if (skip_prolog)
2685 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2686 niters_prolog, build_int_cst (type, 0));
2687 guard_bb = loop_preheader_edge (prolog)->src;
2688 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2689 guard_to = split_edge (loop_preheader_edge (loop));
2690 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2691 guard_to, guard_bb,
2692 prob_prolog.invert (),
2693 irred_flag);
2694 e = EDGE_PRED (guard_to, 0);
2695 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2696 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2698 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2699 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2702 /* Update init address of DRs. */
2703 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2704 /* Update niters for vector loop. */
2705 LOOP_VINFO_NITERS (loop_vinfo)
2706 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2707 LOOP_VINFO_NITERSM1 (loop_vinfo)
2708 = fold_build2 (MINUS_EXPR, type,
2709 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2710 bool new_var_p = false;
2711 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2712 /* It's guaranteed that vector loop bound before vectorization is at
2713 least VF, so set range information for newly generated var. */
2714 if (new_var_p)
2715 set_range_info (niters, VR_RANGE,
2716 wi::to_wide (build_int_cst (type, vf)),
2717 wi::to_wide (TYPE_MAX_VALUE (type)));
2719 /* Prolog iterates at most bound_prolog times, latch iterates at
2720 most bound_prolog - 1 times. */
2721 record_niter_bound (prolog, bound_prolog - 1, false, true);
2722 delete_update_ssa ();
2723 adjust_vec_debug_stmts ();
2724 scev_reset ();
2727 if (epilog_peeling)
2729 e = single_exit (loop);
2730 if (!slpeel_can_duplicate_loop_p (loop, e))
2732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2733 "loop can't be duplicated to exit edge.\n");
2734 gcc_unreachable ();
2736 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2737 said epilog then we should use a copy of the main loop as a starting
2738 point. This loop may have already had some preliminary transformations
2739 to allow for more optimal vectorization, for example if-conversion.
2740 If we are not vectorizing the epilog then we should use the scalar loop
2741 as the transformations mentioned above make little or no sense when not
2742 vectorizing. */
2743 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2744 if (vop_to_rename)
2746 /* Vectorizing the main loop can sometimes introduce a vdef to
2747 a loop that previously didn't have one; see the comment above
2748 the definition of VOP_TO_RENAME for details. The definition
2749 D that holds on E will then be different from the definition
2750 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2751 rename VOP_TO_RENAME to D when copying the loop.
2753 The virtual operand is in LCSSA form for the main loop,
2754 and no stmt between the main loop and E needs a vdef,
2755 so we know that D is provided by a phi rather than by a
2756 vdef on a normal gimple stmt. */
2757 basic_block vdef_bb = e->src;
2758 gphi *vphi;
2759 while (!(vphi = get_virtual_phi (vdef_bb)))
2760 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2761 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2762 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2764 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2765 if (!epilog)
2767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2768 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2769 gcc_unreachable ();
2771 epilog->force_vectorize = false;
2772 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2774 /* The scalar version of the loop may be preferred. In this case, add a guard
2775 and skip to epilog. Note this only happens when the number of
2776 iterations of loop is unknown at compile time, otherwise this
2777 won't be vectorized. */
2778 if (skip_vector)
2780 /* Additional epilogue iteration is peeled if gap exists. */
2781 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2782 bound_prolog, bound_epilog,
2783 th, &bound_scalar,
2784 check_profitability);
2785 /* Build guard against NITERSM1 since NITERS may overflow. */
2786 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2787 guard_bb = anchor;
2788 guard_to = split_edge (loop_preheader_edge (epilog));
2789 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2790 guard_to, guard_bb,
2791 prob_vector.invert (),
2792 irred_flag);
2793 skip_e = guard_e;
2794 e = EDGE_PRED (guard_to, 0);
2795 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2796 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2798 /* Simply propagate profile info from guard_bb to guard_to which is
2799 a merge point of control flow. */
2800 guard_to->count = guard_bb->count;
2802 /* Scale probability of epilog loop back.
2803 FIXME: We should avoid scaling down and back up. Profile may
2804 get lost if we scale down to 0. */
2805 basic_block *bbs = get_loop_body (epilog);
2806 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2807 bbs[i]->count = bbs[i]->count.apply_scale
2808 (bbs[i]->count,
2809 bbs[i]->count.apply_probability
2810 (prob_vector));
2811 free (bbs);
2814 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2815 /* If the loop is peeled a non-zero constant number of times, niters now
2816 refers to orig_niters - prolog_peeling, which won't overflow even if
2817 orig_niters overflows. */
2818 niters_no_overflow |= (prolog_peeling > 0);
2819 vect_gen_vector_loop_niters (loop_vinfo, niters,
2820 niters_vector, step_vector,
2821 niters_no_overflow);
2822 if (!integer_onep (*step_vector))
2824 /* On exit from the loop we will have an easy way of calculating
2825 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2826 until then. */
2827 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2828 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2829 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2831 else
2832 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2833 &niters_vector_mult_vf);
2834 /* Update IVs of original loop as if they were advanced by
2835 niters_vector_mult_vf steps. */
2836 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2837 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2838 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2839 update_e);
2841 if (skip_epilog)
2843 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2844 niters, niters_vector_mult_vf);
2845 guard_bb = single_exit (loop)->dest;
2846 guard_to = split_edge (single_exit (epilog));
2847 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2848 skip_vector ? anchor : guard_bb,
2849 prob_epilog.invert (),
2850 irred_flag);
2851 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2852 single_exit (epilog));
2853 /* Only need to handle basic block before epilog loop if it's not
2854 the guard_bb, which is the case when skip_vector is true. */
2855 if (guard_bb != bb_before_epilog)
2857 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2859 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2861 scale_loop_profile (epilog, prob_epilog, 0);
2863 else
2864 slpeel_update_phi_nodes_for_lcssa (epilog);
2866 unsigned HOST_WIDE_INT bound;
2867 if (bound_scalar.is_constant (&bound))
2869 gcc_assert (bound != 0);
2870 /* -1 to convert loop iterations to latch iterations. */
2871 record_niter_bound (epilog, bound - 1, false, true);
2874 delete_update_ssa ();
2875 adjust_vec_debug_stmts ();
2876 scev_reset ();
2879 if (vect_epilogues)
2881 epilog->aux = epilogue_vinfo;
2882 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2884 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2886 /* We now must calculate the number of iterations NITERS performed by the
2887 previous loop and the number EPILOGUE_NITERS to be performed by the epilogue. */
2888 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2889 niters_prolog, niters_vector_mult_vf);
2891 /* If skip_vector, the previous loop may be skipped, so we insert a phi-node to
2892 determine whether we are coming from the previous vectorized loop
2893 using the update_e edge or the skip_vector basic block using the
2894 skip_e edge. */
2895 if (skip_vector)
2897 gcc_assert (update_e != NULL && skip_e != NULL);
2898 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2899 update_e->dest);
2900 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2901 gimple *stmt = gimple_build_assign (new_ssa, niters);
2902 gimple_stmt_iterator gsi;
2903 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2904 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2906 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2907 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2909 else
2911 gsi = gsi_last_bb (update_e->src);
2912 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2915 niters = new_ssa;
2916 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2917 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2918 UNKNOWN_LOCATION);
2919 niters = PHI_RESULT (new_phi);
2922 /* Subtract the number of iterations performed by the vectorized loop
2923 from the number of total iterations. */
2924 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2925 before_loop_niters,
2926 niters);
2928 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2929 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2930 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2931 epilogue_niters,
2932 build_one_cst (TREE_TYPE (epilogue_niters)));
2934 /* Set ADVANCE to the number of iterations performed by the previous
2935 loop and its prologue. */
2936 *advance = niters;
2938 /* Redo the peeling for niter analysis as the NITERs and alignment
2939 may have been updated to take the main loop into account. */
2940 determine_peel_for_niter (epilogue_vinfo);
2943 adjust_vec.release ();
2944 free_original_copy_tables ();
2946 return vect_epilogues ? epilog : NULL;
2949 /* Function vect_create_cond_for_niters_checks.
2951 Create a conditional expression that represents the run-time checks for
2952 loop's niter. The loop is guaranteed to terminate if the run-time
2953 checks hold.
2955 Input:
2956 COND_EXPR - input conditional expression. New conditions will be chained
2957 with logical AND operation; a NULL COND_EXPR is treated
2958 as "true".
2959 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2960 assumptions to be checked.
2962 Output:
2963 COND_EXPR - conditional expression.
2965 The returned COND_EXPR is the conditional expression to be used in the
2966 if statement that controls which version of the loop gets executed at
2967 runtime. */
2969 static void
2970 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2972 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2974 if (*cond_expr)
2975 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2976 *cond_expr, part_cond_expr);
2977 else
2978 *cond_expr = part_cond_expr;
2981 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2982 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2984 static void
2985 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2987 if (*cond_expr)
2988 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2989 *cond_expr, part_cond_expr);
2990 else
2991 *cond_expr = part_cond_expr;
2994 /* Function vect_create_cond_for_align_checks.
2996 Create a conditional expression that represents the alignment checks for
2997 all data references (array element references) whose alignment must be
2998 checked at runtime.
3000 Input:
3001 COND_EXPR - input conditional expression. New conditions will be chained
3002 with logical AND operation.
3003 LOOP_VINFO - two fields of the loop information are used.
3004 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3005 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3007 Output:
3008 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3009 expression.
3010 The returned value is the conditional expression to be used in the if
3011 statement that controls which version of the loop gets executed at runtime.
3013 The algorithm makes two assumptions:
3014 1) The number of bytes "n" in a vector is a power of 2.
3015 2) An address "a" is aligned if a%n is zero and that this
3016 test can be done as a&(n-1) == 0. For example, for 16
3017 byte vectors the test is a&0xf == 0. */
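/* For example (illustrative only): for two data references whose
   first-vector addresses are a1 and a2, and 16-byte vectors
   (LOOP_VINFO_PTR_MASK = 0xf), the condition built below is
   ((a1 | a2) & 0xf) == 0, which holds only when both addresses are
   16-byte aligned.  */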
3019 static void
3020 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3021 tree *cond_expr,
3022 gimple_seq *cond_expr_stmt_list)
3024 vec<stmt_vec_info> may_misalign_stmts
3025 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3026 stmt_vec_info stmt_info;
3027 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3028 tree mask_cst;
3029 unsigned int i;
3030 tree int_ptrsize_type;
3031 char tmp_name[20];
3032 tree or_tmp_name = NULL_TREE;
3033 tree and_tmp_name;
3034 gimple *and_stmt;
3035 tree ptrsize_zero;
3036 tree part_cond_expr;
3038 /* Check that mask is one less than a power of 2, i.e., mask is
3039 all zeros followed by all ones. */
3040 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3042 int_ptrsize_type = signed_type_for (ptr_type_node);
3044 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3045 of the first vector of the i'th data reference. */
3047 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3049 gimple_seq new_stmt_list = NULL;
3050 tree addr_base;
3051 tree addr_tmp_name;
3052 tree new_or_tmp_name;
3053 gimple *addr_stmt, *or_stmt;
3054 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3055 bool negative = tree_int_cst_compare
3056 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3057 tree offset = negative
3058 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3060 /* create: addr_tmp = (int)(address_of_first_vector) */
3061 addr_base =
3062 vect_create_addr_base_for_vector_ref (loop_vinfo,
3063 stmt_info, &new_stmt_list,
3064 offset);
3065 if (new_stmt_list != NULL)
3066 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3068 sprintf (tmp_name, "addr2int%d", i);
3069 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3070 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3071 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3073 /* The addresses are ORed together. */
3075 if (or_tmp_name != NULL_TREE)
3077 /* create: or_tmp = or_tmp | addr_tmp */
3078 sprintf (tmp_name, "orptrs%d", i);
3079 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3080 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3081 or_tmp_name, addr_tmp_name);
3082 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3083 or_tmp_name = new_or_tmp_name;
3085 else
3086 or_tmp_name = addr_tmp_name;
3088 } /* end for i */
3090 mask_cst = build_int_cst (int_ptrsize_type, mask);
3092 /* create: and_tmp = or_tmp & mask */
3093 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3095 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3096 or_tmp_name, mask_cst);
3097 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3099 /* Make and_tmp the left operand of the conditional test against zero.
3100 If and_tmp has a nonzero bit then some address is unaligned. */
3101 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3102 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3103 and_tmp_name, ptrsize_zero);
3104 chain_cond_expr (cond_expr, part_cond_expr);
3107 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3108 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3109 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3110 and this new condition are true. Treat a null *COND_EXPR as "true". */
3112 static void
3113 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3115 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3116 unsigned int i;
3117 vec_object_pair *pair;
3118 FOR_EACH_VEC_ELT (pairs, i, pair)
3120 tree addr1 = build_fold_addr_expr (pair->first);
3121 tree addr2 = build_fold_addr_expr (pair->second);
3122 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3123 addr1, addr2);
3124 chain_cond_expr (cond_expr, part_cond_expr);
3128 /* Create an expression that is true when all lower-bound conditions for
3129 the vectorized loop are met. Chain this condition with *COND_EXPR. */
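/* For example (illustrative only): if a signed expression EXPR must be at
   least 8, the check emitted below is (unsigned) EXPR + 7 >= 15; the added
   7 makes the unsigned comparison also reject negative values of EXPR.  */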
3131 static void
3132 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3134 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3135 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3137 tree expr = lower_bounds[i].expr;
3138 tree type = unsigned_type_for (TREE_TYPE (expr));
3139 expr = fold_convert (type, expr);
3140 poly_uint64 bound = lower_bounds[i].min_value;
3141 if (!lower_bounds[i].unsigned_p)
3143 expr = fold_build2 (PLUS_EXPR, type, expr,
3144 build_int_cstu (type, bound - 1));
3145 bound += bound - 1;
3147 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3148 build_int_cstu (type, bound));
3149 chain_cond_expr (cond_expr, part_cond_expr);
3153 /* Function vect_create_cond_for_alias_checks.
3155 Create a conditional expression that represents the run-time checks for
3156 overlapping of address ranges represented by a list of data references
3157 relations passed as input.
3159 Input:
3160 COND_EXPR - input conditional expression. New conditions will be chained
3161 with logical AND operation. If it is NULL, then the function
3162 is used to return the number of alias checks.
3163 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3164 to be checked.
3166 Output:
3167 COND_EXPR - conditional expression.
3169 The returned COND_EXPR is the conditional expression to be used in the if
3170 statement that controls which version of the loop gets executed at runtime.
3173 void
3174 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3176 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3177 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3179 if (comp_alias_ddrs.is_empty ())
3180 return;
3182 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3183 &comp_alias_ddrs, cond_expr);
3184 if (dump_enabled_p ())
3185 dump_printf_loc (MSG_NOTE, vect_location,
3186 "created %u versioning for alias checks.\n",
3187 comp_alias_ddrs.length ());
3191 /* Function vect_loop_versioning.
3193 If the loop has data references that may or may not be aligned and/or
3194 has data reference relations whose independence was not proven then
3195 two versions of the loop need to be generated, one which is vectorized
3196 and one which isn't. A test is then generated to control which of the
3197 loops is executed. The test checks for the alignment of all of the
3198 data references that may or may not be aligned. An additional
3199 sequence of runtime tests is generated for each pair of DDRs whose
3200 independence was not proven. The vectorized version of loop is
3201 executed only if both alias and alignment tests are passed.
3203 The test generated to check which version of loop is executed
3204 is modified to also check for profitability as indicated by the
3205 cost model threshold TH.
3207 The versioning precondition(s) are placed in *COND_EXPR and
3208 *COND_EXPR_STMT_LIST. */
3210 class loop *
3211 vect_loop_versioning (loop_vec_info loop_vinfo,
3212 gimple *loop_vectorized_call)
3214 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3215 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3216 basic_block condition_bb;
3217 gphi_iterator gsi;
3218 gimple_stmt_iterator cond_exp_gsi;
3219 basic_block merge_bb;
3220 basic_block new_exit_bb;
3221 edge new_exit_e, e;
3222 gphi *orig_phi, *new_phi;
3223 tree cond_expr = NULL_TREE;
3224 gimple_seq cond_expr_stmt_list = NULL;
3225 tree arg;
3226 profile_probability prob = profile_probability::likely ();
3227 gimple_seq gimplify_stmt_list = NULL;
3228 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3229 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3230 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3231 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3232 poly_uint64 versioning_threshold
3233 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3234 tree version_simd_if_cond
3235 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3236 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3238 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3239 && !ordered_p (th, versioning_threshold))
3240 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3241 build_int_cst (TREE_TYPE (scalar_loop_iters),
3242 th - 1));
3243 if (maybe_ne (versioning_threshold, 0U))
3245 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3246 build_int_cst (TREE_TYPE (scalar_loop_iters),
3247 versioning_threshold - 1));
3248 if (cond_expr)
3249 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3250 expr, cond_expr);
3251 else
3252 cond_expr = expr;
3255 if (version_niter)
3256 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3258 if (cond_expr)
3259 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3260 &cond_expr_stmt_list,
3261 is_gimple_condexpr, NULL_TREE);
3263 if (version_align)
3264 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3265 &cond_expr_stmt_list);
3267 if (version_alias)
3269 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3270 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3271 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3274 if (version_simd_if_cond)
3276 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3277 if (flag_checking)
3278 if (basic_block bb
3279 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3280 gcc_assert (bb != loop->header
3281 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3282 && (scalar_loop == NULL
3283 || (bb != scalar_loop->header
3284 && dominated_by_p (CDI_DOMINATORS,
3285 scalar_loop->header, bb))));
3286 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3287 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3288 version_simd_if_cond, zero);
3289 if (cond_expr)
3290 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3291 c, cond_expr);
3292 else
3293 cond_expr = c;
3294 if (dump_enabled_p ())
3295 dump_printf_loc (MSG_NOTE, vect_location,
3296 "created versioning for simd if condition check.\n");
3299 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3300 &gimplify_stmt_list,
3301 is_gimple_condexpr, NULL_TREE);
3302 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3304 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3305 invariant in. */
3306 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3307 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3308 !gsi_end_p (gsi); gsi_next (&gsi))
3310 gimple *stmt = gsi_stmt (gsi);
3311 update_stmt (stmt);
3312 ssa_op_iter iter;
3313 use_operand_p use_p;
3314 basic_block def_bb;
3315 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3316 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3317 && flow_bb_inside_loop_p (outermost, def_bb))
3318 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
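/* Editorial sketch (exposition only): if the versioning condition only uses
   values defined outside some enclosing loop, the runtime check can be
   emitted once for that enclosing loop instead of on every outer iteration.
   With hypothetical arrays a, b, c and bounds m, n:

     for (int j = 0; j < m; j++)        // outer loop: candidate for versioning
       for (int i = 0; i < n; i++)      // inner loop being vectorized
         a[i] = a[i] + b[i] * c[j];

   Here the alias test between a and b is invariant in the j-loop, so
   LOOP_TO_VERSION below may end up being the outer loop.  */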
3321 /* Search for the outermost loop we can version. Avoid versioning of
3322 non-perfect nests but allow if-conversion versioned loops inside. */
3323 class loop *loop_to_version = loop;
3324 if (flow_loop_nested_p (outermost, loop))
3326 if (dump_enabled_p ())
3327 dump_printf_loc (MSG_NOTE, vect_location,
3328 "trying to apply versioning to outer loop %d\n",
3329 outermost->num);
3330 if (outermost->num == 0)
3331 outermost = superloop_at_depth (loop, 1);
3332 /* And avoid applying versioning on non-perfect nests. */
3333 while (loop_to_version != outermost
3334 && single_exit (loop_outer (loop_to_version))
3335 && (!loop_outer (loop_to_version)->inner->next
3336 || vect_loop_vectorized_call (loop_to_version))
3337 && (!loop_outer (loop_to_version)->inner->next
3338 || !loop_outer (loop_to_version)->inner->next->next))
3339 loop_to_version = loop_outer (loop_to_version);
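/* Editorial note (exposition only): the walk above only ascends into an
   enclosing loop that has a single exit and that contains, apart from a
   possible if-conversion guard, just this one loop nest.  A "perfect nest"
   in that sense looks like:

     for (int j = 0; j < m; j++)        // outer body is just the inner loop
       for (int i = 0; i < n; i++)
         a[j * n + i] = 0.0f;

   (a, m and n are hypothetical names used only for illustration).  */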
3342 /* Apply versioning. If there is already a scalar version created by
3343 if-conversion, re-use that. Note that we cannot re-use the copy of
3344 an if-converted outer-loop when vectorizing the inner loop only. */
3345 gcond *cond;
3346 if ((!loop_to_version->inner || loop == loop_to_version)
3347 && loop_vectorized_call)
3349 gcc_assert (scalar_loop);
3350 condition_bb = gimple_bb (loop_vectorized_call);
3351 cond = as_a <gcond *> (last_stmt (condition_bb));
3352 gimple_cond_set_condition_from_tree (cond, cond_expr);
3353 update_stmt (cond);
3355 if (cond_expr_stmt_list)
3357 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3358 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3359 GSI_SAME_STMT);
3362 /* if-conversion uses profile_probability::always () for both paths,
3363 reset both paths' probabilities appropriately. */
3364 edge te, fe;
3365 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3366 te->probability = prob;
3367 fe->probability = prob.invert ();
3368 /* We can scale the loop counts immediately but have to postpone
3369 scaling the scalar loop because we re-use it during peeling. */
3370 scale_loop_frequencies (loop_to_version, te->probability);
3371 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3373 nloop = scalar_loop;
3374 if (dump_enabled_p ())
3375 dump_printf_loc (MSG_NOTE, vect_location,
3376 "reusing %sloop version created by if conversion\n",
3377 loop_to_version != loop ? "outer " : "");
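/* Editorial note (exposition only): when if-conversion already versioned
   the loop it left behind a guard of the form

     if (.LOOP_VECTORIZED (1, 2))       // loop numbers are illustrative
       ... if-converted loop (the copy being vectorized) ...
     else
       ... untouched scalar loop ...

   The code above merely rewrites that guard's condition to the real
   versioning condition instead of creating yet another copy of the loop. */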
3379 else
3381 if (loop_to_version != loop
3382 && dump_enabled_p ())
3383 dump_printf_loc (MSG_NOTE, vect_location,
3384 "applying loop versioning to outer loop %d\n",
3385 loop_to_version->num);
3387 initialize_original_copy_tables ();
3388 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3389 prob, prob.invert (), prob, prob.invert (), true);
3390 gcc_assert (nloop);
3391 nloop = get_loop_copy (loop);
3393 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3394 reap those otherwise, and they also refer to the original
3395 loops. */
3396 class loop *l = loop;
3397 while (gimple *call = vect_loop_vectorized_call (l))
3399 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3400 fold_loop_internal_call (call, boolean_false_node);
3401 l = loop_outer (l);
3403 free_original_copy_tables ();
3405 if (cond_expr_stmt_list)
3407 cond_exp_gsi = gsi_last_bb (condition_bb);
3408 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3409 GSI_SAME_STMT);
3412 /* Loop versioning violates an assumption we try to maintain during
3413 vectorization - that the loop exit block has a single predecessor.
3414 After versioning, the exit block of both loop versions is the same
3415 basic block (i.e. it has two predecessors). To simplify the
3416 following transformations in the vectorizer, we fix this situation
3417 here by adding a new (empty) block on the exit-edge of the loop,
3418 with the proper loop-exit phis to maintain loop-closed-form.
3419 If loop versioning wasn't done from loop but from scalar_loop instead,
3420 merge_bb will already have just a single predecessor. */
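/* Editorial sketch (exposition only): before the fix-up both versions of
   the loop exit into the same block:

     vector-loop exit ---\
                          +--> merge_bb   (two predecessors)
     scalar-loop exit ---/

   split_edge below interposes an empty NEW_EXIT_BB on the vector loop's
   exit edge and gives it copies of merge_bb's PHIs, restoring the
   single-predecessor property on that exit path:

     vector-loop exit --> new_exit_bb ---\
                                          +--> merge_bb
     scalar-loop exit -------------------/
*/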
3422 merge_bb = single_exit (loop_to_version)->dest;
3423 if (EDGE_COUNT (merge_bb->preds) >= 2)
3425 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3426 new_exit_bb = split_edge (single_exit (loop_to_version));
3427 new_exit_e = single_exit (loop_to_version);
3428 e = EDGE_SUCC (new_exit_bb, 0);
3430 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3431 gsi_next (&gsi))
3433 tree new_res;
3434 orig_phi = gsi.phi ();
3435 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3436 new_phi = create_phi_node (new_res, new_exit_bb);
3437 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3438 add_phi_arg (new_phi, arg, new_exit_e,
3439 gimple_phi_arg_location_from_edge (orig_phi, e));
3440 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3444 update_ssa (TODO_update_ssa);
3447 if (version_niter)
3449 /* The versioned loop could be infinite, so we need to clear the existing
3450 niter information, which was copied from the original loop. */
3451 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3452 vect_free_loop_info_assumptions (nloop);
3453 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3454 loop_constraint_set (loop, LOOP_C_INFINITE);
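/* Editorial note (exposition only): the niter information may rely on
   assumptions that were only guaranteed by the versioning condition.  For
   instance, a loop such as

     for (unsigned i = 0; i != n; i += 4)
       ...

   is only known to terminate under the assumption that n is a multiple of
   4; such assumptions must not be carried over unconditionally to the
   versioned copy, hence the reset above.  */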
3457 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3458 && dump_enabled_p ())
3460 if (version_alias)
3461 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3462 vect_location,
3463 "loop versioned for vectorization because of "
3464 "possible aliasing\n");
3465 if (version_align)
3466 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3467 vect_location,
3468 "loop versioned for vectorization to enhance "
3469 "alignment\n");
3473 return nloop;