gcc/tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
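/* For illustration only (vector sizes assumed, not taken from the caller):
   with 8-element vectors, OFFSET 0 gives BASE = 0 and the series
   { 0, 8, 1, 9, 2, 10, ... }, which interleaves the low halves of the two
   input vectors, while OFFSET 1 gives BASE = 4 and { 4, 12, 5, 13, 6, 14, ... },
   interleaving the high halves.  Either way the permutation is only reported
   as supported if the target accepts it as a constant VEC_PERM_EXPR.  */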
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
360 gimple_seq_add_stmt (seq, stmt);
362 return true;
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
382 return true;
384 return false;
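/* A rough sketch of the two strategies above, with mask widths chosen purely
   for illustration.  Unpacking splits each source mask S into two destination
   masks:

       D0 = VEC_UNPACK_LO_EXPR <S>
       D1 = VEC_UNPACK_HI_EXPR <S>

   so that each destination mask governs one half of the scalars governed by S
   (any excess bits being VIEW_CONVERTed away).  The interleave fallback
   instead duplicates each bit of S in place:

       D = VEC_PERM_EXPR <S, S, { 0, N, 1, N+1, ... }>

   which applies when the destination mask type equals the source type but
   twice as many mask bits are needed per scalar iteration.  */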
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
400 It is known that:
402 NITERS * RGC->max_nscalars_per_iter * RGC->factor
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
407 VF * RGC->max_nscalars_per_iter * RGC->factor
409 might overflow before hitting a value above:
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks or zero
415 lengths for RGC.
417 Note: the cost of the code generated by this function is modeled
418 by vect_estimate_min_profitable_iters, so changes here may need
419 corresponding changes there. */
421 static tree
422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 gimple_seq *preheader_seq,
424 gimple_stmt_iterator loop_cond_gsi,
425 rgroup_controls *rgc, tree niters,
426 tree niters_skip, bool might_wrap_p)
428 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
429 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
430 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
432 tree ctrl_type = rgc->type;
433 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
434 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
435 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
436 tree length_limit = NULL_TREE;
437 /* For length, we need length_limit to ensure length in range. */
438 if (!use_masks_p)
439 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
441 /* Calculate the maximum number of item values that the rgroup
442 handles in total, the number that it handles for each iteration
443 of the vector loop, and the number that it should skip during the
444 first iteration of the vector loop. */
445 tree nitems_total = niters;
446 tree nitems_step = build_int_cst (iv_type, vf);
447 tree nitems_skip = niters_skip;
448 if (nitems_per_iter != 1)
450 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
451 these multiplications don't overflow. */
452 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
453 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
454 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
455 nitems_total, compare_factor);
456 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
457 nitems_step, iv_factor);
458 if (nitems_skip)
459 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
460 nitems_skip, compare_factor);
463 /* Create an induction variable that counts the number of items
464 processed. */
465 tree index_before_incr, index_after_incr;
466 gimple_stmt_iterator incr_gsi;
467 bool insert_after;
468 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
469 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
470 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
472 tree zero_index = build_int_cst (compare_type, 0);
473 tree test_index, test_limit, first_limit;
474 gimple_stmt_iterator *test_gsi;
475 if (might_wrap_p)
477 /* In principle the loop should stop iterating once the incremented
478 IV reaches a value greater than or equal to:
480 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
482 However, there's no guarantee that this addition doesn't overflow
483 the comparison type, or that the IV hits a value above it before
484 wrapping around. We therefore adjust the limit down by one
485 IV step:
487 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
488 -[infinite-prec] NITEMS_STEP
490 and compare the IV against this limit _before_ incrementing it.
491 Since the comparison type is unsigned, we actually want the
492 subtraction to saturate at zero:
494 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
495 -[sat] NITEMS_STEP
497 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
499 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
501 where the rightmost subtraction can be done directly in
502 COMPARE_TYPE. */
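/* Worked example, with numbers chosen only for illustration: if NITEMS_TOTAL
   is 10, NITEMS_STEP is 16 and there is no skip amount, ADJUST is 16 and the
   limit becomes MAX (10, 16) - 16 = 0.  The MAX makes the subtraction
   saturate at zero in the unsigned COMPARE_TYPE, and comparing the
   pre-increment IV (0) against 0 yields an all-false mask for the second
   iteration, so the loop body runs exactly once, as required for 10 items.  */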
503 test_index = index_before_incr;
504 tree adjust = gimple_convert (preheader_seq, compare_type,
505 nitems_step);
506 if (nitems_skip)
507 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
508 adjust, nitems_skip);
509 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
510 nitems_total, adjust);
511 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
512 test_limit, adjust);
513 test_gsi = &incr_gsi;
515 /* Get a safe limit for the first iteration. */
516 if (nitems_skip)
518 /* The first vector iteration can handle at most NITEMS_STEP
519 items. NITEMS_STEP <= CONST_LIMIT, and adding
520 NITEMS_SKIP to that cannot overflow. */
521 tree const_limit = build_int_cst (compare_type,
522 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
523 * nitems_per_iter);
524 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
525 nitems_total, const_limit);
526 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
527 first_limit, nitems_skip);
529 else
530 /* For the first iteration it doesn't matter whether the IV hits
531 a value above NITEMS_TOTAL. That only matters for the latch
532 condition. */
533 first_limit = nitems_total;
535 else
537 /* Test the incremented IV, which will always hit a value above
538 the bound before wrapping. */
539 test_index = index_after_incr;
540 test_limit = nitems_total;
541 if (nitems_skip)
542 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
543 test_limit, nitems_skip);
544 test_gsi = &loop_cond_gsi;
546 first_limit = test_limit;
549 /* Convert the IV value to the comparison type (either a no-op or
550 a demotion). */
551 gimple_seq test_seq = NULL;
552 test_index = gimple_convert (&test_seq, compare_type, test_index);
553 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
555 /* Provide a definition of each control in the group. */
556 tree next_ctrl = NULL_TREE;
557 tree ctrl;
558 unsigned int i;
559 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
561 /* Previous controls will cover BIAS items. This control covers the
562 next batch. */
563 poly_uint64 bias = nitems_per_ctrl * i;
564 tree bias_tree = build_int_cst (compare_type, bias);
566 /* See whether the first iteration of the vector loop is known
567 to have a full control. */
568 poly_uint64 const_limit;
569 bool first_iteration_full
570 = (poly_int_tree_p (first_limit, &const_limit)
571 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
573 /* Rather than have a new IV that starts at BIAS and goes up to
574 TEST_LIMIT, prefer to use the same 0-based IV for each control
575 and adjust the bound down by BIAS. */
576 tree this_test_limit = test_limit;
577 if (i != 0)
579 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
580 compare_type, this_test_limit,
581 bias_tree);
582 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
583 compare_type, this_test_limit,
584 bias_tree);
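/* For instance (sizes assumed for illustration), with NITEMS_PER_CTRL equal
   to 8 the control for I = 1 has BIAS = 8 and its limit becomes
   MAX (TEST_LIMIT, 8) - 8; the MAX keeps the unsigned subtraction from
   wrapping when fewer than 8 items remain, in which case this control simply
   ends up covering nothing.  */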
587 /* Create the initial control. First include all items that
588 are within the loop limit. */
589 tree init_ctrl = NULL_TREE;
590 if (!first_iteration_full)
592 tree start, end;
593 if (first_limit == test_limit)
595 /* Use a natural test between zero (the initial IV value)
596 and the loop limit. The "else" block would be valid too,
597 but this choice can avoid the need to load BIAS_TREE into
598 a register. */
599 start = zero_index;
600 end = this_test_limit;
602 else
604 /* FIRST_LIMIT is the maximum number of items handled by the
605 first iteration of the vector loop. Test the portion
606 associated with this control. */
607 start = bias_tree;
608 end = first_limit;
611 if (use_masks_p)
613 init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
614 gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end);
615 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
617 else
619 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
620 gimple_seq seq = vect_gen_len (init_ctrl, start,
621 end, length_limit);
622 gimple_seq_add_seq (preheader_seq, seq);
626 /* Now AND out the bits that are within the number of skipped
627 items. */
628 poly_uint64 const_skip;
629 if (nitems_skip
630 && !(poly_int_tree_p (nitems_skip, &const_skip)
631 && known_le (const_skip, bias)))
633 gcc_assert (use_masks_p);
634 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
635 bias_tree, nitems_skip);
636 if (init_ctrl)
637 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
638 init_ctrl, unskipped_mask);
639 else
640 init_ctrl = unskipped_mask;
643 if (!init_ctrl)
645 /* First iteration is full. */
646 if (use_masks_p)
647 init_ctrl = build_minus_one_cst (ctrl_type);
648 else
649 init_ctrl = length_limit;
652 /* Get the control value for the next iteration of the loop. */
653 if (use_masks_p)
655 next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
656 gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
657 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
659 else
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
669 return next_ctrl;
672 /* Set up the iteration condition and rgroup controls for LOOP, given
673 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
674 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
675 the number of iterations of the original scalar loop that should be
676 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
677 for vect_set_loop_condition.
679 Insert the branch-back condition before LOOP_COND_GSI and return the
680 final gcond. */
682 static gcond *
683 vect_set_loop_condition_partial_vectors (class loop *loop,
684 loop_vec_info loop_vinfo, tree niters,
685 tree final_iv, bool niters_maybe_zero,
686 gimple_stmt_iterator loop_cond_gsi)
688 gimple_seq preheader_seq = NULL;
689 gimple_seq header_seq = NULL;
691 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
692 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
693 unsigned int compare_precision = TYPE_PRECISION (compare_type);
694 tree orig_niters = niters;
696 /* Type of the initial value of NITERS. */
697 tree ni_actual_type = TREE_TYPE (niters);
698 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
699 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
701 /* Convert NITERS to the same size as the compare. */
702 if (compare_precision > ni_actual_precision
703 && niters_maybe_zero)
705 /* We know that there is always at least one iteration, so if the
706 count is zero then it must have wrapped. Cope with this by
707 subtracting 1 before the conversion and adding 1 to the result. */
708 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
709 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
710 niters, build_minus_one_cst (ni_actual_type));
711 niters = gimple_convert (&preheader_seq, compare_type, niters);
712 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
713 niters, build_one_cst (compare_type));
715 else
716 niters = gimple_convert (&preheader_seq, compare_type, niters);
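/* Worked example, with an 8-bit NITERS type assumed purely for illustration:
   a value of 0 then really means 256 iterations.  Widening 0 directly would
   give 0, but (0 - 1) wraps to 255 in the narrow type, widens to 255, and
   adding 1 back yields the intended 256 in COMPARE_TYPE.  */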
718 /* Iterate over all the rgroups and fill in their controls. We could use
719 the first control from any rgroup for the loop condition; here we
720 arbitrarily pick the last. */
721 tree test_ctrl = NULL_TREE;
722 rgroup_controls *rgc;
723 unsigned int i;
724 auto_vec<rgroup_controls> *controls = use_masks_p
725 ? &LOOP_VINFO_MASKS (loop_vinfo)
726 : &LOOP_VINFO_LENS (loop_vinfo);
727 FOR_EACH_VEC_ELT (*controls, i, rgc)
728 if (!rgc->controls.is_empty ())
730 /* First try using permutes. This adds a single vector
731 instruction to the loop for each mask, but needs no extra
732 loop invariants or IVs. */
733 unsigned int nmasks = i + 1;
734 if (use_masks_p && (nmasks & 1) == 0)
736 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
737 if (!half_rgc->controls.is_empty ()
738 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
739 continue;
742 /* See whether zero-based IV would ever generate all-false masks
743 or zero length before wrapping around. */
744 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
746 /* Set up all controls for this group. */
747 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
748 &preheader_seq,
749 loop_cond_gsi, rgc,
750 niters, niters_skip,
751 might_wrap_p);
754 /* Emit all accumulated statements. */
755 add_preheader_seq (loop, preheader_seq);
756 add_header_seq (loop, header_seq);
758 /* Get a boolean result that tells us whether to iterate. */
759 edge exit_edge = single_exit (loop);
760 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
761 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
762 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
763 NULL_TREE, NULL_TREE);
764 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
766 /* The loop iterates (NITERS - 1) / VF + 1 times.
767 Subtract one from this to get the latch count. */
768 tree step = build_int_cst (compare_type,
769 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
770 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
771 build_minus_one_cst (compare_type));
772 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
773 niters_minus_one, step);
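/* For example (numbers chosen for illustration only), NITERS = 10 with
   VF = 4 gives (10 - 1) / 4 + 1 = 3 vector iterations, handling 4, 4 and 2
   scalar iterations respectively, so the latch count recorded here is 2.  */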
775 if (final_iv)
777 gassign *assign = gimple_build_assign (final_iv, orig_niters);
778 gsi_insert_on_edge_immediate (single_exit (loop), assign);
781 return cond_stmt;
784 /* Like vect_set_loop_condition, but handle the case in which the vector
785 loop handles exactly VF scalars per iteration. */
787 static gcond *
788 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
789 tree final_iv, bool niters_maybe_zero,
790 gimple_stmt_iterator loop_cond_gsi)
792 tree indx_before_incr, indx_after_incr;
793 gcond *cond_stmt;
794 gcond *orig_cond;
795 edge pe = loop_preheader_edge (loop);
796 edge exit_edge = single_exit (loop);
797 gimple_stmt_iterator incr_gsi;
798 bool insert_after;
799 enum tree_code code;
800 tree niters_type = TREE_TYPE (niters);
802 orig_cond = get_loop_exit_condition (loop);
803 gcc_assert (orig_cond);
804 loop_cond_gsi = gsi_for_stmt (orig_cond);
806 tree init, limit;
807 if (!niters_maybe_zero && integer_onep (step))
809 /* In this case we can use a simple 0-based IV:
812 x = 0;
816 x += 1;
818 while (x < NITERS); */
819 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
820 init = build_zero_cst (niters_type);
821 limit = niters;
823 else
825 /* The following works for all values of NITERS except 0:
828 x = 0;
832 x += STEP;
834 while (x <= NITERS - STEP);
836 so that the loop continues to iterate if x + STEP - 1 < NITERS
837 but stops if x + STEP - 1 >= NITERS.
839 However, if NITERS is zero, x never hits a value above NITERS - STEP
840 before wrapping around. There are two obvious ways of dealing with
841 this:
843 - start at STEP - 1 and compare x before incrementing it
844 - start at -1 and compare x after incrementing it
846 The latter is simpler and is what we use. The loop in this case
847 looks like:
850 x = -1;
854 x += STEP;
856 while (x < NITERS - STEP);
858 In both cases the loop limit is NITERS - STEP. */
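/* Concrete illustration, with values chosen for exposition: for NITERS = 12
   and STEP = 4 the limit is 8.  In case B, x steps through 4, 8, 12 and the
   loop exits once x > 8, i.e. after 3 iterations.  In case C, x starts at -1,
   so the post-increment values are 3, 7, 11 and the loop exits once x < 8 no
   longer holds, again after 3 iterations; when NITERS is 0 the limit wraps to
   the type maximum minus 3, giving the (1 << M) / STEP iterations described
   above.  */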
859 gimple_seq seq = NULL;
860 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
861 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
862 if (seq)
864 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
865 gcc_assert (!new_bb);
867 if (niters_maybe_zero)
869 /* Case C. */
870 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
871 init = build_all_ones_cst (niters_type);
873 else
875 /* Case B. */
876 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
877 init = build_zero_cst (niters_type);
881 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
882 create_iv (init, step, NULL_TREE, loop,
883 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
884 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
885 true, NULL_TREE, true,
886 GSI_SAME_STMT);
887 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
888 true, GSI_SAME_STMT);
890 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
891 NULL_TREE);
893 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
895 /* Record the number of latch iterations. */
896 if (limit == niters)
897 /* Case A: the loop iterates NITERS times. Subtract one to get the
898 latch count. */
899 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
900 build_int_cst (niters_type, 1));
901 else
902 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
903 Subtract one from this to get the latch count. */
904 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
905 limit, step);
907 if (final_iv)
909 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
910 indx_after_incr, init);
911 gsi_insert_on_edge_immediate (single_exit (loop), assign);
914 return cond_stmt;
917 /* If we're using fully-masked loops, make LOOP iterate:
919 N == (NITERS - 1) / STEP + 1
921 times. When NITERS is zero, this is equivalent to making the loop
922 execute (1 << M) / STEP times, where M is the precision of NITERS.
923 NITERS_MAYBE_ZERO is true if this last case might occur.
925 If we're not using fully-masked loops, make LOOP iterate:
927 N == (NITERS - STEP) / STEP + 1
929 times, where NITERS is known to be outside the range [1, STEP - 1].
930 This is equivalent to making the loop execute NITERS / STEP times
931 when NITERS is nonzero and (1 << M) / STEP times otherwise.
932 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
934 If FINAL_IV is nonnull, it is an SSA name that should be set to
935 N * STEP on exit from the loop.
937 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
939 void
940 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
941 tree niters, tree step, tree final_iv,
942 bool niters_maybe_zero)
944 gcond *cond_stmt;
945 gcond *orig_cond = get_loop_exit_condition (loop);
946 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
948 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
949 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
950 niters, final_iv,
951 niters_maybe_zero,
952 loop_cond_gsi);
953 else
954 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
955 niters_maybe_zero,
956 loop_cond_gsi);
958 /* Remove old loop exit test. */
959 stmt_vec_info orig_cond_info;
960 if (loop_vinfo
961 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
962 loop_vinfo->remove_stmt (orig_cond_info);
963 else
964 gsi_remove (&loop_cond_gsi, true);
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
968 cond_stmt);
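/* Usage note (informal): callers pass STEP equal to the vectorization factor
   for a normal vector loop, so with NITERS = 12 and STEP = 4 (values assumed
   for illustration) the loop set up here executes 3 times, and if FINAL_IV is
   given it ends up holding 12, i.e. N * STEP.  */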
971 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 972 For all PHI arguments in FROM->dest and TO->dest from those
 973 edges, ensure that the current_def of each TO->dest PHI argument
 974 is set to that of the corresponding FROM->dest argument.
976 static void
977 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
979 gimple_stmt_iterator gsi_from, gsi_to;
981 for (gsi_from = gsi_start_phis (from->dest),
982 gsi_to = gsi_start_phis (to->dest);
983 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
985 gimple *from_phi = gsi_stmt (gsi_from);
986 gimple *to_phi = gsi_stmt (gsi_to);
987 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
988 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
989 if (virtual_operand_p (from_arg))
991 gsi_next (&gsi_from);
992 continue;
994 if (virtual_operand_p (to_arg))
996 gsi_next (&gsi_to);
997 continue;
999 if (TREE_CODE (from_arg) != SSA_NAME)
1000 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1001 else if (TREE_CODE (to_arg) == SSA_NAME
1002 && from_arg != to_arg)
1004 if (get_current_def (to_arg) == NULL_TREE)
1006 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1007 TREE_TYPE (get_current_def
1008 (from_arg))));
1009 set_current_def (to_arg, get_current_def (from_arg));
1012 gsi_next (&gsi_from);
1013 gsi_next (&gsi_to);
1016 gphi *from_phi = get_virtual_phi (from->dest);
1017 gphi *to_phi = get_virtual_phi (to->dest);
1018 if (from_phi)
1019 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1020 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1024 /* Given LOOP this function generates a new copy of it and puts it
1025 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1026 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1027 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1028 entry or exit of LOOP. */
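/* A simplified sketch of the result, ignoring the temporary forwarder blocks
   that the function creates and then removes:

       peeling at the exit edge          peeling at the entry edge

           preheader                         preheader
               |                                 |
             LOOP                            LOOP copy
               |                                 |
           LOOP copy                           LOOP
               |                                 |
         original exit                     original exit           */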
1030 class loop *
1031 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1032 class loop *scalar_loop, edge e)
1034 class loop *new_loop;
1035 basic_block *new_bbs, *bbs, *pbbs;
1036 bool at_exit;
1037 bool was_imm_dom;
1038 basic_block exit_dest;
1039 edge exit, new_exit;
1040 bool duplicate_outer_loop = false;
1042 exit = single_exit (loop);
1043 at_exit = (e == exit);
1044 if (!at_exit && e != loop_preheader_edge (loop))
1045 return NULL;
1047 if (scalar_loop == NULL)
1048 scalar_loop = loop;
1050 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1051 pbbs = bbs + 1;
1052 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1053 /* Allow duplication of outer loops. */
1054 if (scalar_loop->inner)
1055 duplicate_outer_loop = true;
1056 /* Check whether duplication is possible. */
1057 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1059 free (bbs);
1060 return NULL;
1063 /* Generate new loop structure. */
1064 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1065 duplicate_subloops (scalar_loop, new_loop);
1067 exit_dest = exit->dest;
1068 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1069 exit_dest) == loop->header ?
1070 true : false);
1072 /* Also copy the pre-header, this avoids jumping through hoops to
1073 duplicate the loop entry PHI arguments. Create an empty
1074 pre-header unconditionally for this. */
1075 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1076 edge entry_e = single_pred_edge (preheader);
1077 bbs[0] = preheader;
1078 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1080 exit = single_exit (scalar_loop);
1081 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1082 &exit, 1, &new_exit, NULL,
1083 at_exit ? loop->latch : e->src, true);
1084 exit = single_exit (loop);
1085 basic_block new_preheader = new_bbs[0];
1087 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1089 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1090 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1091 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1093 if (scalar_loop != loop)
1095 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1096 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1097 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1098 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1099 header) to have current_def set, so copy them over. */
1100 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1101 exit);
 1102 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch, 0),
 1104 EDGE_SUCC (loop->latch, 0));
1107 if (at_exit) /* Add the loop copy at exit. */
1109 if (scalar_loop != loop)
1111 gphi_iterator gsi;
1112 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1114 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1115 gsi_next (&gsi))
1117 gphi *phi = gsi.phi ();
1118 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1119 location_t orig_locus
1120 = gimple_phi_arg_location_from_edge (phi, e);
1122 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1125 redirect_edge_and_branch_force (e, new_preheader);
1126 flush_pending_stmts (e);
1127 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1128 if (was_imm_dom || duplicate_outer_loop)
1129 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1131 /* And remove the non-necessary forwarder again. Keep the other
1132 one so we have a proper pre-header for the loop at the exit edge. */
1133 redirect_edge_pred (single_succ_edge (preheader),
1134 single_pred (preheader));
1135 delete_basic_block (preheader);
1136 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1137 loop_preheader_edge (scalar_loop)->src);
1139 else /* Add the copy at entry. */
1141 if (scalar_loop != loop)
1143 /* Remove the non-necessary forwarder of scalar_loop again. */
1144 redirect_edge_pred (single_succ_edge (preheader),
1145 single_pred (preheader));
1146 delete_basic_block (preheader);
1147 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1148 loop_preheader_edge (scalar_loop)->src);
1149 preheader = split_edge (loop_preheader_edge (loop));
1150 entry_e = single_pred_edge (preheader);
1153 redirect_edge_and_branch_force (entry_e, new_preheader);
1154 flush_pending_stmts (entry_e);
1155 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1157 redirect_edge_and_branch_force (new_exit, preheader);
1158 flush_pending_stmts (new_exit);
1159 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1161 /* And remove the non-necessary forwarder again. Keep the other
1162 one so we have a proper pre-header for the loop at the exit edge. */
1163 redirect_edge_pred (single_succ_edge (new_preheader),
1164 single_pred (new_preheader));
1165 delete_basic_block (new_preheader);
1166 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1167 loop_preheader_edge (new_loop)->src);
1170 if (scalar_loop != loop)
1172 /* Update new_loop->header PHIs, so that on the preheader
1173 edge they are the ones from loop rather than scalar_loop. */
1174 gphi_iterator gsi_orig, gsi_new;
1175 edge orig_e = loop_preheader_edge (loop);
1176 edge new_e = loop_preheader_edge (new_loop);
1178 for (gsi_orig = gsi_start_phis (loop->header),
1179 gsi_new = gsi_start_phis (new_loop->header);
1180 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1181 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1183 gphi *orig_phi = gsi_orig.phi ();
1184 gphi *new_phi = gsi_new.phi ();
1185 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1186 location_t orig_locus
1187 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1189 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1193 free (new_bbs);
1194 free (bbs);
1196 checking_verify_dominators (CDI_DOMINATORS);
1198 return new_loop;
1202 /* Given the condition expression COND, put it as the last statement of
1203 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1204 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1205 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1206 new edge as irreducible if IRREDUCIBLE_P is true. */
1208 static edge
1209 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1210 basic_block guard_to, basic_block dom_bb,
1211 profile_probability probability, bool irreducible_p)
1213 gimple_stmt_iterator gsi;
1214 edge new_e, enter_e;
1215 gcond *cond_stmt;
1216 gimple_seq gimplify_stmt_list = NULL;
1218 enter_e = EDGE_SUCC (guard_bb, 0);
1219 enter_e->flags &= ~EDGE_FALLTHRU;
1220 enter_e->flags |= EDGE_FALSE_VALUE;
1221 gsi = gsi_last_bb (guard_bb);
1223 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1224 NULL_TREE);
1225 if (gimplify_stmt_list)
1226 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1228 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1229 gsi = gsi_last_bb (guard_bb);
1230 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1232 /* Add new edge to connect guard block to the merge/loop-exit block. */
1233 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1235 new_e->probability = probability;
1236 if (irreducible_p)
1237 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1239 enter_e->probability = probability.invert ();
1240 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1242 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1243 if (enter_e->dest->loop_father->header == enter_e->dest)
1244 split_edge (enter_e);
1246 return new_e;
1250 /* This function verifies that the following restrictions apply to LOOP:
 1251 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
 1252 for an innermost loop, or of 5 basic blocks for an outer loop.
1253 (2) it is single entry, single exit
1254 (3) its exit condition is the last stmt in the header
1255 (4) E is the entry/exit edge of LOOP.
1258 bool
1259 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1261 edge exit_e = single_exit (loop);
1262 edge entry_e = loop_preheader_edge (loop);
1263 gcond *orig_cond = get_loop_exit_condition (loop);
1264 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1265 unsigned int num_bb = loop->inner? 5 : 2;
1267 /* All loops have an outer scope; the only case loop->outer is NULL is for
1268 the function itself. */
1269 if (!loop_outer (loop)
1270 || loop->num_nodes != num_bb
1271 || !empty_block_p (loop->latch)
1272 || !single_exit (loop)
1273 /* Verify that new loop exit condition can be trivially modified. */
1274 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1275 || (e != exit_e && e != entry_e))
1276 return false;
1278 return true;
1281 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1282 in the exit bb and rename all the uses after the loop. This simplifies
1283 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1284 (but normally loop closed SSA form doesn't require virtual PHIs to be
 1285 in the same form). Doing this early simplifies checking which
 1286 uses should be renamed.
1288 If we create a new phi after the loop, return the definition that
1289 applies on entry to the loop, otherwise return null. */
1291 static tree
1292 create_lcssa_for_virtual_phi (class loop *loop)
1294 gphi_iterator gsi;
1295 edge exit_e = single_exit (loop);
1297 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1298 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1300 gphi *phi = gsi.phi ();
1301 for (gsi = gsi_start_phis (exit_e->dest);
1302 !gsi_end_p (gsi); gsi_next (&gsi))
1303 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1304 break;
1305 if (gsi_end_p (gsi))
1307 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1308 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1309 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1310 imm_use_iterator imm_iter;
1311 gimple *stmt;
1312 use_operand_p use_p;
1314 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1315 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1316 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1317 gimple_phi_set_result (new_phi, new_vop);
1318 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1319 if (stmt != new_phi
1320 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1321 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1322 SET_USE (use_p, new_vop);
1324 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1326 break;
1328 return NULL_TREE;
1331 /* Function vect_get_loop_location.
1333 Extract the location of the loop in the source code.
1334 If the loop is not well formed for vectorization, an estimated
1335 location is calculated.
 1336 Return the loop location if found and an empty location if not. */
1338 dump_user_location_t
1339 find_loop_location (class loop *loop)
1341 gimple *stmt = NULL;
1342 basic_block bb;
1343 gimple_stmt_iterator si;
1345 if (!loop)
1346 return dump_user_location_t ();
1348 stmt = get_loop_exit_condition (loop);
1350 if (stmt
1351 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1352 return stmt;
1354 /* If we got here the loop is probably not "well formed",
1355 try to estimate the loop location */
1357 if (!loop->header)
1358 return dump_user_location_t ();
1360 bb = loop->header;
1362 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1364 stmt = gsi_stmt (si);
1365 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1366 return stmt;
1369 return dump_user_location_t ();
1372 /* Return true if the phi described by STMT_INFO defines an IV of the
1373 loop to be vectorized. */
1375 static bool
1376 iv_phi_p (stmt_vec_info stmt_info)
1378 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1379 if (virtual_operand_p (PHI_RESULT (phi)))
1380 return false;
1382 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1383 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1384 return false;
1386 return true;
1389 /* Function vect_can_advance_ivs_p
1391 In case the number of iterations that LOOP iterates is unknown at compile
1392 time, an epilog loop will be generated, and the loop induction variables
1393 (IVs) will be "advanced" to the value they are supposed to take just before
1394 the epilog loop. Here we check that the access function of the loop IVs
1395 and the expression that represents the loop bound are simple enough.
1396 These restrictions will be relaxed in the future. */
1398 bool
1399 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 basic_block bb = loop->header;
1403 gphi_iterator gsi;
1405 /* Analyze phi functions of the loop header. */
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1409 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1411 tree evolution_part;
1413 gphi *phi = gsi.phi ();
1414 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1417 phi_info->stmt);
1419 /* Skip virtual phi's. The data dependences that are associated with
1420 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1422 Skip reduction phis. */
1423 if (!iv_phi_p (phi_info))
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "reduc or virtual phi. skip.\n");
1428 continue;
1431 /* Analyze the evolution function. */
1433 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1434 if (evolution_part == NULL_TREE)
1436 if (dump_enabled_p ())
1437 dump_printf (MSG_MISSED_OPTIMIZATION,
1438 "No access function or evolution.\n");
1439 return false;
1442 /* FORNOW: We do not transform initial conditions of IVs
1443 which evolution functions are not invariants in the loop. */
1445 if (!expr_invariant_in_loop_p (loop, evolution_part))
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449 "evolution not invariant in loop.\n");
1450 return false;
1453 /* FORNOW: We do not transform initial conditions of IVs
1454 which evolution functions are a polynomial of degree >= 2. */
1456 if (tree_is_chrec (evolution_part))
1458 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1460 "evolution is chrec.\n");
1461 return false;
1465 return true;
1469 /* Function vect_update_ivs_after_vectorizer.
1471 "Advance" the induction variables of LOOP to the value they should take
1472 after the execution of LOOP. This is currently necessary because the
1473 vectorizer does not handle induction variables that are used after the
1474 loop. Such a situation occurs when the last iterations of LOOP are
1475 peeled, because:
1476 1. We introduced new uses after LOOP for IVs that were not originally used
1477 after LOOP: the IVs of LOOP are now used by an epilog loop.
1478 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1479 times, whereas the loop IVs should be bumped N times.
1481 Input:
1482 - LOOP - a loop that is going to be vectorized. The last few iterations
1483 of LOOP were peeled.
1484 - NITERS - the number of iterations that LOOP executes (before it is
1485 vectorized). i.e, the number of times the ivs should be bumped.
1486 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1487 coming out from LOOP on which there are uses of the LOOP ivs
1488 (this is the path from LOOP->exit to epilog_loop->preheader).
1490 The new definitions of the ivs are placed in LOOP->exit.
1491 The phi args associated with the edge UPDATE_E in the bb
1492 UPDATE_E->dest are updated accordingly.
1494 Assumption 1: Like the rest of the vectorizer, this function assumes
1495 a single loop exit that has a single predecessor.
1497 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1498 organized in the same order.
1500 Assumption 3: The access function of the ivs is simple enough (see
1501 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1503 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1504 coming out of LOOP on which the ivs of LOOP are used (this is the path
1505 that leads to the epilog loop; other paths skip the epilog loop). This
1506 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1507 needs to have its phis updated.
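/* A small example of the intended effect, with values assumed purely for
   illustration: for an IV with initial value INIT and loop-invariant step S,
   the value made available to the epilog is

       ni = INIT + NITERS * S

   e.g. INIT = 0, S = 4, NITERS = 20 gives 80, exactly what the scalar loop
   would have left in the IV after 20 iterations, even though the vector loop
   itself executed only about NITERS / VF times.  */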
1510 static void
1511 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1512 tree niters, edge update_e)
1514 gphi_iterator gsi, gsi1;
1515 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1516 basic_block update_bb = update_e->dest;
1517 basic_block exit_bb = single_exit (loop)->dest;
1519 /* Make sure there exists a single-predecessor exit bb: */
1520 gcc_assert (single_pred_p (exit_bb));
1521 gcc_assert (single_succ_edge (exit_bb) == update_e);
1523 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1524 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1525 gsi_next (&gsi), gsi_next (&gsi1))
1527 tree init_expr;
1528 tree step_expr, off;
1529 tree type;
1530 tree var, ni, ni_name;
1531 gimple_stmt_iterator last_gsi;
1533 gphi *phi = gsi.phi ();
1534 gphi *phi1 = gsi1.phi ();
1535 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1536 if (dump_enabled_p ())
1537 dump_printf_loc (MSG_NOTE, vect_location,
1538 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1540 /* Skip reduction and virtual phis. */
1541 if (!iv_phi_p (phi_info))
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_NOTE, vect_location,
1545 "reduc or virtual phi. skip.\n");
1546 continue;
1549 type = TREE_TYPE (gimple_phi_result (phi));
1550 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1551 step_expr = unshare_expr (step_expr);
1553 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1554 of degree >= 2 or exponential. */
1555 gcc_assert (!tree_is_chrec (step_expr));
1557 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1559 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1560 fold_convert (TREE_TYPE (step_expr), niters),
1561 step_expr);
1562 if (POINTER_TYPE_P (type))
1563 ni = fold_build_pointer_plus (init_expr, off);
1564 else
1565 ni = fold_build2 (PLUS_EXPR, type,
1566 init_expr, fold_convert (type, off));
1568 var = create_tmp_var (type, "tmp");
1570 last_gsi = gsi_last_bb (exit_bb);
1571 gimple_seq new_stmts = NULL;
1572 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1573 /* Exit_bb shouldn't be empty. */
1574 if (!gsi_end_p (last_gsi))
1575 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1576 else
1577 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1579 /* Fix phi expressions in the successor bb. */
1580 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1584 /* Return a gimple value containing the misalignment (measured in vector
1585 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1586 it is away from a perfectly aligned address. Add any new statements
1587 to SEQ. */
1589 static tree
1590 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1592 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1593 stmt_vec_info stmt_info = dr_info->stmt;
1594 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1596 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1597 unsigned HOST_WIDE_INT target_align_c;
1598 tree target_align_minus_1;
1600 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1601 size_zero_node) < 0;
1602 tree offset = (negative
1603 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1604 : size_zero_node);
1605 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1606 stmt_info, seq,
1607 offset);
1608 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1609 if (target_align.is_constant (&target_align_c))
1610 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1611 else
1613 tree vla = build_int_cst (type, target_align);
1614 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1615 fold_build2 (MINUS_EXPR, type,
1616 build_int_cst (type, 0), vla));
1617 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1618 build_int_cst (type, 1));
1621 HOST_WIDE_INT elem_size
1622 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1623 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1625 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1626 tree int_start_addr = fold_convert (type, start_addr);
1627 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1628 target_align_minus_1);
1630 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1631 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1632 elem_size_log);
1634 return misalign_in_elems;
1637 /* Function vect_gen_prolog_loop_niters
1639 Generate the number of iterations which should be peeled as prolog for the
1640 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1641 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1642 As a result, after the execution of this loop, the data reference DR will
1643 refer to an aligned location. The following computation is generated:
1645 If the misalignment of DR is known at compile time:
1646 addr_mis = int mis = DR_MISALIGNMENT (dr);
1647 Else, compute address misalignment in bytes:
1648 addr_mis = addr & (target_align - 1)
1650 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1652 (elem_size = element type size; an element is the scalar element whose type
1653 is the inner type of the vectype)
 1655 The computations will be emitted at the end of BB. We also compute and
 1656 store the upper bound (inclusive) of the result in BOUND.
1658 When the step of the data-ref in the loop is not 1 (as in interleaved data
1659 and SLP), the number of iterations of the prolog must be divided by the step
1660 (which is equal to the size of interleaved group).
1662 The above formulas assume that VF == number of elements in the vector. This
1663 may not hold when there are multiple-types in the loop.
1664 In this case, for some data-references in the loop the VF does not represent
1665 the number of elements that fit in the vector. Therefore, instead of VF we
1666 use TYPE_VECTOR_SUBPARTS. */
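/* Worked example, with numbers assumed only for illustration: for 4-byte
   elements, a 16-byte target alignment (4 elements per vector) and a start
   address that is misaligned by 8 bytes, addr_mis / elem_size = 2 and
   ((4 - 2) & 3) / 1 = 2, i.e. peeling two scalar iterations advances the
   access by 8 bytes onto a 16-byte boundary.  */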
1668 static tree
1669 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1670 basic_block bb, int *bound)
1672 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1673 tree var;
1674 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1675 gimple_seq stmts = NULL, new_stmts = NULL;
1676 tree iters, iters_name;
1677 stmt_vec_info stmt_info = dr_info->stmt;
1678 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1679 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1681 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1683 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "known peeling = %d.\n", npeel);
1689 iters = build_int_cst (niters_type, npeel);
1690 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1692 else
1694 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1695 tree type = TREE_TYPE (misalign_in_elems);
1696 HOST_WIDE_INT elem_size
1697 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1698 /* We only do prolog peeling if the target alignment is known at compile
1699 time. */
1700 poly_uint64 align_in_elems =
1701 exact_div (target_align, elem_size);
1702 tree align_in_elems_minus_1 =
1703 build_int_cst (type, align_in_elems - 1);
1704 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1706 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1707 & (align_in_elems - 1)). */
1708 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1709 size_zero_node) < 0;
1710 if (negative)
1711 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1712 align_in_elems_tree);
1713 else
1714 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1715 misalign_in_elems);
1716 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1717 iters = fold_convert (niters_type, iters);
1718 unsigned HOST_WIDE_INT align_in_elems_c;
1719 if (align_in_elems.is_constant (&align_in_elems_c))
1720 *bound = align_in_elems_c - 1;
1721 else
1722 *bound = -1;
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "niters for prolog loop: %T\n", iters);
1729 var = create_tmp_var (niters_type, "prolog_loop_niters");
1730 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1732 if (new_stmts)
1733 gimple_seq_add_seq (&stmts, new_stmts);
1734 if (stmts)
1736 gcc_assert (single_succ_p (bb));
1737 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1738 if (gsi_end_p (gsi))
1739 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1740 else
1741 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1743 return iters_name;
1747 /* Function vect_update_init_of_dr
1749 If CODE is PLUS, the vector loop starts NITERS iterations after the
1750 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1751 iterations before the scalar one (using masking to skip inactive
1752 elements). This function updates the information recorded in DR to
1753 account for the difference. Specifically, it updates the OFFSET
1754 field of DR_INFO. */
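/* For instance (hypothetical values): with NITERS = 2 and DR_STEP = 8 bytes,
   a PLUS_EXPR CODE adds 2 * 8 = 16 bytes to DR_INFO's offset, so the vector
   loop's first access starts where the peeled scalar iterations left off;
   MINUS_EXPR subtracts the same amount instead.  */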
1756 static void
1757 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1759 struct data_reference *dr = dr_info->dr;
1760 tree offset = dr_info->offset;
1761 if (!offset)
1762 offset = build_zero_cst (sizetype);
1764 niters = fold_build2 (MULT_EXPR, sizetype,
1765 fold_convert (sizetype, niters),
1766 fold_convert (sizetype, DR_STEP (dr)));
1767 offset = fold_build2 (code, sizetype,
1768 fold_convert (sizetype, offset), niters);
1769 dr_info->offset = offset;
1773 /* Function vect_update_inits_of_drs
1775 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1776 CODE and NITERS are as for vect_update_init_of_dr. */
1778 void
1779 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1780 tree_code code)
1782 unsigned int i;
1783 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1784 struct data_reference *dr;
1786 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1788 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1789 here, but since we might use these niters to update the epilogue's niters
1790 and data references we can't insert them here as this definition might not
1791 always dominate its uses. */
1792 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1793 niters = fold_convert (sizetype, niters);
1795 FOR_EACH_VEC_ELT (datarefs, i, dr)
1797 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1798 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1799 vect_update_init_of_dr (dr_info, niters, code);
1803 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1804 by masking. This involves calculating the number of iterations to
1805 be peeled and then aligning all memory references appropriately. */
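/* As an illustrative sketch of the code below (numbers chosen arbitrarily):
   with VF = 4 and a known peeling-for-alignment count of 3,
   misalign_in_elems = 4 - 3 = 1, so the data references are moved back by
   one element (MINUS_EXPR), the loop then starts at the aligned address,
   and LOOP_VINFO_MASK_SKIP_NITERS records that the first lane of the first
   iteration must be masked off.  */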
1807 void
1808 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1810 tree misalign_in_elems;
1811 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1813 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1815 /* From the information recorded in LOOP_VINFO get the number of iterations
1816 that need to be skipped via masking. */
1817 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1819 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1820 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1821 misalign_in_elems = build_int_cst (type, misalign);
1823 else
1825 gimple_seq seq1 = NULL, seq2 = NULL;
1826 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1827 misalign_in_elems = fold_convert (type, misalign_in_elems);
1828 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1829 &seq2, true, NULL_TREE);
1830 gimple_seq_add_seq (&seq1, seq2);
1831 if (seq1)
1833 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1834 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1835 gcc_assert (!new_bb);
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "misalignment for fully-masked loop: %T\n",
1842 misalign_in_elems);
1844 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1846 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1849 /* This function builds ni_name = number of iterations. Statements
1850 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1851 it to TRUE if new ssa_var is generated. */
1853 tree
1854 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1856 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1857 if (TREE_CODE (ni) == INTEGER_CST)
1858 return ni;
1859 else
1861 tree ni_name, var;
1862 gimple_seq stmts = NULL;
1863 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1865 var = create_tmp_var (TREE_TYPE (ni), "niters");
1866 ni_name = force_gimple_operand (ni, &stmts, false, var);
1867 if (stmts)
1869 gsi_insert_seq_on_edge_immediate (pe, stmts);
1870 if (new_var_p != NULL)
1871 *new_var_p = true;
1874 return ni_name;
1878 /* Calculate the number of iterations above which the vectorized loop will be
1879 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1880 of the prolog loop. If it's an integer constant, its value is also passed
1881 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1882 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1883 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1884 threshold below which the scalar (rather than vectorized) loop will be
1885 executed. This function stores the upper bound (inclusive) of the result
1886 in BOUND_SCALAR. */
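/* A hypothetical example of the profitability clamp below: with TH = 8
   (so at most 8 - 1 = 7 scalar iterations are worthwhile), a constant prolog
   of 3 iterations and BOUND_EPILOG = 4, the result is MAX (3 + 4, 7) = 7.
   If the prolog count is only known at runtime but BOUND_PROLOG = 3, then
   7 >= 3 + 4 still lets us return the constant 7 instead of emitting a
   runtime MAX_EXPR.  */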
1888 static tree
1889 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1890 int bound_prolog, poly_int64 bound_epilog, int th,
1891 poly_uint64 *bound_scalar,
1892 bool check_profitability)
1894 tree type = TREE_TYPE (niters_prolog);
1895 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1896 build_int_cst (type, bound_epilog));
1898 *bound_scalar = bound_prolog + bound_epilog;
1899 if (check_profitability)
1901 /* TH indicates the minimum niters of vectorized loop, while we
1902 compute the maximum niters of scalar loop. */
1903 th--;
1904 /* Peeling for constant times. */
1905 if (int_niters_prolog >= 0)
1907 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1908 return build_int_cst (type, *bound_scalar);
1910 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1911 and BOUND_EPILOG are inclusive upper bounds. */
1912 if (known_ge (th, bound_prolog + bound_epilog))
1914 *bound_scalar = th;
1915 return build_int_cst (type, th);
1917 /* Need to do runtime comparison. */
1918 else if (maybe_gt (th, bound_epilog))
1920 *bound_scalar = upper_bound (*bound_scalar, th);
1921 return fold_build2 (MAX_EXPR, type,
1922 build_int_cst (type, th), niters);
1925 return niters;
1928 /* NITERS is the number of times that the original scalar loop executes
1929 after peeling. Work out the maximum number of iterations N that can
1930 be handled by the vectorized form of the loop and then either:
1932 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1934 niters_vector = N
1936 b) set *STEP_VECTOR_PTR to one and generate:
1938 niters_vector = N / vf
1940 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1941 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1942 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
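/* For example (illustrative numbers only): with NITERS = 10 and a constant
   VF = 4 without partial vectors, case (b) applies and we emit either
   niters_vector = 10 >> 2 = 2 when NITERS cannot overflow, or the
   overflow-safe ((10 - 4) >> 2) + 1 = 2 otherwise; with a variable VF or
   partial vectors, case (a) applies and the loop instead steps by VF.  */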
1944 void
1945 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1946 tree *niters_vector_ptr, tree *step_vector_ptr,
1947 bool niters_no_overflow)
1949 tree ni_minus_gap, var;
1950 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1951 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1952 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1953 tree log_vf = NULL_TREE;
1955 /* If epilogue loop is required because of data accesses with gaps, we
1956 subtract one iteration from the total number of iterations here for
1957 correct calculation of RATIO. */
1958 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1960 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1961 build_one_cst (type));
1962 if (!is_gimple_val (ni_minus_gap))
1964 var = create_tmp_var (type, "ni_gap");
1965 gimple *stmts = NULL;
1966 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1967 true, var);
1968 gsi_insert_seq_on_edge_immediate (pe, stmts);
1971 else
1972 ni_minus_gap = niters;
1974 unsigned HOST_WIDE_INT const_vf;
1975 if (vf.is_constant (&const_vf)
1976 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1978 /* Create: niters >> log2(vf) */
1979 /* If it's known that niters == number of latch executions + 1 doesn't
1980 overflow, we can generate niters >> log2(vf); otherwise we generate
1981 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1982 will be at least one. */
1983 log_vf = build_int_cst (type, exact_log2 (const_vf));
1984 if (niters_no_overflow)
1985 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1986 else
1987 niters_vector
1988 = fold_build2 (PLUS_EXPR, type,
1989 fold_build2 (RSHIFT_EXPR, type,
1990 fold_build2 (MINUS_EXPR, type,
1991 ni_minus_gap,
1992 build_int_cst (type, vf)),
1993 log_vf),
1994 build_int_cst (type, 1));
1995 step_vector = build_one_cst (type);
1997 else
1999 niters_vector = ni_minus_gap;
2000 step_vector = build_int_cst (type, vf);
2003 if (!is_gimple_val (niters_vector))
2005 var = create_tmp_var (type, "bnd");
2006 gimple_seq stmts = NULL;
2007 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2008 gsi_insert_seq_on_edge_immediate (pe, stmts);
2009 /* The peeling algorithm guarantees that the vector loop bound is at least ONE,
2010 so we set range information to make the niters analyzer's life easier. */
2011 if (stmts != NULL && log_vf)
2012 set_range_info (niters_vector, VR_RANGE,
2013 wi::to_wide (build_int_cst (type, 1)),
2014 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
2015 TYPE_MAX_VALUE (type),
2016 log_vf)));
2018 *niters_vector_ptr = niters_vector;
2019 *step_vector_ptr = step_vector;
2021 return;
2024 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2025 loop specified by LOOP_VINFO after vectorization, compute the number
2026 of iterations before vectorization (niters_vector * vf) and store it
2027 to NITERS_VECTOR_MULT_VF_PTR. */
2029 static void
2030 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2031 tree niters_vector,
2032 tree *niters_vector_mult_vf_ptr)
2034 /* We should be using a step_vector of VF if VF is variable. */
2035 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2036 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2037 tree type = TREE_TYPE (niters_vector);
2038 tree log_vf = build_int_cst (type, exact_log2 (vf));
2039 basic_block exit_bb = single_exit (loop)->dest;
2041 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2042 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2043 niters_vector, log_vf);
2044 if (!is_gimple_val (niters_vector_mult_vf))
2046 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2047 gimple_seq stmts = NULL;
2048 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2049 &stmts, true, var);
2050 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2051 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2053 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2056 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP;
2057 this function searches for the corresponding lcssa phi node in the exit
2058 bb of LOOP. If it is found, return the phi result; otherwise return
2059 NULL. */
2061 static tree
2062 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2063 gphi *lcssa_phi)
2065 gphi_iterator gsi;
2066 edge e = single_exit (loop);
2068 gcc_assert (single_pred_p (e->dest));
2069 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2071 gphi *phi = gsi.phi ();
2072 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2073 PHI_ARG_DEF (lcssa_phi, 0), 0))
2074 return PHI_RESULT (phi);
2076 return NULL_TREE;
2079 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2080 from SECOND/FIRST and puts it at the original loop's preheader/exit
2081 edge; the two loops are arranged as below:
2083 preheader_a:
2084 first_loop:
2085 header_a:
2086 i_1 = PHI<i_0, i_2>;
2088 i_2 = i_1 + 1;
2089 if (cond_a)
2090 goto latch_a;
2091 else
2092 goto between_bb;
2093 latch_a:
2094 goto header_a;
2096 between_bb:
2097 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2099 second_loop:
2100 header_b:
2101 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2102 or with i_2 if no LCSSA phi is created
2103 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2105 i_4 = i_3 + 1;
2106 if (cond_b)
2107 goto latch_b;
2108 else
2109 goto exit_bb;
2110 latch_b:
2111 goto header_b;
2113 exit_bb:
2115 This function creates loop closed SSA for the first loop and updates the
2116 second loop's PHI nodes by replacing the argument on the incoming edge with the
2117 result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2118 is false, loop closed ssa phis will only be created for non-iv phis of
2119 the first loop.
2121 This function assumes the exit bb of the first loop is the preheader bb of the
2122 second loop, i.e., between_bb in the example code. With PHIs updated,
2123 the second loop will execute the remaining iterations of the first. */
2125 static void
2126 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2127 class loop *first, class loop *second,
2128 bool create_lcssa_for_iv_phis)
2130 gphi_iterator gsi_update, gsi_orig;
2131 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2133 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2134 edge second_preheader_e = loop_preheader_edge (second);
2135 basic_block between_bb = single_exit (first)->dest;
2137 gcc_assert (between_bb == second_preheader_e->src);
2138 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2139 /* Either the first loop or the second is the loop to be vectorized. */
2140 gcc_assert (loop == first || loop == second);
2142 for (gsi_orig = gsi_start_phis (first->header),
2143 gsi_update = gsi_start_phis (second->header);
2144 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2145 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2147 gphi *orig_phi = gsi_orig.phi ();
2148 gphi *update_phi = gsi_update.phi ();
2150 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2151 /* Generate lcssa PHI node for the first loop. */
2152 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2153 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2154 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2156 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2157 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2158 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2159 arg = new_res;
2162 /* Update PHI node in the second loop by replacing arg on the loop's
2163 incoming edge. */
2164 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2167 /* For epilogue peeling we have to make sure to copy all LC PHIs
2168 for correct vectorization of live stmts. */
2169 if (loop == first)
2171 basic_block orig_exit = single_exit (second)->dest;
2172 for (gsi_orig = gsi_start_phis (orig_exit);
2173 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2175 gphi *orig_phi = gsi_orig.phi ();
2176 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2177 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2178 continue;
2180 /* Already created in the above loop. */
2181 if (find_guard_arg (first, second, orig_phi))
2182 continue;
2184 tree new_res = copy_ssa_name (orig_arg);
2185 gphi *lcphi = create_phi_node (new_res, between_bb);
2186 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2191 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2192 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2193 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2194 appear like below:
2196 guard_bb:
2197 if (cond)
2198 goto merge_bb;
2199 else
2200 goto skip_loop;
2202 skip_loop:
2203 header_a:
2204 i_1 = PHI<i_0, i_2>;
2206 i_2 = i_1 + 1;
2207 if (cond_a)
2208 goto latch_a;
2209 else
2210 goto exit_a;
2211 latch_a:
2212 goto header_a;
2214 exit_a:
2215 i_5 = PHI<i_2>;
2217 merge_bb:
2218 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2220 update_loop:
2221 header_b:
2222 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2224 i_4 = i_3 + 1;
2225 if (cond_b)
2226 goto latch_b;
2227 else
2228 goto exit_bb;
2229 latch_b:
2230 goto header_b;
2232 exit_bb:
2234 This function creates PHI nodes at merge_bb and replaces the use of i_5
2235 in the update_loop's PHI node with the new PHI's result. */
2237 static void
2238 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2239 class loop *update_loop,
2240 edge guard_edge, edge merge_edge)
2242 location_t merge_loc, guard_loc;
2243 edge orig_e = loop_preheader_edge (skip_loop);
2244 edge update_e = loop_preheader_edge (update_loop);
2245 gphi_iterator gsi_orig, gsi_update;
2247 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2248 gsi_update = gsi_start_phis (update_loop->header));
2249 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2250 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2252 gphi *orig_phi = gsi_orig.phi ();
2253 gphi *update_phi = gsi_update.phi ();
2255 /* Generate new phi node at merge bb of the guard. */
2256 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2257 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2259 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2260 args in NEW_PHI for these edges. */
2261 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2262 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2263 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2264 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2265 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2266 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2268 /* Update phi in UPDATE_PHI. */
2269 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2273 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2274 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2275 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2276 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2277 The CFG looks like:
2279 loop:
2280 header_a:
2281 i_1 = PHI<i_0, i_2>;
2283 i_2 = i_1 + 1;
2284 if (cond_a)
2285 goto latch_a;
2286 else
2287 goto exit_a;
2288 latch_a:
2289 goto header_a;
2291 exit_a:
2293 guard_bb:
2294 if (cond)
2295 goto merge_bb;
2296 else
2297 goto epilog_loop;
2299 ;; fall_through_bb
2301 epilog_loop:
2302 header_b:
2303 i_3 = PHI<i_2, i_4>;
2305 i_4 = i_3 + 1;
2306 if (cond_b)
2307 goto latch_b;
2308 else
2309 goto merge_bb;
2310 latch_b:
2311 goto header_b;
2313 merge_bb:
2314 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2316 exit_bb:
2317 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2319 For each name used outside EPILOG (i.e. for each name that has an lcssa
2320 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2321 args corresponding to GUARD_EDGE and MERGE_EDGE. The arg for MERGE_EDGE is
2322 the arg of the original PHI in exit_bb, while the arg for GUARD_EDGE is defined
2323 by LOOP and is found in the exit bb of LOOP. The arg of the original PHI
2324 in exit_bb will also be updated. */
2326 static void
2327 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2328 edge guard_edge, edge merge_edge)
2330 gphi_iterator gsi;
2331 basic_block merge_bb = guard_edge->dest;
2333 gcc_assert (single_succ_p (merge_bb));
2334 edge e = single_succ_edge (merge_bb);
2335 basic_block exit_bb = e->dest;
2336 gcc_assert (single_pred_p (exit_bb));
2337 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2339 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2341 gphi *update_phi = gsi.phi ();
2342 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2344 tree merge_arg = NULL_TREE;
2346 /* If the old argument is a SSA_NAME use its current_def. */
2347 if (TREE_CODE (old_arg) == SSA_NAME)
2348 merge_arg = get_current_def (old_arg);
2349 /* If it's a constant or doesn't have a current_def, just use the old
2350 argument. */
2351 if (!merge_arg)
2352 merge_arg = old_arg;
2354 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2355 /* If the var is live after loop but not a reduction, we simply
2356 use the old arg. */
2357 if (!guard_arg)
2358 guard_arg = old_arg;
2360 /* Create new phi node in MERGE_BB: */
2361 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2362 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2364 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2365 the two PHI args in merge_phi for these edges. */
2366 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2367 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2369 /* Update the original phi in exit_bb. */
2370 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2374 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2375 the args of its loop closed ssa PHIs need to be updated. */
2377 static void
2378 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2380 gphi_iterator gsi;
2381 basic_block exit_bb = single_exit (epilog)->dest;
2383 gcc_assert (single_pred_p (exit_bb));
2384 edge e = EDGE_PRED (exit_bb, 0);
2385 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2386 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2389 /* Function vect_do_peeling.
2391 Input:
2392 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2394 preheader:
2395 LOOP:
2396 header_bb:
2397 loop_body
2398 if (exit_loop_cond) goto exit_bb
2399 else goto header_bb
2400 exit_bb:
2402 - NITERS: The number of iterations of the loop.
2403 - NITERSM1: The number of iterations of the loop's latch.
2404 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2405 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2406 CHECK_PROFITABILITY is true.
2407 Output:
2408 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2409 iterate after vectorization; see vect_set_loop_condition for details.
2410 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2411 should be set to the number of scalar iterations handled by the
2412 vector loop. The SSA name is only used on exit from the loop.
2414 This function peels prolog and epilog from the loop and adds guards that skip
2415 PROLOG and EPILOG under various conditions. As a result, the changed CFG
2416 would look like:
2418 guard_bb_1:
2419 if (prefer_scalar_loop) goto merge_bb_1
2420 else goto guard_bb_2
2422 guard_bb_2:
2423 if (skip_prolog) goto merge_bb_2
2424 else goto prolog_preheader
2426 prolog_preheader:
2427 PROLOG:
2428 prolog_header_bb:
2429 prolog_body
2430 if (exit_prolog_cond) goto prolog_exit_bb
2431 else goto prolog_header_bb
2432 prolog_exit_bb:
2434 merge_bb_2:
2436 vector_preheader:
2437 VECTOR LOOP:
2438 vector_header_bb:
2439 vector_body
2440 if (exit_vector_cond) goto vector_exit_bb
2441 else goto vector_header_bb
2442 vector_exit_bb:
2444 guard_bb_3:
2445 if (skip_epilog) goto merge_bb_3
2446 else goto epilog_preheader
2448 merge_bb_1:
2450 epilog_preheader:
2451 EPILOG:
2452 epilog_header_bb:
2453 epilog_body
2454 if (exit_epilog_cond) goto merge_bb_3
2455 else goto epilog_header_bb
2457 merge_bb_3:
2459 Note this function peels the prolog and epilog, and adds the guards,
2460 only when necessary.
2461 This function returns the epilogue loop if a decision was made to vectorize
2462 it, otherwise NULL.
2464 The analysis resulting in this epilogue loop's loop_vec_info was performed
2465 in the same vect_analyze_loop call as the main loop's. At that time
2466 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2467 vectorization factors than the main loop. This list is stored in the main
2468 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2469 vectorize the epilogue loop for a lower vectorization factor, the
2470 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2471 updated and linked to the epilogue loop. This is later used to vectorize
2472 the epilogue. The reason the loop_vec_info needs updating is that it was
2473 constructed based on the original main loop, and the epilogue loop is a
2474 copy of this loop, so all links pointing to statements in the original loop
2475 need updating. Furthermore, these loop_vec_infos share the
2476 data_reference's records, which will also need to be updated.
2478 TODO: Guard for prefer_scalar_loop should be emitted along with
2479 versioning conditions if loop versioning is needed. */
2482 class loop *
2483 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2484 tree *niters_vector, tree *step_vector,
2485 tree *niters_vector_mult_vf_var, int th,
2486 bool check_profitability, bool niters_no_overflow,
2487 tree *advance)
2489 edge e, guard_e;
2490 tree type = TREE_TYPE (niters), guard_cond;
2491 basic_block guard_bb, guard_to;
2492 profile_probability prob_prolog, prob_vector, prob_epilog;
2493 int estimated_vf;
2494 int prolog_peeling = 0;
2495 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2496 /* We currently do not support prolog peeling if the target alignment is not
2497 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2498 target alignment being constant. */
2499 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2500 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2501 return NULL;
2503 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2504 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2506 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2507 poly_uint64 bound_epilog = 0;
2508 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2509 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2510 bound_epilog += vf - 1;
2511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2512 bound_epilog += 1;
2513 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2514 poly_uint64 bound_scalar = bound_epilog;
2516 if (!prolog_peeling && !epilog_peeling)
2517 return NULL;
2519 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2520 estimated_vf = vect_vf_for_cost (loop_vinfo);
2521 if (estimated_vf == 2)
2522 estimated_vf = 3;
2523 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2524 .apply_scale (estimated_vf - 1, estimated_vf);
2526 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2527 class loop *first_loop = loop;
2528 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2530 /* We might have a queued need to update virtual SSA form. As we
2531 delete the update SSA machinery below after doing a regular
2532 incremental SSA update during loop copying, make sure we don't
2533 lose that fact.
2534 ??? Needing to update virtual SSA form by renaming is unfortunate
2535 but not all of the vectorizer code inserting new loads / stores
2536 properly assigns virtual operands to those statements. */
2537 update_ssa (TODO_update_ssa_only_virtuals);
2539 create_lcssa_for_virtual_phi (loop);
2541 /* If we're vectorizing an epilogue loop, the update_ssa above will
2542 have ensured that the virtual operand is in SSA form throughout the
2543 vectorized main loop. Normally it is possible to trace the updated
2544 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2545 back to scalar-stmt vuses, meaning that the effect of the SSA update
2546 remains local to the main loop. However, there are rare cases in
2547 which the vectorized loop has vdefs even when the original scalar
2548 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2549 introduces clobbers of the temporary vector array, which in turn
2550 needs new vdefs. If the scalar loop doesn't write to memory, these
2551 new vdefs will be the only ones in the vector loop.
2553 In that case, update_ssa will have added a new virtual phi to the
2554 main loop, which previously didn't need one. Ensure that we (locally)
2555 maintain LCSSA form for the virtual operand, just as we would have
2556 done if the virtual phi had existed from the outset. This makes it
2557 easier to duplicate the scalar epilogue loop below. */
2558 tree vop_to_rename = NULL_TREE;
2559 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2561 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2562 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2565 if (MAY_HAVE_DEBUG_BIND_STMTS)
2567 gcc_assert (!adjust_vec.exists ());
2568 adjust_vec.create (32);
2570 initialize_original_copy_tables ();
2572 /* Record the anchor bb at which the guard should be placed if the scalar
2573 loop might be preferred. */
2574 basic_block anchor = loop_preheader_edge (loop)->src;
2576 /* Generate the number of iterations for the prolog loop. We do this here
2577 so that we can also get the upper bound on the number of iterations. */
2578 tree niters_prolog;
2579 int bound_prolog = 0;
2580 if (prolog_peeling)
2581 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2582 &bound_prolog);
2583 else
2584 niters_prolog = build_int_cst (type, 0);
2586 loop_vec_info epilogue_vinfo = NULL;
2587 if (vect_epilogues)
2589 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2590 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2593 tree niters_vector_mult_vf = NULL_TREE;
2594 /* Save NITERS before the loop, as it may be changed by the prologue. */
2595 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2596 edge update_e = NULL, skip_e = NULL;
2597 unsigned int lowest_vf = constant_lower_bound (vf);
2598 /* If we know the number of scalar iterations for the main loop we should
2599 check whether after the main loop there are enough iterations left over
2600 for the epilogue. */
2601 if (vect_epilogues
2602 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2603 && prolog_peeling >= 0
2604 && known_eq (vf, lowest_vf)
2605 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (epilogue_vinfo))
2607 unsigned HOST_WIDE_INT eiters
2608 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2609 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2611 eiters -= prolog_peeling;
2612 eiters
2613 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2615 unsigned int ratio;
2616 unsigned int epilogue_gaps
2617 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2618 while (!(constant_multiple_p
2619 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2620 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2621 && eiters >= lowest_vf / ratio + epilogue_gaps))
2623 delete epilogue_vinfo;
2624 epilogue_vinfo = NULL;
2625 if (loop_vinfo->epilogue_vinfos.length () == 0)
2627 vect_epilogues = false;
2628 break;
2630 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2631 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2632 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2635 /* Prolog loop may be skipped. */
2636 bool skip_prolog = (prolog_peeling != 0);
2637 /* Skip this loop to epilog when there are not enough iterations to enter this
2638 vectorized loop. If true we should perform runtime checks on the NITERS
2639 to check whether we should skip the current vectorized loop. If we know
2640 the number of scalar iterations, we only add such a runtime check when that
2641 number may be smaller than the number of iterations required to enter this
2642 loop; for this we use the upper bounds on the prolog and epilog peeling.
2643 When we don't know the number of iterations and don't require versioning it
2644 is because we have asserted that there are enough scalar iterations to enter
2645 the main loop, so this skip is not necessary. When we are versioning then
2646 we only add such a skip if we have chosen to vectorize the epilogue. */
2649 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2650 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2651 bound_prolog + bound_epilog)
2652 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2653 || vect_epilogues));
2654 /* Epilog loop must be executed if the number of iterations for epilog
2655 loop is known at compile time, otherwise we need to add a check at
2656 the end of vector loop and skip to the end of epilog loop. */
2657 bool skip_epilog = (prolog_peeling < 0
2658 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2659 || !vf.is_constant ());
2660 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2661 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2662 skip_epilog = false;
2664 if (skip_vector)
2666 split_edge (loop_preheader_edge (loop));
2668 /* Due to the order in which we peel prolog and epilog, we first
2669 propagate probability to the whole loop. The purpose is to
2670 avoid adjusting probabilities of both prolog and vector loops
2671 separately. Note in this case, the probability of epilog loop
2672 needs to be scaled back later. */
2673 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2674 if (prob_vector.initialized_p ())
2676 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2677 scale_loop_profile (loop, prob_vector, 0);
2681 dump_user_location_t loop_loc = find_loop_location (loop);
2682 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2683 if (vect_epilogues)
2684 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2685 use the original scalar loop as remaining epilogue if necessary. */
2686 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2687 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2689 if (prolog_peeling)
2691 e = loop_preheader_edge (loop);
2692 if (!slpeel_can_duplicate_loop_p (loop, e))
2694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2695 "loop can't be duplicated to preheader edge.\n");
2696 gcc_unreachable ();
2698 /* Peel prolog and put it on preheader edge of loop. */
2699 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2700 if (!prolog)
2702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2703 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2704 gcc_unreachable ();
2706 prolog->force_vectorize = false;
2707 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2708 first_loop = prolog;
2709 reset_original_copy_tables ();
2711 /* Update the number of iterations for prolog loop. */
2712 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2713 vect_set_loop_condition (prolog, NULL, niters_prolog,
2714 step_prolog, NULL_TREE, false);
2716 /* Skip the prolog loop. */
2717 if (skip_prolog)
2719 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2720 niters_prolog, build_int_cst (type, 0));
2721 guard_bb = loop_preheader_edge (prolog)->src;
2722 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2723 guard_to = split_edge (loop_preheader_edge (loop));
2724 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2725 guard_to, guard_bb,
2726 prob_prolog.invert (),
2727 irred_flag);
2728 e = EDGE_PRED (guard_to, 0);
2729 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2730 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2732 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2733 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2736 /* Update init address of DRs. */
2737 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2738 /* Update niters for vector loop. */
2739 LOOP_VINFO_NITERS (loop_vinfo)
2740 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2741 LOOP_VINFO_NITERSM1 (loop_vinfo)
2742 = fold_build2 (MINUS_EXPR, type,
2743 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2744 bool new_var_p = false;
2745 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2746 /* It's guaranteed that vector loop bound before vectorization is at
2747 least VF, so set range information for newly generated var. */
2748 if (new_var_p)
2749 set_range_info (niters, VR_RANGE,
2750 wi::to_wide (build_int_cst (type, vf)),
2751 wi::to_wide (TYPE_MAX_VALUE (type)));
2753 /* Prolog iterates at most bound_prolog times, latch iterates at
2754 most bound_prolog - 1 times. */
2755 record_niter_bound (prolog, bound_prolog - 1, false, true);
2756 delete_update_ssa ();
2757 adjust_vec_debug_stmts ();
2758 scev_reset ();
2761 if (epilog_peeling)
2763 e = single_exit (loop);
2764 if (!slpeel_can_duplicate_loop_p (loop, e))
2766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2767 "loop can't be duplicated to exit edge.\n");
2768 gcc_unreachable ();
2770 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2771 said epilog then we should use a copy of the main loop as a starting
2772 point. This loop may have already had some preliminary transformations
2773 to allow for more optimal vectorization, for example if-conversion.
2774 If we are not vectorizing the epilog then we should use the scalar loop
2775 as the transformations mentioned above make little or no sense when not
2776 vectorizing. */
2777 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2778 if (vop_to_rename)
2780 /* Vectorizing the main loop can sometimes introduce a vdef to
2781 a loop that previously didn't have one; see the comment above
2782 the definition of VOP_TO_RENAME for details. The definition
2783 D that holds on E will then be different from the definition
2784 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2785 rename VOP_TO_RENAME to D when copying the loop.
2787 The virtual operand is in LCSSA form for the main loop,
2788 and no stmt between the main loop and E needs a vdef,
2789 so we know that D is provided by a phi rather than by a
2790 vdef on a normal gimple stmt. */
2791 basic_block vdef_bb = e->src;
2792 gphi *vphi;
2793 while (!(vphi = get_virtual_phi (vdef_bb)))
2794 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2795 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2796 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2798 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2799 if (!epilog)
2801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2802 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2803 gcc_unreachable ();
2805 epilog->force_vectorize = false;
2806 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2808 /* The scalar version of the loop may be preferred. In this case, add a guard
2809 and skip to the epilog. Note this only happens when the number of
2810 iterations of the loop is unknown at compile time, otherwise this
2811 won't be vectorized. */
2812 if (skip_vector)
2814 /* An additional epilogue iteration is peeled if a gap exists. */
2815 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2816 bound_prolog, bound_epilog,
2817 th, &bound_scalar,
2818 check_profitability);
2819 /* Build guard against NITERSM1 since NITERS may overflow. */
2820 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2821 guard_bb = anchor;
2822 guard_to = split_edge (loop_preheader_edge (epilog));
2823 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2824 guard_to, guard_bb,
2825 prob_vector.invert (),
2826 irred_flag);
2827 skip_e = guard_e;
2828 e = EDGE_PRED (guard_to, 0);
2829 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2830 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2832 /* Simply propagate profile info from guard_bb to guard_to which is
2833 a merge point of control flow. */
2834 guard_to->count = guard_bb->count;
2836 /* Scale probability of epilog loop back.
2837 FIXME: We should avoid scaling down and back up. Profile may
2838 get lost if we scale down to 0. */
2839 basic_block *bbs = get_loop_body (epilog);
2840 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2841 bbs[i]->count = bbs[i]->count.apply_scale
2842 (bbs[i]->count,
2843 bbs[i]->count.apply_probability
2844 (prob_vector));
2845 free (bbs);
2848 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2849 /* If the loop is peeled a non-zero constant number of times, niters now refers
2850 to orig_niters - prolog_peeling; it won't overflow even if orig_niters
2851 overflows. */
2852 niters_no_overflow |= (prolog_peeling > 0);
2853 vect_gen_vector_loop_niters (loop_vinfo, niters,
2854 niters_vector, step_vector,
2855 niters_no_overflow);
2856 if (!integer_onep (*step_vector))
2858 /* On exit from the loop we will have an easy way of calculating
2859 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2860 until then. */
2861 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2862 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2863 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2865 else
2866 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2867 &niters_vector_mult_vf);
2868 /* Update IVs of original loop as if they were advanced by
2869 niters_vector_mult_vf steps. */
2870 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2871 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2872 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2873 update_e);
2875 if (skip_epilog)
2877 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2878 niters, niters_vector_mult_vf);
2879 guard_bb = single_exit (loop)->dest;
2880 guard_to = split_edge (single_exit (epilog));
2881 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2882 skip_vector ? anchor : guard_bb,
2883 prob_epilog.invert (),
2884 irred_flag);
2885 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2886 single_exit (epilog));
2887 /* Only need to handle basic block before epilog loop if it's not
2888 the guard_bb, which is the case when skip_vector is true. */
2889 if (guard_bb != bb_before_epilog)
2891 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2893 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2895 scale_loop_profile (epilog, prob_epilog, 0);
2897 else
2898 slpeel_update_phi_nodes_for_lcssa (epilog);
2900 unsigned HOST_WIDE_INT bound;
2901 if (bound_scalar.is_constant (&bound))
2903 gcc_assert (bound != 0);
2904 /* -1 to convert loop iterations to latch iterations. */
2905 record_niter_bound (epilog, bound - 1, false, true);
2908 delete_update_ssa ();
2909 adjust_vec_debug_stmts ();
2910 scev_reset ();
2913 if (vect_epilogues)
2915 epilog->aux = epilogue_vinfo;
2916 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2918 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2920 /* We now must calculate the number of iterations (NITERS) performed by the
2921 previous loop and the number (EPILOGUE_NITERS) to be performed by the epilogue. */
2922 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2923 niters_prolog, niters_vector_mult_vf);
2925 /* If skip_vector we may skip the previous loop; we insert a phi-node to
2926 determine whether we are coming from the previous vectorized loop
2927 using the update_e edge or from the skip_vector basic block using the
2928 skip_e edge. */
2929 if (skip_vector)
2931 gcc_assert (update_e != NULL && skip_e != NULL);
2932 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2933 update_e->dest);
2934 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2935 gimple *stmt = gimple_build_assign (new_ssa, niters);
2936 gimple_stmt_iterator gsi;
2937 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2938 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2940 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2941 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2943 else
2945 gsi = gsi_last_bb (update_e->src);
2946 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2949 niters = new_ssa;
2950 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2951 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2952 UNKNOWN_LOCATION);
2953 niters = PHI_RESULT (new_phi);
2956 /* Subtract the number of iterations performed by the vectorized loop
2957 from the number of total iterations. */
2958 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2959 before_loop_niters,
2960 niters);
2962 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2963 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2964 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2965 epilogue_niters,
2966 build_one_cst (TREE_TYPE (epilogue_niters)));
2968 /* Set ADVANCE to the number of iterations performed by the previous
2969 loop and its prologue. */
2970 *advance = niters;
2972 /* Redo the peeling for niter analysis as the NITERs and alignment
2973 may have been updated to take the main loop into account. */
2974 determine_peel_for_niter (epilogue_vinfo);
2977 adjust_vec.release ();
2978 free_original_copy_tables ();
2980 return vect_epilogues ? epilog : NULL;
2983 /* Function vect_create_cond_for_niters_checks.
2985 Create a conditional expression that represents the run-time checks for
2986 the loop's niters. The loop is guaranteed to terminate if the run-time
2987 checks hold.
2989 Input:
2990 COND_EXPR - input conditional expression. New conditions will be chained
2991 with logical AND operation. If it is NULL, the niters check
2992 becomes the whole condition.
2993 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niters
2994 assumptions to be checked.
2996 Output:
2997 COND_EXPR - conditional expression.
2999 The returned COND_EXPR is the conditional expression to be used in the
3000 if statement that controls which version of the loop gets executed at
3001 runtime. */
3003 static void
3004 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3006 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3008 if (*cond_expr)
3009 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3010 *cond_expr, part_cond_expr);
3011 else
3012 *cond_expr = part_cond_expr;
3015 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3016 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3018 static void
3019 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3021 if (*cond_expr)
3022 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3023 *cond_expr, part_cond_expr);
3024 else
3025 *cond_expr = part_cond_expr;
3028 /* Function vect_create_cond_for_align_checks.
3030 Create a conditional expression that represents the alignment checks for
3031 all of the data references (array element references) whose alignment must be
3032 checked at runtime.
3034 Input:
3035 COND_EXPR - input conditional expression. New conditions will be chained
3036 with logical AND operation.
3037 LOOP_VINFO - two fields of the loop information are used.
3038 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3039 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3041 Output:
3042 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3043 expression.
3044 The returned value is the conditional expression to be used in the if
3045 statement that controls which version of the loop gets executed at runtime.
3047 The algorithm makes two assumptions:
3048 1) The number of bytes "n" in a vector is a power of 2.
3049 2) An address "a" is aligned if a%n is zero and this
3050 test can be done as a&(n-1) == 0. For example, for 16
3051 byte vectors the test is a&0xf == 0. */
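/* For instance (hypothetical values): with two such data references and a
   16-byte alignment requirement, mask = 0xf and the code below builds
   cond = (((int) addr_1 | (int) addr_2) & 0xf) == 0, which is true only
   when both addresses are 16-byte aligned.  */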
3053 static void
3054 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3055 tree *cond_expr,
3056 gimple_seq *cond_expr_stmt_list)
3058 vec<stmt_vec_info> may_misalign_stmts
3059 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3060 stmt_vec_info stmt_info;
3061 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3062 tree mask_cst;
3063 unsigned int i;
3064 tree int_ptrsize_type;
3065 char tmp_name[20];
3066 tree or_tmp_name = NULL_TREE;
3067 tree and_tmp_name;
3068 gimple *and_stmt;
3069 tree ptrsize_zero;
3070 tree part_cond_expr;
3072 /* Check that mask is one less than a power of 2, i.e., mask is
3073 all zeros followed by all ones. */
3074 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3076 int_ptrsize_type = signed_type_for (ptr_type_node);
3078 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3079 of the first vector of the i'th data reference. */
3081 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3083 gimple_seq new_stmt_list = NULL;
3084 tree addr_base;
3085 tree addr_tmp_name;
3086 tree new_or_tmp_name;
3087 gimple *addr_stmt, *or_stmt;
3088 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3089 bool negative = tree_int_cst_compare
3090 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3091 tree offset = negative
3092 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3094 /* create: addr_tmp = (int)(address_of_first_vector) */
3095 addr_base =
3096 vect_create_addr_base_for_vector_ref (loop_vinfo,
3097 stmt_info, &new_stmt_list,
3098 offset);
3099 if (new_stmt_list != NULL)
3100 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3102 sprintf (tmp_name, "addr2int%d", i);
3103 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3104 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3105 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3107 /* The addresses are ORed together. */
3109 if (or_tmp_name != NULL_TREE)
3111 /* create: or_tmp = or_tmp | addr_tmp */
3112 sprintf (tmp_name, "orptrs%d", i);
3113 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3114 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3115 or_tmp_name, addr_tmp_name);
3116 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3117 or_tmp_name = new_or_tmp_name;
3119 else
3120 or_tmp_name = addr_tmp_name;
3122 } /* end for i */
3124 mask_cst = build_int_cst (int_ptrsize_type, mask);
3126 /* create: and_tmp = or_tmp & mask */
3127 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3129 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3130 or_tmp_name, mask_cst);
3131 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3133 /* Make and_tmp the left operand of the conditional test against zero.
3134 If and_tmp has a nonzero bit then some address is unaligned. */
3135 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3136 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3137 and_tmp_name, ptrsize_zero);
3138 chain_cond_expr (cond_expr, part_cond_expr);
3141 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3142 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3143 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3144 and this new condition are true. Treat a null *COND_EXPR as "true". */
3146 static void
3147 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3149 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3150 unsigned int i;
3151 vec_object_pair *pair;
3152 FOR_EACH_VEC_ELT (pairs, i, pair)
3154 tree addr1 = build_fold_addr_expr (pair->first);
3155 tree addr2 = build_fold_addr_expr (pair->second);
3156 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3157 addr1, addr2);
3158 chain_cond_expr (cond_expr, part_cond_expr);
3162 /* Create an expression that is true when all lower-bound conditions for
3163 the vectorized loop are met. Chain this condition with *COND_EXPR. */
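/* A sketch of the signed case below, with illustrative numbers: for a signed
   EXPR whose required minimum magnitude is 4, the emitted check is
   (unsigned) EXPR + 3 >= 7, which holds exactly when EXPR >= 4 or EXPR <= -4,
   i.e. when |EXPR| >= 4, using a single unsigned comparison.  */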
3165 static void
3166 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3168 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3169 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3171 tree expr = lower_bounds[i].expr;
3172 tree type = unsigned_type_for (TREE_TYPE (expr));
3173 expr = fold_convert (type, expr);
3174 poly_uint64 bound = lower_bounds[i].min_value;
3175 if (!lower_bounds[i].unsigned_p)
3177 expr = fold_build2 (PLUS_EXPR, type, expr,
3178 build_int_cstu (type, bound - 1));
3179 bound += bound - 1;
3181 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3182 build_int_cstu (type, bound));
3183 chain_cond_expr (cond_expr, part_cond_expr);
3187 /* Function vect_create_cond_for_alias_checks.
3189 Create a conditional expression that represents the run-time checks for
3190 overlapping of address ranges represented by a list of data references
3191 relations passed as input.
3193 Input:
3194 COND_EXPR - input conditional expression. New conditions will be chained
3195 with logical AND operation. If it is NULL, then the function
3196 is used to return the number of alias checks.
3197 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3198 to be checked.
3200 Output:
3201 COND_EXPR - conditional expression.
3203 The returned COND_EXPR is the conditional expression to be used in the if
3204 statement that controls which version of the loop gets executed at runtime.
3207 void
3208 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3210 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3211 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3213 if (comp_alias_ddrs.is_empty ())
3214 return;
3216 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3217 &comp_alias_ddrs, cond_expr);
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE, vect_location,
3220 "created %u versioning for alias checks.\n",
3221 comp_alias_ddrs.length ());
3225 /* Function vect_loop_versioning.
3227 If the loop has data references that may or may not be aligned and/or
3228 has data reference relations whose independence was not proven, then
3229 two versions of the loop need to be generated, one which is vectorized
3230 and one which isn't. A test is then generated to control which of the
3231 loops is executed. The test checks the alignment of all of the
3232 data references that may or may not be aligned. An additional
3233 sequence of runtime tests is generated for each pair of DDRs whose
3234 independence was not proven. The vectorized version of the loop is
3235 executed only if both the alias and alignment tests pass.
3237 The test generated to check which version of the loop is executed
3238 is modified to also check for profitability as indicated by the
3239 cost model threshold TH.
3241 The versioning precondition(s) are placed in *COND_EXPR and
3242 *COND_EXPR_STMT_LIST. */
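/* Informally, the versioning condition assembled below has the shape
   (illustrative only; which terms appear depends on LOOP_VINFO):
     cond = niters >= threshold
            && niters assumptions hold
            && alignment checks pass
            && unequal-address, lower-bound and alias checks pass
            && simd-if condition is nonzero
   and the vectorized loop is entered only when cond is true.  */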
3244 class loop *
3245 vect_loop_versioning (loop_vec_info loop_vinfo,
3246 gimple *loop_vectorized_call)
3248 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3249 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3250 basic_block condition_bb;
3251 gphi_iterator gsi;
3252 gimple_stmt_iterator cond_exp_gsi;
3253 basic_block merge_bb;
3254 basic_block new_exit_bb;
3255 edge new_exit_e, e;
3256 gphi *orig_phi, *new_phi;
3257 tree cond_expr = NULL_TREE;
3258 gimple_seq cond_expr_stmt_list = NULL;
3259 tree arg;
3260 profile_probability prob = profile_probability::likely ();
3261 gimple_seq gimplify_stmt_list = NULL;
3262 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3263 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3264 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3265 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3266 poly_uint64 versioning_threshold
3267 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3268 tree version_simd_if_cond
3269 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3270 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                             build_int_cst (TREE_TYPE (scalar_loop_iters),
                                            th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                               build_int_cst (TREE_TYPE (scalar_loop_iters),
                                              versioning_threshold - 1));
      if (cond_expr)
        cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
                                 expr, cond_expr);
      else
        cond_expr = expr;
    }
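
  /* Chain in the runtime checks for the niter assumptions that could
     not be proven during analysis.  */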
  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                        &cond_expr_stmt_list,
                                        is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }
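
  /* If the loop carries a simd if-condition (e.g. from an OpenMP "if"
     clause on a simd construct), also require it to be nonzero at
     runtime.  */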
  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      if (flag_checking)
        if (basic_block bb
            = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
          gcc_assert (bb != loop->header
                      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
                      && (scalar_loop == NULL
                          || (bb != scalar_loop->header
                              && dominated_by_p (CDI_DOMINATORS,
                                                 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
                            version_simd_if_cond, zero);
      if (cond_expr)
        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                 c, cond_expr);
      else
        cond_expr = c;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "created versioning for simd if condition check.\n");
    }
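
  /* Gimplify the combined condition; any statements needed to compute
     it are collected into COND_EXPR_STMT_LIST.  */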
  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                      &gimplify_stmt_list,
                                      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
        if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
            && flow_bb_inside_loop_p (outermost, def_bb))
          outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "trying to apply versioning to outer loop %d\n",
                         outermost->num);
      if (outermost->num == 0)
        outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
             && single_exit (loop_outer (loop_to_version))
             && (!loop_outer (loop_to_version)->inner->next
                 || vect_loop_vectorized_call (loop_to_version))
             && (!loop_outer (loop_to_version)->inner->next
                 || !loop_outer (loop_to_version)->inner->next->next))
        loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (loop_vectorized_call);
      cond = as_a <gcond *> (last_stmt (condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* If-conversion uses profile_probability::always () for both paths;
         reset the probabilities of the two edges appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale the loop's counts immediately but have to postpone
         scaling the scalar loop because we re-use it during peeling.  */
      scale_loop_frequencies (loop_to_version, te->probability);
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;

      nloop = scalar_loop;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "reusing %sloop version created by if conversion\n",
                         loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
          && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "applying loop versioning to outer loop %d\n",
                         loop_to_version->num);
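
      /* No scalar copy to reuse: duplicate LOOP_TO_VERSION with
         loop_version, guarding the two copies with COND_EXPR.  */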
      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
                            prob, prob.invert (), prob, prob.invert (), true);
      gcc_assert (nloop);
      nloop = get_loop_copy (loop);

      /* Kill off the IFN_LOOP_VECTORIZED calls in the copy, nobody will
         reap those otherwise; they also refer to the original
         loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
        {
          call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
          fold_loop_internal_call (call, boolean_false_node);
          l = loop_outer (l);
        }
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_last_bb (condition_bb);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* Loop versioning violates an assumption we try to maintain during
         vectorization - that the loop exit block has a single predecessor.
         After versioning, the exit block of both loop versions is the same
         basic block (i.e. it has two predecessors).  To simplify the
         following transformations in the vectorizer, we fix this situation
         here by adding a new (empty) block on the exit-edge of the loop,
         with the proper loop-exit phis to maintain loop-closed-form.
         If loop versioning wasn't done from loop, but from scalar_loop
         instead, merge_bb will already have just a single predecessor.  */

      merge_bb = single_exit (loop_to_version)->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
        {
          gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
          new_exit_bb = split_edge (single_exit (loop_to_version));
          new_exit_e = single_exit (loop_to_version);
          e = EDGE_SUCC (new_exit_bb, 0);

          for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
               gsi_next (&gsi))
            {
              tree new_res;
              orig_phi = gsi.phi ();
              new_res = copy_ssa_name (PHI_RESULT (orig_phi));
              new_phi = create_phi_node (new_res, new_exit_bb);
              arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
              add_phi_arg (new_phi, arg, new_exit_e,
                           gimple_phi_arg_location_from_edge (orig_phi, e));
              adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
            }
        }

      update_ssa (TODO_update_ssa);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite, so we need to clear the
         existing niter information which was copied from the original
         loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for the niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }
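
  /* Emit user-facing opt-info notes explaining why the loop was
     versioned.  */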
  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization because of "
                         "possible aliasing\n");
      if (version_align)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization to enhance "
                         "alignment\n");
    }

  return nloop;
}