gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
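/* For example, with fixed-length 4-element vectors the permutation
   built below selects { 0, 4, 1, 5 } for OFFSET 0 and { 2, 6, 3, 7 }
   for OFFSET 1, i.e. the low-half or high-half interleave of the two
   inputs.  (Illustrative only; the builder encodes this as two
   interleaved patterns of three elements each so that it also works
   for variable-length vectors.)  */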
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
316 *indices);
319 /* Try to use permutes to define the masks in DEST_RGM using the masks
320 in SRC_RGM, given that the former has twice as many masks as the
321 latter. Return true on success, adding any new statements to SEQ. */
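/* For instance, when SRC_RGM provides one mask per vector iteration and
   DEST_RGM needs two, each destination pair can (illustratively) be
   derived from a single source mask either by unpacking:

       dest[0] = VEC_UNPACK_LO_EXPR <src>;
       dest[1] = VEC_UNPACK_HI_EXPR <src>;

   or, when the mask types match, by interleaving permutes:

       dest[0] = VEC_PERM_EXPR <src, src, low-interleave-sel>;
       dest[1] = VEC_PERM_EXPR <src, src, high-interleave-sel>;

   The real statements are built below; which form is used (and which of
   LO/HI maps to which half) depends on the target and on
   BYTES_BIG_ENDIAN.  */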
323 static bool
324 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
325 rgroup_controls *src_rgm)
327 tree src_masktype = src_rgm->type;
328 tree dest_masktype = dest_rgm->type;
329 machine_mode src_mode = TYPE_MODE (src_masktype);
330 insn_code icode1, icode2;
331 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
332 && (icode1 = optab_handler (vec_unpacku_hi_optab,
333 src_mode)) != CODE_FOR_nothing
334 && (icode2 = optab_handler (vec_unpacku_lo_optab,
335 src_mode)) != CODE_FOR_nothing)
337 /* Unpacking the source masks gives at least as many mask bits as
338 we need. We can then VIEW_CONVERT any excess bits away. */
339 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
340 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
341 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
342 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
344 tree src = src_rgm->controls[i / 2];
345 tree dest = dest_rgm->controls[i];
346 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
347 ? VEC_UNPACK_HI_EXPR
348 : VEC_UNPACK_LO_EXPR);
349 gassign *stmt;
350 if (dest_masktype == unpack_masktype)
351 stmt = gimple_build_assign (dest, code, src);
352 else
354 tree temp = make_ssa_name (unpack_masktype);
355 stmt = gimple_build_assign (temp, code, src);
356 gimple_seq_add_stmt (seq, stmt);
357 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
358 build1 (VIEW_CONVERT_EXPR,
359 dest_masktype, temp));
361 gimple_seq_add_stmt (seq, stmt);
363 return true;
365 vec_perm_indices indices[2];
366 if (dest_masktype == src_masktype
367 && interleave_supported_p (&indices[0], src_masktype, 0)
368 && interleave_supported_p (&indices[1], src_masktype, 1))
370 /* The destination requires twice as many mask bits as the source, so
371 we can use interleaving permutes to double up the number of bits. */
372 tree masks[2];
373 for (unsigned int i = 0; i < 2; ++i)
374 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
375 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
377 tree src = src_rgm->controls[i / 2];
378 tree dest = dest_rgm->controls[i];
379 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
380 src, src, masks[i & 1]);
381 gimple_seq_add_stmt (seq, stmt);
383 return true;
385 return false;
388 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
389 for all the rgroup controls in RGC and return a control that is nonzero
390 when the loop needs to iterate. Add any new preheader statements to
391 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
393 RGC belongs to loop LOOP. The loop originally iterated NITERS
394 times and has been vectorized according to LOOP_VINFO.
396 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
397 starts with NITERS_SKIP dummy iterations of the scalar loop before
398 the real work starts. The mask elements for these dummy iterations
399 must be 0, to ensure that the extra iterations do not have an effect.
401 It is known that:
403 NITERS * RGC->max_nscalars_per_iter * RGC->factor
405 does not overflow. However, MIGHT_WRAP_P says whether an induction
406 variable that starts at 0 and has step:
408 VF * RGC->max_nscalars_per_iter * RGC->factor
410 might overflow before hitting a value above:
412 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
414 This means that we cannot guarantee that such an induction variable
415 would ever hit a value that produces a set of all-false masks or zero
416 lengths for RGC.
418 Note: the cost of the code generated by this function is modeled
419 by vect_estimate_min_profitable_iters, so changes here may need
420 corresponding changes there. */
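/* As a purely illustrative example: with NITERS = 10, VF = 8 and an
   rgroup for which max_nscalars_per_iter = 2 and factor = 1, the code
   below works in "items" rather than scalar iterations, so
   nitems_total = 10 * 2 = 20, the IV steps by 8 * 2 = 16 items per
   vector iteration, and any NITERS_SKIP value is scaled by 2 as well.  */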
422 static tree
423 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
424 gimple_seq *preheader_seq,
425 gimple_seq *header_seq,
426 gimple_stmt_iterator loop_cond_gsi,
427 rgroup_controls *rgc, tree niters,
428 tree niters_skip, bool might_wrap_p)
430 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
431 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
432 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
434 tree ctrl_type = rgc->type;
435 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
436 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
437 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
438 tree length_limit = NULL_TREE;
439 /* For length, we need length_limit to ensure length in range. */
440 if (!use_masks_p)
441 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
443 /* Calculate the maximum number of item values that the rgroup
444 handles in total, the number that it handles for each iteration
445 of the vector loop, and the number that it should skip during the
446 first iteration of the vector loop. */
447 tree nitems_total = niters;
448 tree nitems_step = build_int_cst (iv_type, vf);
449 tree nitems_skip = niters_skip;
450 if (nitems_per_iter != 1)
452 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
453 these multiplications don't overflow. */
454 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
455 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
456 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
457 nitems_total, compare_factor);
458 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
459 nitems_step, iv_factor);
460 if (nitems_skip)
461 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
462 nitems_skip, compare_factor);
465 /* Create an induction variable that counts the number of items
466 processed. */
467 tree index_before_incr, index_after_incr;
468 gimple_stmt_iterator incr_gsi;
469 bool insert_after;
470 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
471 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
472 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
474 tree zero_index = build_int_cst (compare_type, 0);
475 tree test_index, test_limit, first_limit;
476 gimple_stmt_iterator *test_gsi;
477 if (might_wrap_p)
479 /* In principle the loop should stop iterating once the incremented
480 IV reaches a value greater than or equal to:
482 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
484 However, there's no guarantee that this addition doesn't overflow
485 the comparison type, or that the IV hits a value above it before
486 wrapping around. We therefore adjust the limit down by one
487 IV step:
489 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
490 -[infinite-prec] NITEMS_STEP
492 and compare the IV against this limit _before_ incrementing it.
493 Since the comparison type is unsigned, we actually want the
494 subtraction to saturate at zero:
496 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
497 -[sat] NITEMS_STEP
499 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
501 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
503 where the rightmost subtraction can be done directly in
504 COMPARE_TYPE. */
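/* For illustration: with NITEMS_TOTAL = 100, NITEMS_STEP = 16 and
   NITEMS_SKIP = 3, the statements below compute
   adjust = 16 - 3 = 13 and test_limit = MAX (100, 13) - 13 = 87,
   the MAX providing the saturation described above when
   NITEMS_TOTAL is smaller than the adjustment.  */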
505 test_index = index_before_incr;
506 tree adjust = gimple_convert (preheader_seq, compare_type,
507 nitems_step);
508 if (nitems_skip)
509 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
510 adjust, nitems_skip);
511 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
512 nitems_total, adjust);
513 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
514 test_limit, adjust);
515 test_gsi = &incr_gsi;
517 /* Get a safe limit for the first iteration. */
518 if (nitems_skip)
520 /* The first vector iteration can handle at most NITEMS_STEP
521 items. NITEMS_STEP <= CONST_LIMIT, and adding
522 NITEMS_SKIP to that cannot overflow. */
523 tree const_limit = build_int_cst (compare_type,
524 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
525 * nitems_per_iter);
526 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
527 nitems_total, const_limit);
528 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
529 first_limit, nitems_skip);
531 else
532 /* For the first iteration it doesn't matter whether the IV hits
533 a value above NITEMS_TOTAL. That only matters for the latch
534 condition. */
535 first_limit = nitems_total;
537 else
539 /* Test the incremented IV, which will always hit a value above
540 the bound before wrapping. */
541 test_index = index_after_incr;
542 test_limit = nitems_total;
543 if (nitems_skip)
544 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
545 test_limit, nitems_skip);
546 test_gsi = &loop_cond_gsi;
548 first_limit = test_limit;
551 /* Convert the IV value to the comparison type (either a no-op or
552 a demotion). */
553 gimple_seq test_seq = NULL;
554 test_index = gimple_convert (&test_seq, compare_type, test_index);
555 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
557 /* Provide a definition of each control in the group. */
558 tree next_ctrl = NULL_TREE;
559 tree ctrl;
560 unsigned int i;
561 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
563 /* Previous controls will cover BIAS items. This control covers the
564 next batch. */
565 poly_uint64 bias = nitems_per_ctrl * i;
566 tree bias_tree = build_int_cst (compare_type, bias);
568 /* See whether the first iteration of the vector loop is known
569 to have a full control. */
570 poly_uint64 const_limit;
571 bool first_iteration_full
572 = (poly_int_tree_p (first_limit, &const_limit)
573 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
575 /* Rather than have a new IV that starts at BIAS and goes up to
576 TEST_LIMIT, prefer to use the same 0-based IV for each control
577 and adjust the bound down by BIAS. */
578 tree this_test_limit = test_limit;
579 if (i != 0)
581 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
582 compare_type, this_test_limit,
583 bias_tree);
584 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
585 compare_type, this_test_limit,
586 bias_tree);
589 /* Create the initial control. First include all items that
590 are within the loop limit. */
591 tree init_ctrl = NULL_TREE;
592 if (!first_iteration_full)
594 tree start, end;
595 if (first_limit == test_limit)
597 /* Use a natural test between zero (the initial IV value)
598 and the loop limit. The "else" block would be valid too,
599 but this choice can avoid the need to load BIAS_TREE into
600 a register. */
601 start = zero_index;
602 end = this_test_limit;
604 else
606 /* FIRST_LIMIT is the maximum number of items handled by the
607 first iteration of the vector loop. Test the portion
608 associated with this control. */
609 start = bias_tree;
610 end = first_limit;
613 if (use_masks_p)
614 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
615 start, end, "max_mask");
616 else
618 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
619 gimple_seq seq = vect_gen_len (init_ctrl, start,
620 end, length_limit);
621 gimple_seq_add_seq (preheader_seq, seq);
625 /* Now AND out the bits that are within the number of skipped
626 items. */
627 poly_uint64 const_skip;
628 if (nitems_skip
629 && !(poly_int_tree_p (nitems_skip, &const_skip)
630 && known_le (const_skip, bias)))
632 gcc_assert (use_masks_p);
633 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
634 bias_tree, nitems_skip);
635 if (init_ctrl)
636 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
637 init_ctrl, unskipped_mask);
638 else
639 init_ctrl = unskipped_mask;
642 if (!init_ctrl)
644 /* First iteration is full. */
645 if (use_masks_p)
646 init_ctrl = build_minus_one_cst (ctrl_type);
647 else
648 init_ctrl = length_limit;
651 /* Get the control value for the next iteration of the loop. */
652 if (use_masks_p)
654 gimple_seq stmts = NULL;
655 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
656 this_test_limit, "next_mask");
657 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
659 else
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
670 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
671 if (partial_load_bias != 0)
673 tree adjusted_len = rgc->bias_adjusted_ctrl;
674 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
675 rgc->controls[0],
676 build_int_cst
677 (TREE_TYPE (rgc->controls[0]),
678 partial_load_bias));
679 gimple_seq_add_stmt (header_seq, minus);
682 return next_ctrl;
685 /* Set up the iteration condition and rgroup controls for LOOP, given
686 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
687 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
688 the number of iterations of the original scalar loop that should be
689 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
690 for vect_set_loop_condition.
692 Insert the branch-back condition before LOOP_COND_GSI and return the
693 final gcond. */
695 static gcond *
696 vect_set_loop_condition_partial_vectors (class loop *loop,
697 loop_vec_info loop_vinfo, tree niters,
698 tree final_iv, bool niters_maybe_zero,
699 gimple_stmt_iterator loop_cond_gsi)
701 gimple_seq preheader_seq = NULL;
702 gimple_seq header_seq = NULL;
704 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
705 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
706 unsigned int compare_precision = TYPE_PRECISION (compare_type);
707 tree orig_niters = niters;
709 /* Type of the initial value of NITERS. */
710 tree ni_actual_type = TREE_TYPE (niters);
711 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
712 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
714 /* Convert NITERS to the same size as the compare. */
715 if (compare_precision > ni_actual_precision
716 && niters_maybe_zero)
718 /* We know that there is always at least one iteration, so if the
719 count is zero then it must have wrapped. Cope with this by
720 subtracting 1 before the conversion and adding 1 to the result. */
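/* For example, if NITERS is a 32-bit unsigned value that wrapped to 0
   (i.e. 1 << 32 iterations) and COMPARE_TYPE is 64 bits wide, this
   computes ((uint64_t) (0 - 1)) + 1 = 0x100000000, the true count.  */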
721 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
722 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
723 niters, build_minus_one_cst (ni_actual_type));
724 niters = gimple_convert (&preheader_seq, compare_type, niters);
725 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
726 niters, build_one_cst (compare_type));
728 else
729 niters = gimple_convert (&preheader_seq, compare_type, niters);
731 /* Iterate over all the rgroups and fill in their controls. We could use
732 the first control from any rgroup for the loop condition; here we
733 arbitrarily pick the last. */
734 tree test_ctrl = NULL_TREE;
735 rgroup_controls *rgc;
736 unsigned int i;
737 auto_vec<rgroup_controls> *controls = use_masks_p
738 ? &LOOP_VINFO_MASKS (loop_vinfo)
739 : &LOOP_VINFO_LENS (loop_vinfo);
740 FOR_EACH_VEC_ELT (*controls, i, rgc)
741 if (!rgc->controls.is_empty ())
743 /* First try using permutes. This adds a single vector
744 instruction to the loop for each mask, but needs no extra
745 loop invariants or IVs. */
746 unsigned int nmasks = i + 1;
747 if (use_masks_p && (nmasks & 1) == 0)
749 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
750 if (!half_rgc->controls.is_empty ()
751 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
752 continue;
755 /* See whether zero-based IV would ever generate all-false masks
756 or zero length before wrapping around. */
757 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
759 /* Set up all controls for this group. */
760 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
761 &preheader_seq,
762 &header_seq,
763 loop_cond_gsi, rgc,
764 niters, niters_skip,
765 might_wrap_p);
768 /* Emit all accumulated statements. */
769 add_preheader_seq (loop, preheader_seq);
770 add_header_seq (loop, header_seq);
772 /* Get a boolean result that tells us whether to iterate. */
773 edge exit_edge = single_exit (loop);
774 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
775 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
776 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
777 NULL_TREE, NULL_TREE);
778 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
780 /* The loop iterates (NITERS - 1) / VF + 1 times.
781 Subtract one from this to get the latch count. */
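/* E.g. NITERS = 10 and VF = 4 give (10 - 1) / 4 + 1 = 3 vector
   iterations; the truncating division below then records a latch
   count of (10 - 1) / 4 = 2.  */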
782 tree step = build_int_cst (compare_type,
783 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
784 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
785 build_minus_one_cst (compare_type));
786 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
787 niters_minus_one, step);
789 if (final_iv)
791 gassign *assign = gimple_build_assign (final_iv, orig_niters);
792 gsi_insert_on_edge_immediate (single_exit (loop), assign);
795 return cond_stmt;
798 /* Like vect_set_loop_condition, but handle the case in which the vector
799 loop handles exactly VF scalars per iteration. */
801 static gcond *
802 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
803 tree final_iv, bool niters_maybe_zero,
804 gimple_stmt_iterator loop_cond_gsi)
806 tree indx_before_incr, indx_after_incr;
807 gcond *cond_stmt;
808 gcond *orig_cond;
809 edge pe = loop_preheader_edge (loop);
810 edge exit_edge = single_exit (loop);
811 gimple_stmt_iterator incr_gsi;
812 bool insert_after;
813 enum tree_code code;
814 tree niters_type = TREE_TYPE (niters);
816 orig_cond = get_loop_exit_condition (loop);
817 gcc_assert (orig_cond);
818 loop_cond_gsi = gsi_for_stmt (orig_cond);
820 tree init, limit;
821 if (!niters_maybe_zero && integer_onep (step))
823 /* In this case we can use a simple 0-based IV (case A):
826 x = 0;
830 x += 1;
832 while (x < NITERS); */
833 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
834 init = build_zero_cst (niters_type);
835 limit = niters;
837 else
839 /* The following works for all values of NITERS except 0 (case B):
842 x = 0;
846 x += STEP;
848 while (x <= NITERS - STEP);
850 so that the loop continues to iterate if x + STEP - 1 < NITERS
851 but stops if x + STEP - 1 >= NITERS.
853 However, if NITERS is zero, x never hits a value above NITERS - STEP
854 before wrapping around. There are two obvious ways of dealing with
855 this:
857 - start at STEP - 1 and compare x before incrementing it
858 - start at -1 and compare x after incrementing it
860 The latter is simpler and is what we use. The loop in this case
861 looks like (case C):
864 x = -1;
868 x += STEP;
870 while (x < NITERS - STEP);
872 In both cases the loop limit is NITERS - STEP. */
873 gimple_seq seq = NULL;
874 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
875 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
876 if (seq)
878 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
879 gcc_assert (!new_bb);
881 if (niters_maybe_zero)
883 /* Case C. */
884 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
885 init = build_all_ones_cst (niters_type);
887 else
889 /* Case B. */
890 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
891 init = build_zero_cst (niters_type);
895 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
896 create_iv (init, step, NULL_TREE, loop,
897 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
898 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
899 true, NULL_TREE, true,
900 GSI_SAME_STMT);
901 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
902 true, GSI_SAME_STMT);
904 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
905 NULL_TREE);
907 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
909 /* Record the number of latch iterations. */
910 if (limit == niters)
911 /* Case A: the loop iterates NITERS times. Subtract one to get the
912 latch count. */
913 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
914 build_int_cst (niters_type, 1));
915 else
916 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
917 Subtract one from this to get the latch count. */
918 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
919 limit, step);
921 if (final_iv)
923 gassign *assign;
924 edge exit = single_exit (loop);
925 gcc_assert (single_pred_p (exit->dest));
926 tree phi_dest
927 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
928 /* Make sure to maintain LC SSA form here and elide the subtraction
929 if the value is zero. */
930 gphi *phi = create_phi_node (phi_dest, exit->dest);
931 add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
932 if (!integer_zerop (init))
934 assign = gimple_build_assign (final_iv, MINUS_EXPR,
935 phi_dest, init);
936 gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
937 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
941 return cond_stmt;
944 /* If we're using fully-masked loops, make LOOP iterate:
946 N == (NITERS - 1) / STEP + 1
948 times. When NITERS is zero, this is equivalent to making the loop
949 execute (1 << M) / STEP times, where M is the precision of NITERS.
950 NITERS_MAYBE_ZERO is true if this last case might occur.
952 If we're not using fully-masked loops, make LOOP iterate:
954 N == (NITERS - STEP) / STEP + 1
956 times, where NITERS is known to be outside the range [1, STEP - 1].
957 This is equivalent to making the loop execute NITERS / STEP times
958 when NITERS is nonzero and (1 << M) / STEP times otherwise.
959 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
961 If FINAL_IV is nonnull, it is an SSA name that should be set to
962 N * STEP on exit from the loop.
964 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
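/* For example, with NITERS = 10 and STEP = 4, a fully-masked loop
   iterates (10 - 1) / 4 + 1 = 3 times, whereas a loop that is not
   fully masked iterates (10 - 4) / 4 + 1 = 2 times, the remaining
   scalar iterations being handled elsewhere (typically by an
   epilogue loop).  */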
966 void
967 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
968 tree niters, tree step, tree final_iv,
969 bool niters_maybe_zero)
971 gcond *cond_stmt;
972 gcond *orig_cond = get_loop_exit_condition (loop);
973 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
975 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
976 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
977 niters, final_iv,
978 niters_maybe_zero,
979 loop_cond_gsi);
980 else
981 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
982 niters_maybe_zero,
983 loop_cond_gsi);
985 /* Remove old loop exit test. */
986 stmt_vec_info orig_cond_info;
987 if (loop_vinfo
988 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
989 loop_vinfo->remove_stmt (orig_cond_info);
990 else
991 gsi_remove (&loop_cond_gsi, true);
993 if (dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
995 cond_stmt);
998 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
999 For each pair of PHI arguments taken from edges FROM and TO (in
1000 FROM->dest and TO->dest respectively), make the current_def of the
1001 TO->dest argument match that of the corresponding FROM->dest argument. */
1003 static void
1004 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1006 gimple_stmt_iterator gsi_from, gsi_to;
1008 for (gsi_from = gsi_start_phis (from->dest),
1009 gsi_to = gsi_start_phis (to->dest);
1010 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1012 gimple *from_phi = gsi_stmt (gsi_from);
1013 gimple *to_phi = gsi_stmt (gsi_to);
1014 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1015 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1016 if (virtual_operand_p (from_arg))
1018 gsi_next (&gsi_from);
1019 continue;
1021 if (virtual_operand_p (to_arg))
1023 gsi_next (&gsi_to);
1024 continue;
1026 if (TREE_CODE (from_arg) != SSA_NAME)
1027 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1028 else if (TREE_CODE (to_arg) == SSA_NAME
1029 && from_arg != to_arg)
1031 if (get_current_def (to_arg) == NULL_TREE)
1033 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1034 TREE_TYPE (get_current_def
1035 (from_arg))));
1036 set_current_def (to_arg, get_current_def (from_arg));
1039 gsi_next (&gsi_from);
1040 gsi_next (&gsi_to);
1043 gphi *from_phi = get_virtual_phi (from->dest);
1044 gphi *to_phi = get_virtual_phi (to->dest);
1045 if (from_phi)
1046 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1047 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1051 /* Given LOOP this function generates a new copy of it and puts it
1052 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1053 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1054 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1055 entry or exit of LOOP. */
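/* (Informally: a copy placed on the exit edge is what the vectorizer
   later uses as an epilogue loop, while a copy placed on the preheader
   edge serves as a prologue loop; see vect_do_peeling.)  */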
1057 class loop *
1058 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1059 class loop *scalar_loop, edge e)
1061 class loop *new_loop;
1062 basic_block *new_bbs, *bbs, *pbbs;
1063 bool at_exit;
1064 bool was_imm_dom;
1065 basic_block exit_dest;
1066 edge exit, new_exit;
1067 bool duplicate_outer_loop = false;
1069 exit = single_exit (loop);
1070 at_exit = (e == exit);
1071 if (!at_exit && e != loop_preheader_edge (loop))
1072 return NULL;
1074 if (scalar_loop == NULL)
1075 scalar_loop = loop;
1077 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1078 pbbs = bbs + 1;
1079 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1080 /* Allow duplication of outer loops. */
1081 if (scalar_loop->inner)
1082 duplicate_outer_loop = true;
1083 /* Check whether duplication is possible. */
1084 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1086 free (bbs);
1087 return NULL;
1090 /* Generate new loop structure. */
1091 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1092 duplicate_subloops (scalar_loop, new_loop);
1094 exit_dest = exit->dest;
1095 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1096 exit_dest) == loop->header ?
1097 true : false);
1099 /* Also copy the pre-header, this avoids jumping through hoops to
1100 duplicate the loop entry PHI arguments. Create an empty
1101 pre-header unconditionally for this. */
1102 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1103 edge entry_e = single_pred_edge (preheader);
1104 bbs[0] = preheader;
1105 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1107 exit = single_exit (scalar_loop);
1108 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1109 &exit, 1, &new_exit, NULL,
1110 at_exit ? loop->latch : e->src, true);
1111 exit = single_exit (loop);
1112 basic_block new_preheader = new_bbs[0];
1114 /* Before installing PHI arguments make sure that the edges
1115 into them match that of the scalar loop we analyzed. This
1116 makes sure the SLP tree matches up between the main vectorized
1117 loop and the epilogue vectorized copies. */
1118 if (single_succ_edge (preheader)->dest_idx
1119 != single_succ_edge (new_bbs[0])->dest_idx)
1121 basic_block swap_bb = new_bbs[1];
1122 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1123 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1124 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1125 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1127 if (duplicate_outer_loop)
1129 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1130 if (loop_preheader_edge (scalar_loop)->dest_idx
1131 != loop_preheader_edge (new_inner_loop)->dest_idx)
1133 basic_block swap_bb = new_inner_loop->header;
1134 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1135 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1136 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1137 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1141 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1143 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1144 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1145 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1147 if (scalar_loop != loop)
1149 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1150 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1151 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1152 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1153 header) to have current_def set, so copy them over. */
1154 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1155 exit);
1156 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1157 0),
1158 EDGE_SUCC (loop->latch, 0));
1161 if (at_exit) /* Add the loop copy at exit. */
1163 if (scalar_loop != loop)
1165 gphi_iterator gsi;
1166 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1168 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1169 gsi_next (&gsi))
1171 gphi *phi = gsi.phi ();
1172 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1173 location_t orig_locus
1174 = gimple_phi_arg_location_from_edge (phi, e);
1176 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1179 redirect_edge_and_branch_force (e, new_preheader);
1180 flush_pending_stmts (e);
1181 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1182 if (was_imm_dom || duplicate_outer_loop)
1183 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1185 /* And remove the non-necessary forwarder again. Keep the other
1186 one so we have a proper pre-header for the loop at the exit edge. */
1187 redirect_edge_pred (single_succ_edge (preheader),
1188 single_pred (preheader));
1189 delete_basic_block (preheader);
1190 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1191 loop_preheader_edge (scalar_loop)->src);
1193 else /* Add the copy at entry. */
1195 if (scalar_loop != loop)
1197 /* Remove the non-necessary forwarder of scalar_loop again. */
1198 redirect_edge_pred (single_succ_edge (preheader),
1199 single_pred (preheader));
1200 delete_basic_block (preheader);
1201 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1202 loop_preheader_edge (scalar_loop)->src);
1203 preheader = split_edge (loop_preheader_edge (loop));
1204 entry_e = single_pred_edge (preheader);
1207 redirect_edge_and_branch_force (entry_e, new_preheader);
1208 flush_pending_stmts (entry_e);
1209 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1211 redirect_edge_and_branch_force (new_exit, preheader);
1212 flush_pending_stmts (new_exit);
1213 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1215 /* And remove the non-necessary forwarder again. Keep the other
1216 one so we have a proper pre-header for the loop at the exit edge. */
1217 redirect_edge_pred (single_succ_edge (new_preheader),
1218 single_pred (new_preheader));
1219 delete_basic_block (new_preheader);
1220 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1221 loop_preheader_edge (new_loop)->src);
1224 if (scalar_loop != loop)
1226 /* Update new_loop->header PHIs, so that on the preheader
1227 edge they are the ones from loop rather than scalar_loop. */
1228 gphi_iterator gsi_orig, gsi_new;
1229 edge orig_e = loop_preheader_edge (loop);
1230 edge new_e = loop_preheader_edge (new_loop);
1232 for (gsi_orig = gsi_start_phis (loop->header),
1233 gsi_new = gsi_start_phis (new_loop->header);
1234 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1235 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1237 gphi *orig_phi = gsi_orig.phi ();
1238 gphi *new_phi = gsi_new.phi ();
1239 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1240 location_t orig_locus
1241 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1243 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1247 free (new_bbs);
1248 free (bbs);
1250 checking_verify_dominators (CDI_DOMINATORS);
1252 return new_loop;
1256 /* Given the condition expression COND, put it as the last statement of
1257 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1258 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1259 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1260 new edge as irreducible if IRREDUCIBLE_P is true. */
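/* Schematically (illustrative only): GUARD_BB ends in
   "if (COND) goto GUARD_TO" and the new true edge is taken with
   PROBABILITY, while the existing successor edge becomes the false
   (fall-through-into-the-loop) edge with the inverted probability.  */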
1262 static edge
1263 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1264 basic_block guard_to, basic_block dom_bb,
1265 profile_probability probability, bool irreducible_p)
1267 gimple_stmt_iterator gsi;
1268 edge new_e, enter_e;
1269 gcond *cond_stmt;
1270 gimple_seq gimplify_stmt_list = NULL;
1272 enter_e = EDGE_SUCC (guard_bb, 0);
1273 enter_e->flags &= ~EDGE_FALLTHRU;
1274 enter_e->flags |= EDGE_FALSE_VALUE;
1275 gsi = gsi_last_bb (guard_bb);
1277 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1278 is_gimple_condexpr_for_cond, NULL_TREE);
1279 if (gimplify_stmt_list)
1280 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1282 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1283 gsi = gsi_last_bb (guard_bb);
1284 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1286 /* Add new edge to connect guard block to the merge/loop-exit block. */
1287 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1289 new_e->probability = probability;
1290 if (irreducible_p)
1291 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1293 enter_e->probability = probability.invert ();
1294 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1296 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1297 if (enter_e->dest->loop_father->header == enter_e->dest)
1298 split_edge (enter_e);
1300 return new_e;
1304 /* This function verifies that the following restrictions apply to LOOP:
1305 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1306 for an innermost loop, or of 5 basic blocks for an outer loop.
1307 (2) it is single entry, single exit
1308 (3) its exit condition is the last stmt in the header
1309 (4) E is the entry/exit edge of LOOP.
1310 */
1312 bool
1313 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1315 edge exit_e = single_exit (loop);
1316 edge entry_e = loop_preheader_edge (loop);
1317 gcond *orig_cond = get_loop_exit_condition (loop);
1318 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1319 unsigned int num_bb = loop->inner? 5 : 2;
1321 /* All loops have an outer scope; the only case loop->outer is NULL is for
1322 the function itself. */
1323 if (!loop_outer (loop)
1324 || loop->num_nodes != num_bb
1325 || !empty_block_p (loop->latch)
1326 || !single_exit (loop)
1327 /* Verify that new loop exit condition can be trivially modified. */
1328 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1329 || (e != exit_e && e != entry_e))
1330 return false;
1332 return true;
1335 /* Function find_loop_location.
1337 Extract the location of the loop in the source code.
1338 If the loop is not well formed for vectorization, an estimated
1339 location is calculated.
1340 Return the loop location on success and NULL otherwise. */
1342 dump_user_location_t
1343 find_loop_location (class loop *loop)
1345 gimple *stmt = NULL;
1346 basic_block bb;
1347 gimple_stmt_iterator si;
1349 if (!loop)
1350 return dump_user_location_t ();
1352 stmt = get_loop_exit_condition (loop);
1354 if (stmt
1355 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1356 return stmt;
1358 /* If we got here the loop is probably not "well formed",
1359 try to estimate the loop location */
1361 if (!loop->header)
1362 return dump_user_location_t ();
1364 bb = loop->header;
1366 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1368 stmt = gsi_stmt (si);
1369 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1370 return stmt;
1373 return dump_user_location_t ();
1376 /* Return true if the phi described by STMT_INFO defines an IV of the
1377 loop to be vectorized. */
1379 static bool
1380 iv_phi_p (stmt_vec_info stmt_info)
1382 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1383 if (virtual_operand_p (PHI_RESULT (phi)))
1384 return false;
1386 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1387 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1388 return false;
1390 return true;
1393 /* Function vect_can_advance_ivs_p
1395 In case the number of iterations that LOOP iterates is unknown at compile
1396 time, an epilog loop will be generated, and the loop induction variables
1397 (IVs) will be "advanced" to the value they are supposed to take just before
1398 the epilog loop. Here we check that the access function of the loop IVs
1399 and the expression that represents the loop bound are simple enough.
1400 These restrictions will be relaxed in the future. */
1402 bool
1403 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1405 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1406 basic_block bb = loop->header;
1407 gphi_iterator gsi;
1409 /* Analyze phi functions of the loop header. */
1411 if (dump_enabled_p ())
1412 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1413 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1415 tree evolution_part;
1417 gphi *phi = gsi.phi ();
1418 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1421 phi_info->stmt);
1423 /* Skip virtual phi's. The data dependences that are associated with
1424 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1426 Skip reduction phis. */
1427 if (!iv_phi_p (phi_info))
1429 if (dump_enabled_p ())
1430 dump_printf_loc (MSG_NOTE, vect_location,
1431 "reduc or virtual phi. skip.\n");
1432 continue;
1435 /* Analyze the evolution function. */
1437 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1438 if (evolution_part == NULL_TREE)
1440 if (dump_enabled_p ())
1441 dump_printf (MSG_MISSED_OPTIMIZATION,
1442 "No access function or evolution.\n");
1443 return false;
1446 /* FORNOW: We do not transform initial conditions of IVs
1447 which evolution functions are not invariants in the loop. */
1449 if (!expr_invariant_in_loop_p (loop, evolution_part))
1451 if (dump_enabled_p ())
1452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453 "evolution not invariant in loop.\n");
1454 return false;
1457 /* FORNOW: We do not transform initial conditions of IVs
1458 which evolution functions are a polynomial of degree >= 2. */
1460 if (tree_is_chrec (evolution_part))
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "evolution is chrec.\n");
1465 return false;
1469 return true;
1473 /* Function vect_update_ivs_after_vectorizer.
1475 "Advance" the induction variables of LOOP to the value they should take
1476 after the execution of LOOP. This is currently necessary because the
1477 vectorizer does not handle induction variables that are used after the
1478 loop. Such a situation occurs when the last iterations of LOOP are
1479 peeled, because:
1480 1. We introduced new uses after LOOP for IVs that were not originally used
1481 after LOOP: the IVs of LOOP are now used by an epilog loop.
1482 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1483 times, whereas the loop IVs should be bumped N times.
1485 Input:
1486 - LOOP - a loop that is going to be vectorized. The last few iterations
1487 of LOOP were peeled.
1488 - NITERS - the number of iterations that LOOP executes (before it is
1489 vectorized), i.e., the number of times the IVs should be bumped.
1490 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1491 coming out from LOOP on which there are uses of the LOOP ivs
1492 (this is the path from LOOP->exit to epilog_loop->preheader).
1494 The new definitions of the ivs are placed in LOOP->exit.
1495 The phi args associated with the edge UPDATE_E in the bb
1496 UPDATE_E->dest are updated accordingly.
1498 Assumption 1: Like the rest of the vectorizer, this function assumes
1499 a single loop exit that has a single predecessor.
1501 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1502 organized in the same order.
1504 Assumption 3: The access function of the ivs is simple enough (see
1505 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1507 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1508 coming out of LOOP on which the ivs of LOOP are used (this is the path
1509 that leads to the epilog loop; other paths skip the epilog loop). This
1510 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1511 needs to have its phis updated.
1512 */
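/* For a simple IV with initial value INIT and step S this emits
   NI = INIT + NITERS * S (a POINTER_PLUS_EXPR for pointer IVs) in the
   exit block and makes the matching PHI argument on UPDATE_E use it;
   e.g. INIT = 0, S = 1 and NITERS = 16 give NI = 16.  */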
1514 static void
1515 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1516 tree niters, edge update_e)
1518 gphi_iterator gsi, gsi1;
1519 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block update_bb = update_e->dest;
1521 basic_block exit_bb = single_exit (loop)->dest;
1523 /* Make sure there exists a single-predecessor exit bb: */
1524 gcc_assert (single_pred_p (exit_bb));
1525 gcc_assert (single_succ_edge (exit_bb) == update_e);
1527 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1528 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1529 gsi_next (&gsi), gsi_next (&gsi1))
1531 tree init_expr;
1532 tree step_expr, off;
1533 tree type;
1534 tree var, ni, ni_name;
1535 gimple_stmt_iterator last_gsi;
1537 gphi *phi = gsi.phi ();
1538 gphi *phi1 = gsi1.phi ();
1539 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE, vect_location,
1542 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1544 /* Skip reduction and virtual phis. */
1545 if (!iv_phi_p (phi_info))
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location,
1549 "reduc or virtual phi. skip.\n");
1550 continue;
1553 type = TREE_TYPE (gimple_phi_result (phi));
1554 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1555 step_expr = unshare_expr (step_expr);
1557 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1558 of degree >= 2 or exponential. */
1559 gcc_assert (!tree_is_chrec (step_expr));
1561 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1563 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1564 fold_convert (TREE_TYPE (step_expr), niters),
1565 step_expr);
1566 if (POINTER_TYPE_P (type))
1567 ni = fold_build_pointer_plus (init_expr, off);
1568 else
1569 ni = fold_build2 (PLUS_EXPR, type,
1570 init_expr, fold_convert (type, off));
1572 var = create_tmp_var (type, "tmp");
1574 last_gsi = gsi_last_bb (exit_bb);
1575 gimple_seq new_stmts = NULL;
1576 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1577 /* Exit_bb shouldn't be empty. */
1578 if (!gsi_end_p (last_gsi))
1579 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1580 else
1581 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1583 /* Fix phi expressions in the successor bb. */
1584 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1588 /* Return a gimple value containing the misalignment (measured in vector
1589 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1590 it is away from a perfectly aligned address. Add any new statements
1591 to SEQ. */
1593 static tree
1594 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1596 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1597 stmt_vec_info stmt_info = dr_info->stmt;
1598 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1600 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1601 unsigned HOST_WIDE_INT target_align_c;
1602 tree target_align_minus_1;
1604 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1605 size_zero_node) < 0;
1606 tree offset = (negative
1607 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1608 * TREE_INT_CST_LOW
1609 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1610 : size_zero_node);
1611 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1612 stmt_info, seq,
1613 offset);
1614 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1615 if (target_align.is_constant (&target_align_c))
1616 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1617 else
1619 tree vla = build_int_cst (type, target_align);
1620 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1621 fold_build2 (MINUS_EXPR, type,
1622 build_int_cst (type, 0), vla));
1623 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1624 build_int_cst (type, 1));
1627 HOST_WIDE_INT elem_size
1628 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1629 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1631 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1632 tree int_start_addr = fold_convert (type, start_addr);
1633 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1634 target_align_minus_1);
1636 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1637 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1638 elem_size_log);
1640 return misalign_in_elems;
1643 /* Function vect_gen_prolog_loop_niters
1645 Generate the number of iterations which should be peeled as prolog for the
1646 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1647 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1648 As a result, after the execution of this loop, the data reference DR will
1649 refer to an aligned location. The following computation is generated:
1651 If the misalignment of DR is known at compile time:
1652 addr_mis = int mis = DR_MISALIGNMENT (dr);
1653 Else, compute address misalignment in bytes:
1654 addr_mis = addr & (target_align - 1)
1656 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1658 (elem_size = element type size; an element is the scalar element whose type
1659 is the inner type of the vectype)
1661 The computations will be emitted at the end of BB. We also compute and
1662 store upper bound (included) of the result in BOUND.
1664 When the step of the data-ref in the loop is not 1 (as in interleaved data
1665 and SLP), the number of iterations of the prolog must be divided by the step
1666 (which is equal to the size of interleaved group).
1668 The above formulas assume that VF == number of elements in the vector. This
1669 may not hold when there are multiple-types in the loop.
1670 In this case, for some data-references in the loop the VF does not represent
1671 the number of elements that fit in the vector. Therefore, instead of VF we
1672 use TYPE_VECTOR_SUBPARTS. */
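/* Worked example (illustrative): with a 16-byte target alignment,
   4-byte elements and a start address whose misalignment is 8 bytes,
   misalign_in_elems = 8 / 4 = 2, align_in_elems = 16 / 4 = 4 and the
   prolog peels (4 - 2) & 3 = 2 scalar iterations (assuming step 1).  */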
1674 static tree
1675 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1676 basic_block bb, int *bound)
1678 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1679 tree var;
1680 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1681 gimple_seq stmts = NULL, new_stmts = NULL;
1682 tree iters, iters_name;
1683 stmt_vec_info stmt_info = dr_info->stmt;
1684 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1685 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1687 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1689 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_NOTE, vect_location,
1693 "known peeling = %d.\n", npeel);
1695 iters = build_int_cst (niters_type, npeel);
1696 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1698 else
1700 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1701 tree type = TREE_TYPE (misalign_in_elems);
1702 HOST_WIDE_INT elem_size
1703 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1704 /* We only do prolog peeling if the target alignment is known at compile
1705 time. */
1706 poly_uint64 align_in_elems =
1707 exact_div (target_align, elem_size);
1708 tree align_in_elems_minus_1 =
1709 build_int_cst (type, align_in_elems - 1);
1710 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1712 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1713 & (align_in_elems - 1)). */
1714 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1715 size_zero_node) < 0;
1716 if (negative)
1717 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1718 align_in_elems_tree);
1719 else
1720 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1721 misalign_in_elems);
1722 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1723 iters = fold_convert (niters_type, iters);
1724 unsigned HOST_WIDE_INT align_in_elems_c;
1725 if (align_in_elems.is_constant (&align_in_elems_c))
1726 *bound = align_in_elems_c - 1;
1727 else
1728 *bound = -1;
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "niters for prolog loop: %T\n", iters);
1735 var = create_tmp_var (niters_type, "prolog_loop_niters");
1736 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1738 if (new_stmts)
1739 gimple_seq_add_seq (&stmts, new_stmts);
1740 if (stmts)
1742 gcc_assert (single_succ_p (bb));
1743 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1744 if (gsi_end_p (gsi))
1745 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1746 else
1747 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1749 return iters_name;
1753 /* Function vect_update_init_of_dr
1755 If CODE is PLUS, the vector loop starts NITERS iterations after the
1756 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1757 iterations before the scalar one (using masking to skip inactive
1758 elements). This function updates the information recorded in DR to
1759 account for the difference. Specifically, it updates the OFFSET
1760 field of DR_INFO. */
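/* For illustration (hypothetical values): if the prolog peels NITERS = 3
   iterations and DR_STEP (dr) is 8 bytes, then with CODE == PLUS_EXPR the
   recorded offset grows by 3 * 8 = 24 bytes, so the data reference now
   starts three scalar iterations later than it originally did.  */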
1762 static void
1763 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1765 struct data_reference *dr = dr_info->dr;
1766 tree offset = dr_info->offset;
1767 if (!offset)
1768 offset = build_zero_cst (sizetype);
1770 niters = fold_build2 (MULT_EXPR, sizetype,
1771 fold_convert (sizetype, niters),
1772 fold_convert (sizetype, DR_STEP (dr)));
1773 offset = fold_build2 (code, sizetype,
1774 fold_convert (sizetype, offset), niters);
1775 dr_info->offset = offset;
1779 /* Function vect_update_inits_of_drs
1781 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1782 CODE and NITERS are as for vect_update_init_of_dr. */
1784 void
1785 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1786 tree_code code)
1788 unsigned int i;
1789 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1790 struct data_reference *dr;
1792 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1794 /* Adjust niters to sizetype. We used to insert the stmts on the loop preheader
1795 here, but since we might use these niters to update the epilogue's niters
1796 and data references we can't insert them here as this definition might not
1797 always dominate its uses. */
1798 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1799 niters = fold_convert (sizetype, niters);
1801 FOR_EACH_VEC_ELT (datarefs, i, dr)
1803 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1804 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1805 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1806 vect_update_init_of_dr (dr_info, niters, code);
1810 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1811 by masking. This involves calculating the number of iterations to
1812 be peeled and then aligning all memory references appropriately. */
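/* Illustrative example (hypothetical numbers): with VF = 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT = 5, the computation below records
   8 - 5 = 3 in LOOP_VINFO_MASK_SKIP_NITERS and moves every data reference
   back by 3 scalar steps, so the masked loop starts at an aligned address
   with its first 3 elements inactive.  */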
1814 void
1815 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1817 tree misalign_in_elems;
1818 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1820 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1822 /* From the information recorded in LOOP_VINFO get the number of iterations
1823 that need to be skipped via masking. */
1824 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1826 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1827 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1828 misalign_in_elems = build_int_cst (type, misalign);
1830 else
1832 gimple_seq seq1 = NULL, seq2 = NULL;
1833 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1834 misalign_in_elems = fold_convert (type, misalign_in_elems);
1835 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1836 &seq2, true, NULL_TREE);
1837 gimple_seq_add_seq (&seq1, seq2);
1838 if (seq1)
1840 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1841 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1842 gcc_assert (!new_bb);
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location,
1848 "misalignment for fully-masked loop: %T\n",
1849 misalign_in_elems);
1851 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1853 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1856 /* This function builds ni_name = number of iterations. Statements
1857 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1858 it to TRUE if a new ssa_var is generated. */
1860 tree
1861 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1863 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1864 if (TREE_CODE (ni) == INTEGER_CST)
1865 return ni;
1866 else
1868 tree ni_name, var;
1869 gimple_seq stmts = NULL;
1870 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1872 var = create_tmp_var (TREE_TYPE (ni), "niters");
1873 ni_name = force_gimple_operand (ni, &stmts, false, var);
1874 if (stmts)
1876 gsi_insert_seq_on_edge_immediate (pe, stmts);
1877 if (new_var_p != NULL)
1878 *new_var_p = true;
1881 return ni_name;
1885 /* Calculate the number of iterations above which the vectorized loop will be
1886 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1887 of prolog loop. If it's integer const, the integer number is also passed
1888 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1889 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1890 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1891 threshold below which the scalar (rather than vectorized) loop will be
1892 executed. This function stores the upper bound (inclusive) of the result
1893 in BOUND_SCALAR. */
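/* Worked example (hypothetical values): for a constant prolog count
   INT_NITERS_PROLOG = 2, BOUND_EPILOG = 3, CHECK_PROFITABILITY true and
   TH = 7, the threshold is first decremented to 6 (maximum scalar niters
   rather than minimum vector niters); since the prolog count is constant,
   the result and BOUND_SCALAR are both max (2 + 3, 6) = 6.  */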
1895 static tree
1896 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1897 int bound_prolog, poly_int64 bound_epilog, int th,
1898 poly_uint64 *bound_scalar,
1899 bool check_profitability)
1901 tree type = TREE_TYPE (niters_prolog);
1902 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1903 build_int_cst (type, bound_epilog));
1905 *bound_scalar = bound_prolog + bound_epilog;
1906 if (check_profitability)
1908 /* TH indicates the minimum niters of vectorized loop, while we
1909 compute the maximum niters of scalar loop. */
1910 th--;
1911 /* Peeling for constant times. */
1912 if (int_niters_prolog >= 0)
1914 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1915 return build_int_cst (type, *bound_scalar);
1917 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1918 and BOUND_EPILOG are inclusive upper bounds. */
1919 if (known_ge (th, bound_prolog + bound_epilog))
1921 *bound_scalar = th;
1922 return build_int_cst (type, th);
1924 /* Need to do runtime comparison. */
1925 else if (maybe_gt (th, bound_epilog))
1927 *bound_scalar = upper_bound (*bound_scalar, th);
1928 return fold_build2 (MAX_EXPR, type,
1929 build_int_cst (type, th), niters);
1932 return niters;
1935 /* NITERS is the number of times that the original scalar loop executes
1936 after peeling. Work out the maximum number of iterations N that can
1937 be handled by the vectorized form of the loop and then either:
1939 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1941 niters_vector = N
1943 b) set *STEP_VECTOR_PTR to one and generate:
1945 niters_vector = N / vf
1947 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1948 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1949 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
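/* Worked example (hypothetical values): for NITERS = 17, a constant VF = 4,
   no partial vectors and no peeling for gaps, case b) applies:
   niters_vector = 17 >> 2 = 4 with *STEP_VECTOR_PTR = 1, so the vector loop
   covers 16 scalar iterations and one is left for the epilogue.  If NITERS
   might overflow, the equivalent ((17 - 4) >> 2) + 1 = 4 form is generated
   instead.  With a variable VF or with partial vectors, case a) applies:
   niters_vector = 17 and *STEP_VECTOR_PTR = VF.  */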
1951 void
1952 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1953 tree *niters_vector_ptr, tree *step_vector_ptr,
1954 bool niters_no_overflow)
1956 tree ni_minus_gap, var;
1957 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1958 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1960 tree log_vf = NULL_TREE;
1962 /* If epilogue loop is required because of data accesses with gaps, we
1963 subtract one iteration from the total number of iterations here for
1964 correct calculation of RATIO. */
1965 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1967 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1968 build_one_cst (type));
1969 if (!is_gimple_val (ni_minus_gap))
1971 var = create_tmp_var (type, "ni_gap");
1972 gimple *stmts = NULL;
1973 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1974 true, var);
1975 gsi_insert_seq_on_edge_immediate (pe, stmts);
1978 else
1979 ni_minus_gap = niters;
1981 unsigned HOST_WIDE_INT const_vf;
1982 if (vf.is_constant (&const_vf)
1983 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1985 /* Create: niters >> log2(vf) */
1986 /* If it's known that niters == number of latch executions + 1 doesn't
1987 overflow, we can generate niters >> log2(vf); otherwise we generate
1988 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1989 will be at least one. */
1990 log_vf = build_int_cst (type, exact_log2 (const_vf));
1991 if (niters_no_overflow)
1992 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1993 else
1994 niters_vector
1995 = fold_build2 (PLUS_EXPR, type,
1996 fold_build2 (RSHIFT_EXPR, type,
1997 fold_build2 (MINUS_EXPR, type,
1998 ni_minus_gap,
1999 build_int_cst (type, vf)),
2000 log_vf),
2001 build_int_cst (type, 1));
2002 step_vector = build_one_cst (type);
2004 else
2006 niters_vector = ni_minus_gap;
2007 step_vector = build_int_cst (type, vf);
2010 if (!is_gimple_val (niters_vector))
2012 var = create_tmp_var (type, "bnd");
2013 gimple_seq stmts = NULL;
2014 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2015 gsi_insert_seq_on_edge_immediate (pe, stmts);
2016 /* The peeling algorithm guarantees that the vector loop bound is at least
2017 ONE; we set range information to make the niters analyzer's life easier.
2018 Note the number of latch iterations can be TYPE_MAX_VALUE so
2019 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */
2020 if (stmts != NULL && log_vf)
2022 if (niters_no_overflow)
2024 value_range vr (type,
2025 wi::one (TYPE_PRECISION (type)),
2026 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2027 TYPE_SIGN (type)),
2028 exact_log2 (const_vf),
2029 TYPE_SIGN (type)));
2030 set_range_info (niters_vector, vr);
2032 /* For VF == 1 the vector IV might also overflow so we cannot
2033 assert a minimum value of 1. */
2034 else if (const_vf > 1)
2036 value_range vr (type,
2037 wi::one (TYPE_PRECISION (type)),
2038 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2039 TYPE_SIGN (type))
2040 - (const_vf - 1),
2041 exact_log2 (const_vf), TYPE_SIGN (type))
2042 + 1);
2043 set_range_info (niters_vector, vr);
2047 *niters_vector_ptr = niters_vector;
2048 *step_vector_ptr = step_vector;
2050 return;
2053 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2054 loop specified by LOOP_VINFO after vectorization, compute the number
2055 of iterations before vectorization (niters_vector * vf) and store it
2056 to NITERS_VECTOR_MULT_VF_PTR. */
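/* E.g. (illustrative values): for niters_vector = 4 and a constant VF = 8
   this computes 4 << log2 (8) = 32, the number of scalar iterations handled
   by the vector loop.  */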
2058 static void
2059 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2060 tree niters_vector,
2061 tree *niters_vector_mult_vf_ptr)
2063 /* We should be using a step_vector of VF if VF is variable. */
2064 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2065 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2066 tree type = TREE_TYPE (niters_vector);
2067 tree log_vf = build_int_cst (type, exact_log2 (vf));
2068 basic_block exit_bb = single_exit (loop)->dest;
2070 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2071 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2072 niters_vector, log_vf);
2073 if (!is_gimple_val (niters_vector_mult_vf))
2075 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2076 gimple_seq stmts = NULL;
2077 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2078 &stmts, true, var);
2079 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2080 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2082 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2085 /* LCSSA_PHI is an lcssa phi of the EPILOG loop which is copied from LOOP;
2086 this function searches for the corresponding lcssa phi node in the exit
2087 bb of LOOP. If it is found, return the phi result; otherwise return
2088 NULL. */
2090 static tree
2091 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2092 gphi *lcssa_phi)
2094 gphi_iterator gsi;
2095 edge e = single_exit (loop);
2097 gcc_assert (single_pred_p (e->dest));
2098 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2100 gphi *phi = gsi.phi ();
2101 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2102 PHI_ARG_DEF (lcssa_phi, 0), 0))
2103 return PHI_RESULT (phi);
2105 return NULL_TREE;
2108 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2109 from SECOND/FIRST and puts it at the original loop's preheader/exit
2110 edge; the two loops are arranged as below:
2112 preheader_a:
2113 first_loop:
2114 header_a:
2115 i_1 = PHI<i_0, i_2>;
2117 i_2 = i_1 + 1;
2118 if (cond_a)
2119 goto latch_a;
2120 else
2121 goto between_bb;
2122 latch_a:
2123 goto header_a;
2125 between_bb:
2126 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2128 second_loop:
2129 header_b:
2130 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2131 or with i_2 if no LCSSA phi is created
2132 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2134 i_4 = i_3 + 1;
2135 if (cond_b)
2136 goto latch_b;
2137 else
2138 goto exit_bb;
2139 latch_b:
2140 goto header_b;
2142 exit_bb:
2144 This function creates loop closed SSA for the first loop and updates the
2145 second loop's PHI nodes by replacing the argument on the incoming edge
2146 with the result of the newly created lcssa PHI nodes. If
2147 CREATE_LCSSA_FOR_IV_PHIS is false, loop closed ssa phis will only be
2148 created for non-iv phis for the first loop.
2150 This function assumes exit bb of the first loop is preheader bb of the
2151 second loop, i.e., between_bb in the example code. With PHIs updated,
2152 the second loop will execute the remaining iterations of the first. */
2154 static void
2155 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2156 class loop *first, class loop *second,
2157 bool create_lcssa_for_iv_phis)
2159 gphi_iterator gsi_update, gsi_orig;
2160 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2162 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2163 edge second_preheader_e = loop_preheader_edge (second);
2164 basic_block between_bb = single_exit (first)->dest;
2166 gcc_assert (between_bb == second_preheader_e->src);
2167 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2168 /* Either the first loop or the second is the loop to be vectorized. */
2169 gcc_assert (loop == first || loop == second);
2171 for (gsi_orig = gsi_start_phis (first->header),
2172 gsi_update = gsi_start_phis (second->header);
2173 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2174 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2176 gphi *orig_phi = gsi_orig.phi ();
2177 gphi *update_phi = gsi_update.phi ();
2179 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2180 /* Generate lcssa PHI node for the first loop. */
2181 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2182 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2183 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2185 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2186 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2187 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2188 arg = new_res;
2191 /* Update PHI node in the second loop by replacing arg on the loop's
2192 incoming edge. */
2193 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2196 /* For epilogue peeling we have to make sure to copy all LC PHIs
2197 for correct vectorization of live stmts. */
2198 if (loop == first)
2200 basic_block orig_exit = single_exit (second)->dest;
2201 for (gsi_orig = gsi_start_phis (orig_exit);
2202 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2204 gphi *orig_phi = gsi_orig.phi ();
2205 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2206 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2207 continue;
2209 /* Already created in the above loop. */
2210 if (find_guard_arg (first, second, orig_phi))
2211 continue;
2213 tree new_res = copy_ssa_name (orig_arg);
2214 gphi *lcphi = create_phi_node (new_res, between_bb);
2215 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2220 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2221 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2222 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2223 appear like below:
2225 guard_bb:
2226 if (cond)
2227 goto merge_bb;
2228 else
2229 goto skip_loop;
2231 skip_loop:
2232 header_a:
2233 i_1 = PHI<i_0, i_2>;
2235 i_2 = i_1 + 1;
2236 if (cond_a)
2237 goto latch_a;
2238 else
2239 goto exit_a;
2240 latch_a:
2241 goto header_a;
2243 exit_a:
2244 i_5 = PHI<i_2>;
2246 merge_bb:
2247 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2249 update_loop:
2250 header_b:
2251 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2253 i_4 = i_3 + 1;
2254 if (cond_b)
2255 goto latch_b;
2256 else
2257 goto exit_bb;
2258 latch_b:
2259 goto header_b;
2261 exit_bb:
2263 This function creates PHI nodes at merge_bb and replaces the use of i_5
2264 in the update_loop's PHI node with the result of the new PHI node. */
2266 static void
2267 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2268 class loop *update_loop,
2269 edge guard_edge, edge merge_edge)
2271 location_t merge_loc, guard_loc;
2272 edge orig_e = loop_preheader_edge (skip_loop);
2273 edge update_e = loop_preheader_edge (update_loop);
2274 gphi_iterator gsi_orig, gsi_update;
2276 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2277 gsi_update = gsi_start_phis (update_loop->header));
2278 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2279 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2281 gphi *orig_phi = gsi_orig.phi ();
2282 gphi *update_phi = gsi_update.phi ();
2284 /* Generate new phi node at merge bb of the guard. */
2285 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2286 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2288 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2289 args in NEW_PHI for these edges. */
2290 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2291 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2292 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2293 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2294 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2295 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2297 /* Update phi in UPDATE_PHI. */
2298 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2302 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2303 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2304 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2305 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2306 The CFG looks like:
2308 loop:
2309 header_a:
2310 i_1 = PHI<i_0, i_2>;
2312 i_2 = i_1 + 1;
2313 if (cond_a)
2314 goto latch_a;
2315 else
2316 goto exit_a;
2317 latch_a:
2318 goto header_a;
2320 exit_a:
2322 guard_bb:
2323 if (cond)
2324 goto merge_bb;
2325 else
2326 goto epilog_loop;
2328 ;; fall_through_bb
2330 epilog_loop:
2331 header_b:
2332 i_3 = PHI<i_2, i_4>;
2334 i_4 = i_3 + 1;
2335 if (cond_b)
2336 goto latch_b;
2337 else
2338 goto merge_bb;
2339 latch_b:
2340 goto header_b;
2342 merge_bb:
2343 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2345 exit_bb:
2346 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2348 For each name used outside EPILOG (i.e., for each name that has an lcssa
2349 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2350 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2351 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2352 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2353 in exit_bb will also be updated. */
2355 static void
2356 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2357 edge guard_edge, edge merge_edge)
2359 gphi_iterator gsi;
2360 basic_block merge_bb = guard_edge->dest;
2362 gcc_assert (single_succ_p (merge_bb));
2363 edge e = single_succ_edge (merge_bb);
2364 basic_block exit_bb = e->dest;
2365 gcc_assert (single_pred_p (exit_bb));
2366 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2368 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2370 gphi *update_phi = gsi.phi ();
2371 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2373 tree merge_arg = NULL_TREE;
2375 /* If the old argument is an SSA_NAME use its current_def. */
2376 if (TREE_CODE (old_arg) == SSA_NAME)
2377 merge_arg = get_current_def (old_arg);
2378 /* If it's a constant or doesn't have a current_def, just use the old
2379 argument. */
2380 if (!merge_arg)
2381 merge_arg = old_arg;
2383 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2384 /* If the var is live after loop but not a reduction, we simply
2385 use the old arg. */
2386 if (!guard_arg)
2387 guard_arg = old_arg;
2389 /* Create new phi node in MERGE_BB: */
2390 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2391 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2393 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2394 the two PHI args in merge_phi for these edges. */
2395 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2396 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2398 /* Update the original phi in exit_bb. */
2399 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2403 /* EPILOG loop is duplicated from the original loop for vectorizing;
2404 the arg of its loop closed ssa PHI needs to be updated. */
2406 static void
2407 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2409 gphi_iterator gsi;
2410 basic_block exit_bb = single_exit (epilog)->dest;
2412 gcc_assert (single_pred_p (exit_bb));
2413 edge e = EDGE_PRED (exit_bb, 0);
2414 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2415 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2418 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2419 iterate exactly CONST_NITERS times. Make a final decision about
2420 whether the epilogue loop should be used, returning true if so. */
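/* E.g. (illustrative): when peeling for gaps is needed (gap_niters = 1),
   the check below rejects CONST_NITERS <= 2, so at least 3 remaining scalar
   iterations are needed before an epilogue loop is even considered.  */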
2422 static bool
2423 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2424 unsigned HOST_WIDE_INT const_niters)
2426 /* Avoid wrap-around when computing const_niters - 1. Also reject
2427 using an epilogue loop for a single scalar iteration, even if
2428 we could in principle implement that using partial vectors. */
2429 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2430 if (const_niters <= gap_niters + 1)
2431 return false;
2433 /* Install the number of iterations. */
2434 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2435 tree niters_tree = build_int_cst (niters_type, const_niters);
2436 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2438 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2439 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2441 /* Decide what to do if the number of epilogue iterations is not
2442 a multiple of the epilogue loop's vectorization factor. */
2443 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2446 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2447 Return a value that equals:
2449 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2450 - SKIP_VALUE when the main loop is skipped. */
2452 tree
2453 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2454 tree skip_value)
2456 gcc_assert (loop_vinfo->main_loop_edge);
2458 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2459 basic_block bb = loop_vinfo->main_loop_edge->dest;
2460 gphi *new_phi = create_phi_node (phi_result, bb);
2461 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2462 UNKNOWN_LOCATION);
2463 add_phi_arg (new_phi, skip_value,
2464 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2465 return phi_result;
2468 /* Function vect_do_peeling.
2470 Input:
2471 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2473 preheader:
2474 LOOP:
2475 header_bb:
2476 loop_body
2477 if (exit_loop_cond) goto exit_bb
2478 else goto header_bb
2479 exit_bb:
2481 - NITERS: The number of iterations of the loop.
2482 - NITERSM1: The number of iterations of the loop's latch.
2483 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2484 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2485 CHECK_PROFITABILITY is true.
2486 Output:
2487 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2488 iterate after vectorization; see vect_set_loop_condition for details.
2489 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2490 should be set to the number of scalar iterations handled by the
2491 vector loop. The SSA name is only used on exit from the loop.
2493 This function peels prolog and epilog from the loop, adds guards skipping
2494 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2495 would look like:
2497 guard_bb_1:
2498 if (prefer_scalar_loop) goto merge_bb_1
2499 else goto guard_bb_2
2501 guard_bb_2:
2502 if (skip_prolog) goto merge_bb_2
2503 else goto prolog_preheader
2505 prolog_preheader:
2506 PROLOG:
2507 prolog_header_bb:
2508 prolog_body
2509 if (exit_prolog_cond) goto prolog_exit_bb
2510 else goto prolog_header_bb
2511 prolog_exit_bb:
2513 merge_bb_2:
2515 vector_preheader:
2516 VECTOR LOOP:
2517 vector_header_bb:
2518 vector_body
2519 if (exit_vector_cond) goto vector_exit_bb
2520 else goto vector_header_bb
2521 vector_exit_bb:
2523 guard_bb_3:
2524 if (skip_epilog) goto merge_bb_3
2525 else goto epilog_preheader
2527 merge_bb_1:
2529 epilog_preheader:
2530 EPILOG:
2531 epilog_header_bb:
2532 epilog_body
2533 if (exit_epilog_cond) goto merge_bb_3
2534 else goto epilog_header_bb
2536 merge_bb_3:
2538 Note this function peels the prolog and epilog, and adds the guards,
2539 only when necessary.
2540 This function returns the epilogue loop if a decision was made to vectorize
2541 it, otherwise NULL.
2543 The analysis resulting in this epilogue loop's loop_vec_info was performed
2544 in the same vect_analyze_loop call as the main loop's. At that time
2545 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2546 vectorization factors than the main loop. This list is stored in the main
2547 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2548 vectorize the epilogue loop for a lower vectorization factor, the
2549 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2550 updated and linked to the epilogue loop. This is later used to vectorize
2551 the epilogue. The reason the loop_vec_info needs updating is that it was
2552 constructed based on the original main loop, and the epilogue loop is a
2553 copy of this loop, so all links pointing to statements in the original loop
2554 need updating. Furthermore, these loop_vec_infos share the
2555 data_reference's records, which will also need to be updated.
2557 TODO: Guard for prefer_scalar_loop should be emitted along with
2558 versioning conditions if loop versioning is needed. */
2561 class loop *
2562 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2563 tree *niters_vector, tree *step_vector,
2564 tree *niters_vector_mult_vf_var, int th,
2565 bool check_profitability, bool niters_no_overflow,
2566 tree *advance)
2568 edge e, guard_e;
2569 tree type = TREE_TYPE (niters), guard_cond;
2570 basic_block guard_bb, guard_to;
2571 profile_probability prob_prolog, prob_vector, prob_epilog;
2572 int estimated_vf;
2573 int prolog_peeling = 0;
2574 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2575 bool vect_epilogues_updated_niters = false;
2576 /* We currently do not support prolog peeling if the target alignment is not
2577 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2578 target alignment being constant. */
2579 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2580 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2581 return NULL;
2583 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2584 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2586 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2587 poly_uint64 bound_epilog = 0;
2588 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2589 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2590 bound_epilog += vf - 1;
2591 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2592 bound_epilog += 1;
2593 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2594 poly_uint64 bound_scalar = bound_epilog;
2596 if (!prolog_peeling && !epilog_peeling)
2597 return NULL;
2599 /* Before doing any peeling make sure to reset debug binds outside of
2600 the loop referring to defs not in LC SSA. */
2601 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2602 for (unsigned i = 0; i < loop->num_nodes; ++i)
2604 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2605 imm_use_iterator ui;
2606 gimple *use_stmt;
2607 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2608 gsi_next (&gsi))
2610 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2611 if (gimple_debug_bind_p (use_stmt)
2612 && loop != gimple_bb (use_stmt)->loop_father
2613 && !flow_loop_nested_p (loop,
2614 gimple_bb (use_stmt)->loop_father))
2616 gimple_debug_bind_reset_value (use_stmt);
2617 update_stmt (use_stmt);
2620 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2621 gsi_next (&gsi))
2623 ssa_op_iter op_iter;
2624 def_operand_p def_p;
2625 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2626 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2627 if (gimple_debug_bind_p (use_stmt)
2628 && loop != gimple_bb (use_stmt)->loop_father
2629 && !flow_loop_nested_p (loop,
2630 gimple_bb (use_stmt)->loop_father))
2632 gimple_debug_bind_reset_value (use_stmt);
2633 update_stmt (use_stmt);
2638 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2639 estimated_vf = vect_vf_for_cost (loop_vinfo);
2640 if (estimated_vf == 2)
2641 estimated_vf = 3;
2642 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2643 .apply_scale (estimated_vf - 1, estimated_vf);
2645 class loop *prolog, *epilog = NULL;
2646 class loop *first_loop = loop;
2647 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2649 /* SSA form needs to be up-to-date since we are going to manually
2650 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2651 update SSA state after that, so we have to make sure not to lose any
2652 pending update needs. */
2653 gcc_assert (!need_ssa_update_p (cfun));
2655 /* If we're vectorizing an epilogue loop, we have ensured that the
2656 virtual operand is in SSA form throughout the vectorized main loop.
2657 Normally it is possible to trace the updated
2658 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2659 back to scalar-stmt vuses, meaning that the effect of the SSA update
2660 remains local to the main loop. However, there are rare cases in
2661 which the vectorized loop should have vdefs even when the original scalar
2662 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2663 introduces clobbers of the temporary vector array, which in turn
2664 needs new vdefs. If the scalar loop doesn't write to memory, these
2665 new vdefs will be the only ones in the vector loop.
2666 We are currently deferring the update of virtual SSA form and the creation
2667 of a virtual PHI for this case, so we do not have to make sure the
2668 newly introduced virtual def is in LCSSA form. */
2670 if (MAY_HAVE_DEBUG_BIND_STMTS)
2672 gcc_assert (!adjust_vec.exists ());
2673 adjust_vec.create (32);
2675 initialize_original_copy_tables ();
2677 /* Record the anchor bb at which the guard should be placed if the scalar
2678 loop might be preferred. */
2679 basic_block anchor = loop_preheader_edge (loop)->src;
2681 /* Generate the number of iterations for the prolog loop. We do this here
2682 so that we can also get the upper bound on the number of iterations. */
2683 tree niters_prolog;
2684 int bound_prolog = 0;
2685 if (prolog_peeling)
2686 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2687 &bound_prolog);
2688 else
2689 niters_prolog = build_int_cst (type, 0);
2691 loop_vec_info epilogue_vinfo = NULL;
2692 if (vect_epilogues)
2694 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2695 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2698 tree niters_vector_mult_vf = NULL_TREE;
2699 /* Save NITERS before the loop, as it may be changed by the prologue. */
2700 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2701 edge update_e = NULL, skip_e = NULL;
2702 unsigned int lowest_vf = constant_lower_bound (vf);
2703 /* If we know the number of scalar iterations for the main loop we should
2704 check whether after the main loop there are enough iterations left over
2705 for the epilogue. */
2706 if (vect_epilogues
2707 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2708 && prolog_peeling >= 0
2709 && known_eq (vf, lowest_vf))
2711 unsigned HOST_WIDE_INT eiters
2712 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2713 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2715 eiters -= prolog_peeling;
2716 eiters
2717 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2719 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2721 delete epilogue_vinfo;
2722 epilogue_vinfo = NULL;
2723 if (loop_vinfo->epilogue_vinfos.length () == 0)
2725 vect_epilogues = false;
2726 break;
2728 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2729 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2731 vect_epilogues_updated_niters = true;
2733 /* Prolog loop may be skipped. */
2734 bool skip_prolog = (prolog_peeling != 0);
2735 /* Skip this loop and go straight to the epilog when there are not enough
2736 iterations to enter this vectorized loop. If true we should perform
2737 runtime checks on NITERS to decide whether to skip the current
2738 vectorized loop. If we know the number of scalar iterations, we may
2739 choose to add a runtime check when this number may potentially be
2740 smaller than the number of iterations required to enter this loop;
2741 for this we use the upper bounds on the prolog and epilog peeling.
2742 When we don't know the number of iterations and don't require
2743 versioning, it is because we have asserted that there are enough
2744 scalar iterations to enter the main loop, so this skip is not
2745 necessary. When we are versioning then we only add such a skip if
2746 we have chosen to vectorize the epilogue. */
2747 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2748 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2749 bound_prolog + bound_epilog)
2750 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2751 || vect_epilogues));
2752 /* Epilog loop must be executed if the number of iterations for epilog
2753 loop is known at compile time, otherwise we need to add a check at
2754 the end of vector loop and skip to the end of epilog loop. */
2755 bool skip_epilog = (prolog_peeling < 0
2756 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2757 || !vf.is_constant ());
2758 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2759 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2760 skip_epilog = false;
2762 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2763 auto_vec<profile_count> original_counts;
2764 basic_block *original_bbs = NULL;
2766 if (skip_vector)
2768 split_edge (loop_preheader_edge (loop));
2770 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
2772 original_bbs = get_loop_body (loop);
2773 for (unsigned int i = 0; i < loop->num_nodes; i++)
2774 original_counts.safe_push(original_bbs[i]->count);
2777 /* Due to the order in which we peel prolog and epilog, we first
2778 propagate probability to the whole loop. The purpose is to
2779 avoid adjusting probabilities of both prolog and vector loops
2780 separately. Note in this case, the probability of epilog loop
2781 needs to be scaled back later. */
2782 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2783 if (prob_vector.initialized_p ())
2785 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2786 scale_loop_profile (loop, prob_vector, 0);
2790 dump_user_location_t loop_loc = find_loop_location (loop);
2791 if (vect_epilogues)
2792 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2793 use the original scalar loop as remaining epilogue if necessary. */
2794 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2795 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2797 if (prolog_peeling)
2799 e = loop_preheader_edge (loop);
2800 if (!slpeel_can_duplicate_loop_p (loop, e))
2802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2803 "loop can't be duplicated to preheader edge.\n");
2804 gcc_unreachable ();
2806 /* Peel prolog and put it on preheader edge of loop. */
2807 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2808 if (!prolog)
2810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2811 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2812 gcc_unreachable ();
2814 prolog->force_vectorize = false;
2815 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2816 first_loop = prolog;
2817 reset_original_copy_tables ();
2819 /* Update the number of iterations for prolog loop. */
2820 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2821 vect_set_loop_condition (prolog, NULL, niters_prolog,
2822 step_prolog, NULL_TREE, false);
2824 /* Skip the prolog loop. */
2825 if (skip_prolog)
2827 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2828 niters_prolog, build_int_cst (type, 0));
2829 guard_bb = loop_preheader_edge (prolog)->src;
2830 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2831 guard_to = split_edge (loop_preheader_edge (loop));
2832 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2833 guard_to, guard_bb,
2834 prob_prolog.invert (),
2835 irred_flag);
2836 e = EDGE_PRED (guard_to, 0);
2837 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2838 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2840 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2841 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2844 /* Update init address of DRs. */
2845 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2846 /* Update niters for vector loop. */
2847 LOOP_VINFO_NITERS (loop_vinfo)
2848 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2849 LOOP_VINFO_NITERSM1 (loop_vinfo)
2850 = fold_build2 (MINUS_EXPR, type,
2851 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2852 bool new_var_p = false;
2853 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2854 /* It's guaranteed that vector loop bound before vectorization is at
2855 least VF, so set range information for newly generated var. */
2856 if (new_var_p)
2858 value_range vr (type,
2859 wi::to_wide (build_int_cst (type, vf)),
2860 wi::to_wide (TYPE_MAX_VALUE (type)));
2861 set_range_info (niters, vr);
2864 /* Prolog iterates at most bound_prolog times, latch iterates at
2865 most bound_prolog - 1 times. */
2866 record_niter_bound (prolog, bound_prolog - 1, false, true);
2867 delete_update_ssa ();
2868 adjust_vec_debug_stmts ();
2869 scev_reset ();
2872 if (epilog_peeling)
2874 e = single_exit (loop);
2875 if (!slpeel_can_duplicate_loop_p (loop, e))
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2878 "loop can't be duplicated to exit edge.\n");
2879 gcc_unreachable ();
2881 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2882 said epilog then we should use a copy of the main loop as a starting
2883 point. This loop may have already had some preliminary transformations
2884 to allow for more optimal vectorization, for example if-conversion.
2885 If we are not vectorizing the epilog then we should use the scalar loop
2886 as the transformations mentioned above make little or no sense when not
2887 vectorizing. */
2888 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2889 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2890 if (!epilog)
2892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2893 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2894 gcc_unreachable ();
2896 epilog->force_vectorize = false;
2897 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2899 /* The scalar version of the loop may be preferred. In this case, add a guard
2900 and skip to epilog. Note this only happens when the number of
2901 iterations of loop is unknown at compile time, otherwise this
2902 won't be vectorized. */
2903 if (skip_vector)
2905 /* Additional epilogue iteration is peeled if gap exists. */
2906 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2907 bound_prolog, bound_epilog,
2908 th, &bound_scalar,
2909 check_profitability);
2910 /* Build guard against NITERSM1 since NITERS may overflow. */
2911 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2912 guard_bb = anchor;
2913 guard_to = split_edge (loop_preheader_edge (epilog));
2914 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2915 guard_to, guard_bb,
2916 prob_vector.invert (),
2917 irred_flag);
2918 skip_e = guard_e;
2919 e = EDGE_PRED (guard_to, 0);
2920 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2921 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2923 /* Simply propagate profile info from guard_bb to guard_to which is
2924 a merge point of control flow. */
2925 guard_to->count = guard_bb->count;
2927 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
2928 if (vect_epilogues || scalar_loop == NULL)
2930 gcc_assert(epilog->num_nodes == loop->num_nodes);
2931 basic_block *bbs = get_loop_body (epilog);
2932 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2934 gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
2935 bbs[i]->count = original_counts[i];
2937 free (bbs);
2938 free (original_bbs);
2942 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2943 /* If the loop is peeled a non-zero constant number of times, niters now
2944 refers to orig_niters - prolog_peeling; it won't overflow even if
2945 orig_niters overflows. */
2946 niters_no_overflow |= (prolog_peeling > 0);
2947 vect_gen_vector_loop_niters (loop_vinfo, niters,
2948 niters_vector, step_vector,
2949 niters_no_overflow);
2950 if (!integer_onep (*step_vector))
2952 /* On exit from the loop we will have an easy way of calculating
2953 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2954 until then. */
2955 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2956 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2957 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2959 else
2960 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2961 &niters_vector_mult_vf);
2962 /* Update IVs of original loop as if they were advanced by
2963 niters_vector_mult_vf steps. */
2964 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2965 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2966 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2967 update_e);
2969 if (skip_epilog)
2971 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2972 niters, niters_vector_mult_vf);
2973 guard_bb = single_exit (loop)->dest;
2974 guard_to = split_edge (single_exit (epilog));
2975 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2976 skip_vector ? anchor : guard_bb,
2977 prob_epilog.invert (),
2978 irred_flag);
2979 if (vect_epilogues)
2980 epilogue_vinfo->skip_this_loop_edge = guard_e;
2981 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2982 single_exit (epilog));
2983 /* Only need to handle basic block before epilog loop if it's not
2984 the guard_bb, which is the case when skip_vector is true. */
2985 if (guard_bb != bb_before_epilog)
2987 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2989 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2991 scale_loop_profile (epilog, prob_epilog, 0);
2993 else
2994 slpeel_update_phi_nodes_for_lcssa (epilog);
2996 unsigned HOST_WIDE_INT bound;
2997 if (bound_scalar.is_constant (&bound))
2999 gcc_assert (bound != 0);
3000 /* -1 to convert loop iterations to latch iterations. */
3001 record_niter_bound (epilog, bound - 1, false, true);
3004 delete_update_ssa ();
3005 adjust_vec_debug_stmts ();
3006 scev_reset ();
3009 if (vect_epilogues)
3011 epilog->aux = epilogue_vinfo;
3012 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3014 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3016 /* We must now calculate NITERS, the number of iterations performed by the
3017 previous loop, and EPILOGUE_NITERS, the number to be performed by the epilogue. */
3018 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3019 niters_prolog, niters_vector_mult_vf);
3021 /* If skip_vector we may skip the previous loop; we insert a phi-node to
3022 determine whether we are coming from the previous vectorized loop
3023 using the update_e edge or the skip_vector basic block using the
3024 skip_e edge. */
3025 if (skip_vector)
3027 gcc_assert (update_e != NULL
3028 && skip_e != NULL
3029 && !vect_epilogues_updated_niters);
3030 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3031 update_e->dest);
3032 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3033 gimple *stmt = gimple_build_assign (new_ssa, niters);
3034 gimple_stmt_iterator gsi;
3035 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3036 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3038 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3039 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3041 else
3043 gsi = gsi_last_bb (update_e->src);
3044 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3047 niters = new_ssa;
3048 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3049 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3050 UNKNOWN_LOCATION);
3051 niters = PHI_RESULT (new_phi);
3052 epilogue_vinfo->main_loop_edge = update_e;
3053 epilogue_vinfo->skip_main_loop_edge = skip_e;
3056 /* Set ADVANCE to the number of iterations performed by the previous
3057 loop and its prologue. */
3058 *advance = niters;
3060 if (!vect_epilogues_updated_niters)
3062 /* Subtract the number of iterations performed by the vectorized loop
3063 from the number of total iterations. */
3064 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3065 before_loop_niters,
3066 niters);
3068 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3069 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3070 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3071 epilogue_niters,
3072 build_one_cst (TREE_TYPE (epilogue_niters)));
3074 /* Decide what to do if the number of epilogue iterations is not
3075 a multiple of the epilogue loop's vectorization factor.
3076 We should have rejected the loop during the analysis phase
3077 if this fails. */
3078 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3079 true))
3080 gcc_unreachable ();
3084 adjust_vec.release ();
3085 free_original_copy_tables ();
3087 return vect_epilogues ? epilog : NULL;
3090 /* Function vect_create_cond_for_niters_checks.
3092 Create a conditional expression that represents the run-time checks for
3093 the loop's niter. The loop is guaranteed to terminate if the run-time
3094 checks hold.
3096 Input:
3097 COND_EXPR - input conditional expression. New conditions will be chained
3098 with logical AND operation. If it is NULL, the new
3099 condition is used on its own.
3100 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
3101 assumptions to be checked.
3103 Output:
3104 COND_EXPR - conditional expression.
3106 The returned COND_EXPR is the conditional expression to be used in the
3107 if statement that controls which version of the loop gets executed at
3108 runtime. */
3110 static void
3111 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3113 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3115 if (*cond_expr)
3116 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3117 *cond_expr, part_cond_expr);
3118 else
3119 *cond_expr = part_cond_expr;
3122 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3123 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3125 static void
3126 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3128 if (*cond_expr)
3129 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3130 *cond_expr, part_cond_expr);
3131 else
3132 *cond_expr = part_cond_expr;
3135 /* Function vect_create_cond_for_align_checks.
3137 Create a conditional expression that represents the alignment checks for
3138 all of data references (array element references) whose alignment must be
3139 checked at runtime.
3141 Input:
3142 COND_EXPR - input conditional expression. New conditions will be chained
3143 with logical AND operation.
3144 LOOP_VINFO - two fields of the loop information are used.
3145 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3146 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3148 Output:
3149 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3150 expression.
3151 The returned value is the conditional expression to be used in the if
3152 statement that controls which version of the loop gets executed at runtime.
3154 The algorithm makes two assumptions:
3155 1) The number of bytes "n" in a vector is a power of 2.
3156 2) An address "a" is aligned if a%n is zero, and this
3157 test can be done as a&(n-1) == 0. For example, for 16
3158 byte vectors the test is a&0xf == 0. */
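/* For example (hypothetical data references): with two references whose
   first-vector addresses are a1 and a2 and with 16-byte vectors
   (LOOP_VINFO_PTR_MASK = 0xf), the emitted condition is
     ((a1 | a2) & 0xf) == 0
   which is true only when both addresses are 16-byte aligned.  */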
3160 static void
3161 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3162 tree *cond_expr,
3163 gimple_seq *cond_expr_stmt_list)
3165 const vec<stmt_vec_info> &may_misalign_stmts
3166 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3167 stmt_vec_info stmt_info;
3168 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3169 tree mask_cst;
3170 unsigned int i;
3171 tree int_ptrsize_type;
3172 char tmp_name[20];
3173 tree or_tmp_name = NULL_TREE;
3174 tree and_tmp_name;
3175 gimple *and_stmt;
3176 tree ptrsize_zero;
3177 tree part_cond_expr;
3179 /* Check that mask is one less than a power of 2, i.e., mask is
3180 all zeros followed by all ones. */
3181 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3183 int_ptrsize_type = signed_type_for (ptr_type_node);
3185 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3186 of the first vector of the i'th data reference. */
3188 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3190 gimple_seq new_stmt_list = NULL;
3191 tree addr_base;
3192 tree addr_tmp_name;
3193 tree new_or_tmp_name;
3194 gimple *addr_stmt, *or_stmt;
3195 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3196 bool negative = tree_int_cst_compare
3197 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3198 tree offset = negative
3199 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3200 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3201 : size_zero_node;
3203 /* create: addr_tmp = (int)(address_of_first_vector) */
3204 addr_base =
3205 vect_create_addr_base_for_vector_ref (loop_vinfo,
3206 stmt_info, &new_stmt_list,
3207 offset);
3208 if (new_stmt_list != NULL)
3209 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3211 sprintf (tmp_name, "addr2int%d", i);
3212 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3213 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3214 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3216 /* The addresses are ORed together. */
3218 if (or_tmp_name != NULL_TREE)
3220 /* create: or_tmp = or_tmp | addr_tmp */
3221 sprintf (tmp_name, "orptrs%d", i);
3222 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3223 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3224 or_tmp_name, addr_tmp_name);
3225 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3226 or_tmp_name = new_or_tmp_name;
3228 else
3229 or_tmp_name = addr_tmp_name;
3231 } /* end for i */
3233 mask_cst = build_int_cst (int_ptrsize_type, mask);
3235 /* create: and_tmp = or_tmp & mask */
3236 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3238 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3239 or_tmp_name, mask_cst);
3240 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3242 /* Make and_tmp the left operand of the conditional test against zero.
3243 If and_tmp has a nonzero bit then some address is unaligned. */
3244 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3245 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3246 and_tmp_name, ptrsize_zero);
3247 chain_cond_expr (cond_expr, part_cond_expr);
3250 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3251 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3252 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3253 and this new condition are true. Treat a null *COND_EXPR as "true". */
3255 static void
3256 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3258 const vec<vec_object_pair> &pairs
3259 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3260 unsigned int i;
3261 vec_object_pair *pair;
3262 FOR_EACH_VEC_ELT (pairs, i, pair)
3264 tree addr1 = build_fold_addr_expr (pair->first);
3265 tree addr2 = build_fold_addr_expr (pair->second);
3266 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3267 addr1, addr2);
3268 chain_cond_expr (cond_expr, part_cond_expr);
3272 /* Create an expression that is true when all lower-bound conditions for
3273 the vectorized loop are met. Chain this condition with *COND_EXPR. */
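/* Worked example (hypothetical values): for a signed EXPR whose required
   minimum magnitude is 4 (min_value = 4, unsigned_p false), the emitted
   check is (unsigned) EXPR + 3 >= 7, which holds exactly when EXPR >= 4 or
   EXPR <= -4.  For an unsigned EXPR the check is simply EXPR >= 4.  */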
3275 static void
3276 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3278 const vec<vec_lower_bound> &lower_bounds
3279 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3280 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3282 tree expr = lower_bounds[i].expr;
3283 tree type = unsigned_type_for (TREE_TYPE (expr));
3284 expr = fold_convert (type, expr);
3285 poly_uint64 bound = lower_bounds[i].min_value;
3286 if (!lower_bounds[i].unsigned_p)
3288 expr = fold_build2 (PLUS_EXPR, type, expr,
3289 build_int_cstu (type, bound - 1));
3290 bound += bound - 1;
3292 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3293 build_int_cstu (type, bound));
3294 chain_cond_expr (cond_expr, part_cond_expr);
3298 /* Function vect_create_cond_for_alias_checks.
3300 Create a conditional expression that represents the run-time checks for
3301 overlapping of address ranges represented by a list of data references
3302 relations passed as input.
3304 Input:
3305 COND_EXPR - input conditional expression. New conditions will be chained
3306 with logical AND operation. If it is NULL, then the function
3307 is used to return the number of alias checks.
3308 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3309 to be checked.
3311 Output:
3312 COND_EXPR - conditional expression.
3314 The returned COND_EXPR is the conditional expression to be used in the if
3315 statement that controls which version of the loop gets executed at runtime.
3318 void
3319 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3321 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3322 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3324 if (comp_alias_ddrs.is_empty ())
3325 return;
3327 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3328 &comp_alias_ddrs, cond_expr);
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location,
3331 "created %u versioning for alias checks.\n",
3332 comp_alias_ddrs.length ());
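  /* The checks emitted by create_runtime_alias_checks are, in the simplest
     case, segment-overlap tests; a rough sketch for a pair of datarefs
     A and B with segment lengths LEN_A and LEN_B is

       &A + LEN_A <= &B  ||  &B + LEN_B <= &A

     though the exact form depends on the access sizes, steps and whether
     the bounds can be folded at compile time.  */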
3336 /* Function vect_loop_versioning.
3338 If the loop has data references that may or may not be aligned and/or
3339 has data reference relations whose independence was not proven, then
3340 two versions of the loop need to be generated, one which is vectorized
3341 and one which isn't. A test is then generated to control which of the
3342 loops is executed. The test checks for the alignment of all of the
3343 data references that may or may not be aligned. An additional
3344 sequence of runtime tests is generated for each pair of DDRs whose
3345 independence was not proven. The vectorized version of the loop is
3346 executed only if both the alias and alignment tests pass.
3348 The test generated to check which version of the loop is executed
3349 is modified to also check for profitability as indicated by the
3350 cost model threshold TH.
3352 The versioning precondition(s) are placed in *COND_EXPR and
3353 *COND_EXPR_STMT_LIST. */
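/* A rough sketch of the control flow this produces, with the versioning
   condition guarding the two copies:

     if (versioning_cond)   <-- cost-model, alignment, alias, niter checks
       <loop to be vectorized>
     else
       <scalar fallback loop>

   Both versions reach the same merge block after their exits.  */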
3355 class loop *
3356 vect_loop_versioning (loop_vec_info loop_vinfo,
3357 gimple *loop_vectorized_call)
3359 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3360 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3361 basic_block condition_bb;
3362 gphi_iterator gsi;
3363 gimple_stmt_iterator cond_exp_gsi;
3364 basic_block merge_bb;
3365 basic_block new_exit_bb;
3366 edge new_exit_e, e;
3367 gphi *orig_phi, *new_phi;
3368 tree cond_expr = NULL_TREE;
3369 gimple_seq cond_expr_stmt_list = NULL;
3370 tree arg;
3371 profile_probability prob = profile_probability::likely ();
3372 gimple_seq gimplify_stmt_list = NULL;
3373 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3374 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3375 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3376 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3377 poly_uint64 versioning_threshold
3378 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3379 tree version_simd_if_cond
3380 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3381 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3383 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3384 && !ordered_p (th, versioning_threshold))
3385 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3386 build_int_cst (TREE_TYPE (scalar_loop_iters),
3387 th - 1));
3388 if (maybe_ne (versioning_threshold, 0U))
3390 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3391 build_int_cst (TREE_TYPE (scalar_loop_iters),
3392 versioning_threshold - 1));
3393 if (cond_expr)
3394 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3395 expr, cond_expr);
3396 else
3397 cond_expr = expr;
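  /* Note that SCALAR_LOOP_ITERS is LOOP_VINFO_NITERSM1, i.e. the iteration
     count minus one, which is why both thresholds are reduced by one above:
     e.g. a cost-model threshold of 8 iterations becomes the runtime test
     scalar_loop_iters >= 7.  */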
3400 tree cost_name = NULL_TREE;
3401 profile_probability prob2 = profile_probability::uninitialized ();
3402 if (cond_expr
3403 && !integer_truep (cond_expr)
3404 && (version_niter
3405 || version_align
3406 || version_alias
3407 || version_simd_if_cond))
3409 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3410 &cond_expr_stmt_list,
3411 is_gimple_val, NULL_TREE);
3412 /* Split prob () into two so that the overall probability of passing
3413 both the cost-model and versioning checks is the original prob. */
3414 prob2 = prob.split (prob);
3417 if (version_niter)
3418 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3420 if (cond_expr)
3422 gimple_seq tem = NULL;
3423 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3424 &tem, is_gimple_condexpr_for_cond,
3425 NULL_TREE);
3426 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3429 if (version_align)
3430 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3431 &cond_expr_stmt_list);
3433 if (version_alias)
3435 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3436 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3437 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3440 if (version_simd_if_cond)
3442 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3443 if (flag_checking)
3444 if (basic_block bb
3445 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3446 gcc_assert (bb != loop->header
3447 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3448 && (scalar_loop == NULL
3449 || (bb != scalar_loop->header
3450 && dominated_by_p (CDI_DOMINATORS,
3451 scalar_loop->header, bb))));
3452 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3453 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3454 version_simd_if_cond, zero);
3455 if (cond_expr)
3456 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3457 c, cond_expr);
3458 else
3459 cond_expr = c;
3460 if (dump_enabled_p ())
3461 dump_printf_loc (MSG_NOTE, vect_location,
3462 "created versioning for simd if condition check.\n");
3465 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3466 &gimplify_stmt_list,
3467 is_gimple_condexpr_for_cond, NULL_TREE);
3468 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3470 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3471 invariant in. */
3472 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3473 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3474 !gsi_end_p (gsi); gsi_next (&gsi))
3476 gimple *stmt = gsi_stmt (gsi);
3477 update_stmt (stmt);
3478 ssa_op_iter iter;
3479 use_operand_p use_p;
3480 basic_block def_bb;
3481 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3482 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3483 && flow_bb_inside_loop_p (outermost, def_bb))
3484 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3487 /* Search for the outermost loop we can version. Avoid versioning of
3488 non-perfect nests but allow if-conversion versioned loops inside. */
3489 class loop *loop_to_version = loop;
3490 if (flow_loop_nested_p (outermost, loop))
3492 if (dump_enabled_p ())
3493 dump_printf_loc (MSG_NOTE, vect_location,
3494 "trying to apply versioning to outer loop %d\n",
3495 outermost->num);
3496 if (outermost->num == 0)
3497 outermost = superloop_at_depth (loop, 1);
3498 /* And avoid applying versioning on non-perfect nests. */
3499 while (loop_to_version != outermost
3500 && (e = single_exit (loop_outer (loop_to_version)))
3501 && !(e->flags & EDGE_COMPLEX)
3502 && (!loop_outer (loop_to_version)->inner->next
3503 || vect_loop_vectorized_call (loop_to_version))
3504 && (!loop_outer (loop_to_version)->inner->next
3505 || !loop_outer (loop_to_version)->inner->next->next))
3506 loop_to_version = loop_outer (loop_to_version);
3509 /* Apply versioning. If there is already a scalar version created by
3510 if-conversion, re-use that. Note we cannot re-use the copy of
3511 an if-converted outer-loop when vectorizing the inner loop only. */
3512 gcond *cond;
3513 if ((!loop_to_version->inner || loop == loop_to_version)
3514 && loop_vectorized_call)
3516 gcc_assert (scalar_loop);
3517 condition_bb = gimple_bb (loop_vectorized_call);
3518 cond = as_a <gcond *> (last_stmt (condition_bb));
3519 gimple_cond_set_condition_from_tree (cond, cond_expr);
3520 update_stmt (cond);
3522 if (cond_expr_stmt_list)
3524 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3525 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3526 GSI_SAME_STMT);
3529 /* if-conversion uses profile_probability::always () for both paths,
3530 reset the probabilities of both paths appropriately. */
3531 edge te, fe;
3532 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3533 te->probability = prob;
3534 fe->probability = prob.invert ();
3535 /* We can scale loop counts immediately but have to postpone
3536 scaling the scalar loop because we re-use it during peeling. */
3537 scale_loop_frequencies (loop_to_version, te->probability);
3538 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3540 nloop = scalar_loop;
3541 if (dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "reusing %sloop version created by if conversion\n",
3544 loop_to_version != loop ? "outer " : "");
3546 else
3548 if (loop_to_version != loop
3549 && dump_enabled_p ())
3550 dump_printf_loc (MSG_NOTE, vect_location,
3551 "applying loop versioning to outer loop %d\n",
3552 loop_to_version->num);
3554 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3556 initialize_original_copy_tables ();
3557 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3558 prob, prob.invert (), prob, prob.invert (), true);
3559 gcc_assert (nloop);
3560 nloop = get_loop_copy (loop);
3562 /* For cycle vectorization with SLP we rely on the PHI arguments
3563 appearing in the same order as the SLP node operands, which for the
3564 loop PHI nodes means the preheader edge dest index needs to remain
3565 the same for the analyzed loop, which also becomes the vectorized one.
3566 If the state after versioning differs, make it so by redirecting
3567 the first edge into the header to the same destination, which moves
3568 it to the last position. */
3569 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3571 edge e = EDGE_PRED (loop->header, 0);
3572 ssa_redirect_edge (e, e->dest);
3573 flush_pending_stmts (e);
3575 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3577 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3578 reap those otherwise, and they also refer to the original
3579 loops. */
3580 class loop *l = loop;
3581 while (gimple *call = vect_loop_vectorized_call (l))
3583 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3584 fold_loop_internal_call (call, boolean_false_node);
3585 l = loop_outer (l);
3587 free_original_copy_tables ();
3589 if (cond_expr_stmt_list)
3591 cond_exp_gsi = gsi_last_bb (condition_bb);
3592 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3593 GSI_SAME_STMT);
3596 /* Loop versioning violates an assumption we try to maintain during
3597 vectorization - that the loop exit block has a single predecessor.
3598 After versioning, the exit block of both loop versions is the same
3599 basic block (i.e. it has two predecessors). To simplify the
3600 following transformations in the vectorizer, we fix this situation
3601 here by adding a new (empty) block on the exit edge of the loop,
3602 with the proper loop-exit phis to maintain loop-closed-form.
3603 If loop versioning wasn't done from LOOP but from SCALAR_LOOP instead,
3604 merge_bb will already have just a single predecessor. */
3606 merge_bb = single_exit (loop_to_version)->dest;
3607 if (EDGE_COUNT (merge_bb->preds) >= 2)
3609 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3610 new_exit_bb = split_edge (single_exit (loop_to_version));
3611 new_exit_e = single_exit (loop_to_version);
3612 e = EDGE_SUCC (new_exit_bb, 0);
3614 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3615 gsi_next (&gsi))
3617 tree new_res;
3618 orig_phi = gsi.phi ();
3619 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3620 new_phi = create_phi_node (new_res, new_exit_bb);
3621 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3622 add_phi_arg (new_phi, arg, new_exit_e,
3623 gimple_phi_arg_location_from_edge (orig_phi, e));
3624 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3628 update_ssa (TODO_update_ssa_no_phi);
3631 /* Split the cost model check off to a separate BB. Costing assumes
3632 this is the only thing we perform when we enter the scalar loop
3633 from a failed cost decision. */
3634 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3636 gimple *def = SSA_NAME_DEF_STMT (cost_name);
3637 /* All uses of the cost check are 'true' after the check we
3638 are going to insert. */
3639 replace_uses_by (cost_name, boolean_true_node);
3640 /* And we're going to build the new single use of it. */
3641 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3642 NULL_TREE, NULL_TREE);
3643 edge e = split_block (gimple_bb (def), def);
3644 gimple_stmt_iterator gsi = gsi_for_stmt (def);
3645 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3646 edge true_e, false_e;
3647 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3648 e->flags &= ~EDGE_FALLTHRU;
3649 e->flags |= EDGE_TRUE_VALUE;
3650 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3651 e->probability = prob2;
3652 e2->probability = prob2.invert ();
3653 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3654 auto_vec<basic_block, 3> adj;
3655 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3656 son;
3657 son = next_dom_son (CDI_DOMINATORS, son))
3658 if (EDGE_COUNT (son->preds) > 1)
3659 adj.safe_push (son);
3660 for (auto son : adj)
3661 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
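  /* A rough sketch of the result: the cost check now ends its own block, so
     a failed cost decision branches straight to the scalar version without
     executing the remaining versioning-check statements; only when the cost
     check passes are the alignment/alias/niter checks evaluated to choose
     between the scalar and the to-be-vectorized copy.  */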
3664 if (version_niter)
3666 /* The versioned loop could be infinite; we need to clear the existing
3667 niter information, which is copied from the original loop. */
3668 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3669 vect_free_loop_info_assumptions (nloop);
3672 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3673 && dump_enabled_p ())
3675 if (version_alias)
3676 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3677 vect_location,
3678 "loop versioned for vectorization because of "
3679 "possible aliasing\n");
3680 if (version_align)
3681 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3682 vect_location,
3683 "loop versioned for vectorization to enhance "
3684 "alignment\n");
3688 return nloop;