gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
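/* As an illustration (hypothetical values, not tied to a particular
   target): for an 8-element vector type, OFFSET == 0 produces the selector
   { 0, 8, 1, 9, 2, 10, 3, 11 }, i.e. the low halves of the two inputs
   interleaved, while OFFSET == 1 produces { 4, 12, 5, 13, 6, 14, 7, 15 }
   for the high halves.  */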
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
316 *indices);
319 /* Try to use permutes to define the masks in DEST_RGM using the masks
320 in SRC_RGM, given that the former has twice as many masks as the
321 latter. Return true on success, adding any new statements to SEQ. */
323 static bool
324 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
325 rgroup_controls *src_rgm)
327 tree src_masktype = src_rgm->type;
328 tree dest_masktype = dest_rgm->type;
329 machine_mode src_mode = TYPE_MODE (src_masktype);
330 insn_code icode1, icode2;
331 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
332 && (icode1 = optab_handler (vec_unpacku_hi_optab,
333 src_mode)) != CODE_FOR_nothing
334 && (icode2 = optab_handler (vec_unpacku_lo_optab,
335 src_mode)) != CODE_FOR_nothing)
337 /* Unpacking the source masks gives at least as many mask bits as
338 we need. We can then VIEW_CONVERT any excess bits away. */
339 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
340 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
341 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
342 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
344 tree src = src_rgm->controls[i / 2];
345 tree dest = dest_rgm->controls[i];
346 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
347 ? VEC_UNPACK_HI_EXPR
348 : VEC_UNPACK_LO_EXPR);
349 gassign *stmt;
350 if (dest_masktype == unpack_masktype)
351 stmt = gimple_build_assign (dest, code, src);
352 else
354 tree temp = make_ssa_name (unpack_masktype);
355 stmt = gimple_build_assign (temp, code, src);
356 gimple_seq_add_stmt (seq, stmt);
357 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
358 build1 (VIEW_CONVERT_EXPR,
359 dest_masktype, temp));
361 gimple_seq_add_stmt (seq, stmt);
363 return true;
365 vec_perm_indices indices[2];
366 if (dest_masktype == src_masktype
367 && interleave_supported_p (&indices[0], src_masktype, 0)
368 && interleave_supported_p (&indices[1], src_masktype, 1))
370 /* The destination requires twice as many mask bits as the source, so
371 we can use interleaving permutes to double up the number of bits. */
372 tree masks[2];
373 for (unsigned int i = 0; i < 2; ++i)
374 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
375 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
377 tree src = src_rgm->controls[i / 2];
378 tree dest = dest_rgm->controls[i];
379 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
380 src, src, masks[i & 1]);
381 gimple_seq_add_stmt (seq, stmt);
383 return true;
385 return false;
388 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
389 for all the rgroup controls in RGC and return a control that is nonzero
390 when the loop needs to iterate. Add any new preheader statements to
391 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
393 RGC belongs to loop LOOP. The loop originally iterated NITERS
394 times and has been vectorized according to LOOP_VINFO.
396 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
397 starts with NITERS_SKIP dummy iterations of the scalar loop before
398 the real work starts. The mask elements for these dummy iterations
399 must be 0, to ensure that the extra iterations do not have an effect.
401 It is known that:
403 NITERS * RGC->max_nscalars_per_iter * RGC->factor
405 does not overflow. However, MIGHT_WRAP_P says whether an induction
406 variable that starts at 0 and has step:
408 VF * RGC->max_nscalars_per_iter * RGC->factor
410 might overflow before hitting a value above:
412 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
414 This means that we cannot guarantee that such an induction variable
415 would ever hit a value that produces a set of all-false masks or zero
416 lengths for RGC.
418 Note: the cost of the code generated by this function is modeled
419 by vect_estimate_min_profitable_iters, so changes here may need
420 corresponding changes there. */
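/* For a concrete but hypothetical example: an rgroup whose controls are
   8-element masks, with RGC->max_nscalars_per_iter == 2 and
   RGC->factor == 1, accounts for 2 items per scalar iteration; the IV
   created below then steps by 2 * VF items per vector iteration and each
   control covers 8 items.  */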
422 static tree
423 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
424 gimple_seq *preheader_seq,
425 gimple_seq *header_seq,
426 gimple_stmt_iterator loop_cond_gsi,
427 rgroup_controls *rgc, tree niters,
428 tree niters_skip, bool might_wrap_p)
430 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
431 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
432 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
434 tree ctrl_type = rgc->type;
435 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
436 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
437 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
438 tree length_limit = NULL_TREE;
439 /* For length-based controls, we need length_limit to ensure the length stays in range. */
440 if (!use_masks_p)
441 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
443 /* Calculate the maximum number of item values that the rgroup
444 handles in total, the number that it handles for each iteration
445 of the vector loop, and the number that it should skip during the
446 first iteration of the vector loop. */
447 tree nitems_total = niters;
448 tree nitems_step = build_int_cst (iv_type, vf);
449 tree nitems_skip = niters_skip;
450 if (nitems_per_iter != 1)
452 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
453 these multiplications don't overflow. */
454 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
455 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
456 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
457 nitems_total, compare_factor);
458 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
459 nitems_step, iv_factor);
460 if (nitems_skip)
461 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
462 nitems_skip, compare_factor);
465 /* Create an induction variable that counts the number of items
466 processed. */
467 tree index_before_incr, index_after_incr;
468 gimple_stmt_iterator incr_gsi;
469 bool insert_after;
470 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
471 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
472 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
474 tree zero_index = build_int_cst (compare_type, 0);
475 tree test_index, test_limit, first_limit;
476 gimple_stmt_iterator *test_gsi;
477 if (might_wrap_p)
479 /* In principle the loop should stop iterating once the incremented
480 IV reaches a value greater than or equal to:
482 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
484 However, there's no guarantee that this addition doesn't overflow
485 the comparison type, or that the IV hits a value above it before
486 wrapping around. We therefore adjust the limit down by one
487 IV step:
489 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
490 -[infinite-prec] NITEMS_STEP
492 and compare the IV against this limit _before_ incrementing it.
493 Since the comparison type is unsigned, we actually want the
494 subtraction to saturate at zero:
496 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
497 -[sat] NITEMS_STEP
499 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
501 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
503 where the rightmost subtraction can be done directly in
504 COMPARE_TYPE. */
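/* For example, with hypothetical values NITEMS_TOTAL == 3, NITEMS_STEP == 8
   and NITEMS_SKIP == 2, the adjustment is 8 - 2 == 6 and the limit becomes
   MAX (3, 6) - 6 == 0: the not-yet-incremented IV (still 0) is already at
   the limit, so the mask computed for the second iteration is all-false and
   the loop stops after a single vector iteration, which is correct since
   all 3 + 2 == 5 items fit in one step of 8.  */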
505 test_index = index_before_incr;
506 tree adjust = gimple_convert (preheader_seq, compare_type,
507 nitems_step);
508 if (nitems_skip)
509 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
510 adjust, nitems_skip);
511 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
512 nitems_total, adjust);
513 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
514 test_limit, adjust);
515 test_gsi = &incr_gsi;
517 /* Get a safe limit for the first iteration. */
518 if (nitems_skip)
520 /* The first vector iteration can handle at most NITEMS_STEP
521 items. NITEMS_STEP <= CONST_LIMIT, and adding
522 NITEMS_SKIP to that cannot overflow. */
523 tree const_limit = build_int_cst (compare_type,
524 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
525 * nitems_per_iter);
526 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
527 nitems_total, const_limit);
528 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
529 first_limit, nitems_skip);
531 else
532 /* For the first iteration it doesn't matter whether the IV hits
533 a value above NITEMS_TOTAL. That only matters for the latch
534 condition. */
535 first_limit = nitems_total;
537 else
539 /* Test the incremented IV, which will always hit a value above
540 the bound before wrapping. */
541 test_index = index_after_incr;
542 test_limit = nitems_total;
543 if (nitems_skip)
544 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
545 test_limit, nitems_skip);
546 test_gsi = &loop_cond_gsi;
548 first_limit = test_limit;
551 /* Convert the IV value to the comparison type (either a no-op or
552 a demotion). */
553 gimple_seq test_seq = NULL;
554 test_index = gimple_convert (&test_seq, compare_type, test_index);
555 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
557 /* Provide a definition of each control in the group. */
558 tree next_ctrl = NULL_TREE;
559 tree ctrl;
560 unsigned int i;
561 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
563 /* Previous controls will cover BIAS items. This control covers the
564 next batch. */
565 poly_uint64 bias = nitems_per_ctrl * i;
566 tree bias_tree = build_int_cst (compare_type, bias);
568 /* See whether the first iteration of the vector loop is known
569 to have a full control. */
570 poly_uint64 const_limit;
571 bool first_iteration_full
572 = (poly_int_tree_p (first_limit, &const_limit)
573 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
575 /* Rather than have a new IV that starts at BIAS and goes up to
576 TEST_LIMIT, prefer to use the same 0-based IV for each control
577 and adjust the bound down by BIAS. */
578 tree this_test_limit = test_limit;
579 if (i != 0)
581 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
582 compare_type, this_test_limit,
583 bias_tree);
584 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
585 compare_type, this_test_limit,
586 bias_tree);
589 /* Create the initial control. First include all items that
590 are within the loop limit. */
591 tree init_ctrl = NULL_TREE;
592 if (!first_iteration_full)
594 tree start, end;
595 if (first_limit == test_limit)
597 /* Use a natural test between zero (the initial IV value)
598 and the loop limit. The "else" block would be valid too,
599 but this choice can avoid the need to load BIAS_TREE into
600 a register. */
601 start = zero_index;
602 end = this_test_limit;
604 else
606 /* FIRST_LIMIT is the maximum number of items handled by the
607 first iteration of the vector loop. Test the portion
608 associated with this control. */
609 start = bias_tree;
610 end = first_limit;
613 if (use_masks_p)
614 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
615 start, end, "max_mask");
616 else
618 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
619 gimple_seq seq = vect_gen_len (init_ctrl, start,
620 end, length_limit);
621 gimple_seq_add_seq (preheader_seq, seq);
625 /* Now AND out the bits that are within the number of skipped
626 items. */
627 poly_uint64 const_skip;
628 if (nitems_skip
629 && !(poly_int_tree_p (nitems_skip, &const_skip)
630 && known_le (const_skip, bias)))
632 gcc_assert (use_masks_p);
633 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
634 bias_tree, nitems_skip);
635 if (init_ctrl)
636 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
637 init_ctrl, unskipped_mask);
638 else
639 init_ctrl = unskipped_mask;
642 if (!init_ctrl)
644 /* First iteration is full. */
645 if (use_masks_p)
646 init_ctrl = build_minus_one_cst (ctrl_type);
647 else
648 init_ctrl = length_limit;
651 /* Get the control value for the next iteration of the loop. */
652 if (use_masks_p)
654 gimple_seq stmts = NULL;
655 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
656 this_test_limit, "next_mask");
657 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
659 else
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
670 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
671 if (partial_load_bias != 0)
673 tree adjusted_len = rgc->bias_adjusted_ctrl;
674 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
675 rgc->controls[0],
676 build_int_cst
677 (TREE_TYPE (rgc->controls[0]),
678 partial_load_bias));
679 gimple_seq_add_stmt (header_seq, minus);
682 return next_ctrl;
685 /* Set up the iteration condition and rgroup controls for LOOP, given
686 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
687 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
688 the number of iterations of the original scalar loop that should be
689 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
690 for vect_set_loop_condition.
692 Insert the branch-back condition before LOOP_COND_GSI and return the
693 final gcond. */
695 static gcond *
696 vect_set_loop_condition_partial_vectors (class loop *loop,
697 loop_vec_info loop_vinfo, tree niters,
698 tree final_iv, bool niters_maybe_zero,
699 gimple_stmt_iterator loop_cond_gsi)
701 gimple_seq preheader_seq = NULL;
702 gimple_seq header_seq = NULL;
704 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
705 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
706 unsigned int compare_precision = TYPE_PRECISION (compare_type);
707 tree orig_niters = niters;
709 /* Type of the initial value of NITERS. */
710 tree ni_actual_type = TREE_TYPE (niters);
711 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
712 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
714 /* Convert NITERS to the same size as the compare. */
715 if (compare_precision > ni_actual_precision
716 && niters_maybe_zero)
718 /* We know that there is always at least one iteration, so if the
719 count is zero then it must have wrapped. Cope with this by
720 subtracting 1 before the conversion and adding 1 to the result. */
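/* For example, if NITERS has a 32-bit unsigned type, a value of 0 really
   means 1 << 32 iterations; with a 64-bit COMPARE_TYPE the code below
   computes (0 + 0xffffffff) == 0xffffffff in 32 bits, converts it to
   64 bits and adds 1, giving 0x100000000 as required.  */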
721 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
722 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
723 niters, build_minus_one_cst (ni_actual_type));
724 niters = gimple_convert (&preheader_seq, compare_type, niters);
725 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
726 niters, build_one_cst (compare_type));
728 else
729 niters = gimple_convert (&preheader_seq, compare_type, niters);
731 /* Iterate over all the rgroups and fill in their controls. We could use
732 the first control from any rgroup for the loop condition; here we
733 arbitrarily pick the last. */
734 tree test_ctrl = NULL_TREE;
735 rgroup_controls *rgc;
736 unsigned int i;
737 auto_vec<rgroup_controls> *controls = use_masks_p
738 ? &LOOP_VINFO_MASKS (loop_vinfo)
739 : &LOOP_VINFO_LENS (loop_vinfo);
740 FOR_EACH_VEC_ELT (*controls, i, rgc)
741 if (!rgc->controls.is_empty ())
743 /* First try using permutes. This adds a single vector
744 instruction to the loop for each mask, but needs no extra
745 loop invariants or IVs. */
746 unsigned int nmasks = i + 1;
747 if (use_masks_p && (nmasks & 1) == 0)
749 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
750 if (!half_rgc->controls.is_empty ()
751 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
752 continue;
755 /* See whether zero-based IV would ever generate all-false masks
756 or zero length before wrapping around. */
757 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
759 /* Set up all controls for this group. */
760 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
761 &preheader_seq,
762 &header_seq,
763 loop_cond_gsi, rgc,
764 niters, niters_skip,
765 might_wrap_p);
768 /* Emit all accumulated statements. */
769 add_preheader_seq (loop, preheader_seq);
770 add_header_seq (loop, header_seq);
772 /* Get a boolean result that tells us whether to iterate. */
773 edge exit_edge = single_exit (loop);
774 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
775 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
776 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
777 NULL_TREE, NULL_TREE);
778 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
780 /* The loop iterates (NITERS - 1) / VF + 1 times.
781 Subtract one from this to get the latch count. */
782 tree step = build_int_cst (compare_type,
783 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
784 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
785 build_minus_one_cst (compare_type));
786 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
787 niters_minus_one, step);
789 if (final_iv)
791 gassign *assign = gimple_build_assign (final_iv, orig_niters);
792 gsi_insert_on_edge_immediate (single_exit (loop), assign);
795 return cond_stmt;
798 /* Like vect_set_loop_condition, but handle the case in which the vector
799 loop handles exactly VF scalars per iteration. */
801 static gcond *
802 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
803 tree final_iv, bool niters_maybe_zero,
804 gimple_stmt_iterator loop_cond_gsi)
806 tree indx_before_incr, indx_after_incr;
807 gcond *cond_stmt;
808 gcond *orig_cond;
809 edge pe = loop_preheader_edge (loop);
810 edge exit_edge = single_exit (loop);
811 gimple_stmt_iterator incr_gsi;
812 bool insert_after;
813 enum tree_code code;
814 tree niters_type = TREE_TYPE (niters);
816 orig_cond = get_loop_exit_condition (loop);
817 gcc_assert (orig_cond);
818 loop_cond_gsi = gsi_for_stmt (orig_cond);
820 tree init, limit;
821 if (!niters_maybe_zero && integer_onep (step))
823 /* In this case we can use a simple 0-based IV (case A):
826 x = 0;
830 x += 1;
832 while (x < NITERS); */
833 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
834 init = build_zero_cst (niters_type);
835 limit = niters;
837 else
839 /* The following (case B) works for all values of NITERS except 0:
842 x = 0;
846 x += STEP;
848 while (x <= NITERS - STEP);
850 so that the loop continues to iterate if x + STEP - 1 < NITERS
851 but stops if x + STEP - 1 >= NITERS.
853 However, if NITERS is zero, x never hits a value above NITERS - STEP
854 before wrapping around. There are two obvious ways of dealing with
855 this:
857 - start at STEP - 1 and compare x before incrementing it
858 - start at -1 and compare x after incrementing it
860 The latter is simpler and is what we use (case C). The loop in this
861 case looks like:
864 x = -1;
868 x += STEP;
870 while (x < NITERS - STEP);
872 In both cases the loop limit is NITERS - STEP. */
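/* As a worked example for case B, take NITERS == 10 and STEP == 4: the
   limit is 6 and x takes the values 4 and 8, so the loop executes
   (10 - 4) / 4 + 1 == 2 times.  For case C with a 32-bit NITERS of 0
   (really 1 << 32 iterations), the limit wraps to 0xfffffffc and x takes
   the values 3, 7, ..., 0xffffffff, giving (1 << 32) / 4 executions.  */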
873 gimple_seq seq = NULL;
874 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
875 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
876 if (seq)
878 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
879 gcc_assert (!new_bb);
881 if (niters_maybe_zero)
883 /* Case C. */
884 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
885 init = build_all_ones_cst (niters_type);
887 else
889 /* Case B. */
890 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
891 init = build_zero_cst (niters_type);
895 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
896 create_iv (init, step, NULL_TREE, loop,
897 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
898 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
899 true, NULL_TREE, true,
900 GSI_SAME_STMT);
901 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
902 true, GSI_SAME_STMT);
904 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
905 NULL_TREE);
907 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
909 /* Record the number of latch iterations. */
910 if (limit == niters)
911 /* Case A: the loop iterates NITERS times. Subtract one to get the
912 latch count. */
913 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
914 build_int_cst (niters_type, 1));
915 else
916 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
917 Subtract one from this to get the latch count. */
918 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
919 limit, step);
921 if (final_iv)
923 gassign *assign;
924 edge exit = single_exit (loop);
925 gcc_assert (single_pred_p (exit->dest));
926 tree phi_dest
927 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
928 /* Make sure to maintain LC SSA form here and elide the subtraction
929 if the value is zero. */
930 gphi *phi = create_phi_node (phi_dest, exit->dest);
931 add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
932 if (!integer_zerop (init))
934 assign = gimple_build_assign (final_iv, MINUS_EXPR,
935 phi_dest, init);
936 gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
937 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
941 return cond_stmt;
944 /* If we're using fully-masked loops, make LOOP iterate:
946 N == (NITERS - 1) / STEP + 1
948 times. When NITERS is zero, this is equivalent to making the loop
949 execute (1 << M) / STEP times, where M is the precision of NITERS.
950 NITERS_MAYBE_ZERO is true if this last case might occur.
952 If we're not using fully-masked loops, make LOOP iterate:
954 N == (NITERS - STEP) / STEP + 1
956 times, where NITERS is known to be outside the range [1, STEP - 1].
957 This is equivalent to making the loop execute NITERS / STEP times
958 when NITERS is nonzero and (1 << M) / STEP times otherwise.
959 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
961 If FINAL_IV is nonnull, it is an SSA name that should be set to
962 N * STEP on exit from the loop.
964 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
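/* As an illustration, take NITERS == 10 and STEP == 4: a fully-masked loop
   iterates (10 - 1) / 4 + 1 == 3 times, the last time under a partial mask
   or length, whereas the unmasked form iterates (10 - 4) / 4 + 1 == 2
   times, leaving the remaining scalar iterations to be handled outside
   this loop.  */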
966 void
967 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
968 tree niters, tree step, tree final_iv,
969 bool niters_maybe_zero)
971 gcond *cond_stmt;
972 gcond *orig_cond = get_loop_exit_condition (loop);
973 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
975 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
976 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
977 niters, final_iv,
978 niters_maybe_zero,
979 loop_cond_gsi);
980 else
981 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
982 niters_maybe_zero,
983 loop_cond_gsi);
985 /* Remove old loop exit test. */
986 stmt_vec_info orig_cond_info;
987 if (loop_vinfo
988 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
989 loop_vinfo->remove_stmt (orig_cond_info);
990 else
991 gsi_remove (&loop_cond_gsi, true);
993 if (dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
995 (gimple *) cond_stmt);
998 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
999 For all PHI arguments in FROM->dest and TO->dest from those
1000 edges, ensure that the TO->dest PHI arguments have their current_def
1001 set to that of the corresponding argument in FROM->dest. */
1003 static void
1004 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1006 gimple_stmt_iterator gsi_from, gsi_to;
1008 for (gsi_from = gsi_start_phis (from->dest),
1009 gsi_to = gsi_start_phis (to->dest);
1010 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1012 gimple *from_phi = gsi_stmt (gsi_from);
1013 gimple *to_phi = gsi_stmt (gsi_to);
1014 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1015 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1016 if (virtual_operand_p (from_arg))
1018 gsi_next (&gsi_from);
1019 continue;
1021 if (virtual_operand_p (to_arg))
1023 gsi_next (&gsi_to);
1024 continue;
1026 if (TREE_CODE (from_arg) != SSA_NAME)
1027 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1028 else if (TREE_CODE (to_arg) == SSA_NAME
1029 && from_arg != to_arg)
1031 if (get_current_def (to_arg) == NULL_TREE)
1033 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1034 TREE_TYPE (get_current_def
1035 (from_arg))));
1036 set_current_def (to_arg, get_current_def (from_arg));
1039 gsi_next (&gsi_from);
1040 gsi_next (&gsi_to);
1043 gphi *from_phi = get_virtual_phi (from->dest);
1044 gphi *to_phi = get_virtual_phi (to->dest);
1045 if (from_phi)
1046 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1047 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1051 /* Given LOOP this function generates a new copy of it and puts it
1052 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1053 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1054 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1055 entry or exit of LOOP. */
1057 class loop *
1058 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1059 class loop *scalar_loop, edge e)
1061 class loop *new_loop;
1062 basic_block *new_bbs, *bbs, *pbbs;
1063 bool at_exit;
1064 bool was_imm_dom;
1065 basic_block exit_dest;
1066 edge exit, new_exit;
1067 bool duplicate_outer_loop = false;
1069 exit = single_exit (loop);
1070 at_exit = (e == exit);
1071 if (!at_exit && e != loop_preheader_edge (loop))
1072 return NULL;
1074 if (scalar_loop == NULL)
1075 scalar_loop = loop;
1077 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1078 pbbs = bbs + 1;
1079 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1080 /* Allow duplication of outer loops. */
1081 if (scalar_loop->inner)
1082 duplicate_outer_loop = true;
1084 /* Generate new loop structure. */
1085 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1086 duplicate_subloops (scalar_loop, new_loop);
1088 exit_dest = exit->dest;
1089 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1090 exit_dest) == loop->header ?
1091 true : false);
1093 /* Also copy the pre-header, this avoids jumping through hoops to
1094 duplicate the loop entry PHI arguments. Create an empty
1095 pre-header unconditionally for this. */
1096 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1097 edge entry_e = single_pred_edge (preheader);
1098 bbs[0] = preheader;
1099 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1101 exit = single_exit (scalar_loop);
1102 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1103 &exit, 1, &new_exit, NULL,
1104 at_exit ? loop->latch : e->src, true);
1105 exit = single_exit (loop);
1106 basic_block new_preheader = new_bbs[0];
1108 /* Before installing PHI arguments make sure that the edges
1109 into them match that of the scalar loop we analyzed. This
1110 makes sure the SLP tree matches up between the main vectorized
1111 loop and the epilogue vectorized copies. */
1112 if (single_succ_edge (preheader)->dest_idx
1113 != single_succ_edge (new_bbs[0])->dest_idx)
1115 basic_block swap_bb = new_bbs[1];
1116 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1117 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1118 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1119 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1121 if (duplicate_outer_loop)
1123 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1124 if (loop_preheader_edge (scalar_loop)->dest_idx
1125 != loop_preheader_edge (new_inner_loop)->dest_idx)
1127 basic_block swap_bb = new_inner_loop->header;
1128 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1129 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1130 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1131 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1135 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1137 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1138 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1139 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1141 if (scalar_loop != loop)
1143 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1144 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1145 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1146 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1147 header) to have current_def set, so copy them over. */
1148 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1149 exit);
1150 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1152 EDGE_SUCC (loop->latch, 0));
1155 if (at_exit) /* Add the loop copy at exit. */
1157 if (scalar_loop != loop)
1159 gphi_iterator gsi;
1160 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1162 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1163 gsi_next (&gsi))
1165 gphi *phi = gsi.phi ();
1166 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1167 location_t orig_locus
1168 = gimple_phi_arg_location_from_edge (phi, e);
1170 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1173 redirect_edge_and_branch_force (e, new_preheader);
1174 flush_pending_stmts (e);
1175 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1176 if (was_imm_dom || duplicate_outer_loop)
1177 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1179 /* And remove the unnecessary forwarder again. Keep the other
1180 one so we have a proper pre-header for the loop at the exit edge. */
1181 redirect_edge_pred (single_succ_edge (preheader),
1182 single_pred (preheader));
1183 delete_basic_block (preheader);
1184 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1185 loop_preheader_edge (scalar_loop)->src);
1187 else /* Add the copy at entry. */
1189 if (scalar_loop != loop)
1191 /* Remove the unnecessary forwarder of scalar_loop again. */
1192 redirect_edge_pred (single_succ_edge (preheader),
1193 single_pred (preheader));
1194 delete_basic_block (preheader);
1195 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1196 loop_preheader_edge (scalar_loop)->src);
1197 preheader = split_edge (loop_preheader_edge (loop));
1198 entry_e = single_pred_edge (preheader);
1201 redirect_edge_and_branch_force (entry_e, new_preheader);
1202 flush_pending_stmts (entry_e);
1203 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1205 redirect_edge_and_branch_force (new_exit, preheader);
1206 flush_pending_stmts (new_exit);
1207 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1209 /* And remove the unnecessary forwarder again. Keep the other
1210 one so we have a proper pre-header for the loop at the exit edge. */
1211 redirect_edge_pred (single_succ_edge (new_preheader),
1212 single_pred (new_preheader));
1213 delete_basic_block (new_preheader);
1214 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1215 loop_preheader_edge (new_loop)->src);
1218 if (scalar_loop != loop)
1220 /* Update new_loop->header PHIs, so that on the preheader
1221 edge they are the ones from loop rather than scalar_loop. */
1222 gphi_iterator gsi_orig, gsi_new;
1223 edge orig_e = loop_preheader_edge (loop);
1224 edge new_e = loop_preheader_edge (new_loop);
1226 for (gsi_orig = gsi_start_phis (loop->header),
1227 gsi_new = gsi_start_phis (new_loop->header);
1228 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1229 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1231 gphi *orig_phi = gsi_orig.phi ();
1232 gphi *new_phi = gsi_new.phi ();
1233 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1234 location_t orig_locus
1235 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1237 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1241 free (new_bbs);
1242 free (bbs);
1244 checking_verify_dominators (CDI_DOMINATORS);
1246 return new_loop;
1250 /* Given the condition expression COND, put it as the last statement of
1251 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1252 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1253 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1254 new edge as irreducible if IRREDUCIBLE_P is true. */
1256 static edge
1257 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1258 basic_block guard_to, basic_block dom_bb,
1259 profile_probability probability, bool irreducible_p)
1261 gimple_stmt_iterator gsi;
1262 edge new_e, enter_e;
1263 gcond *cond_stmt;
1264 gimple_seq gimplify_stmt_list = NULL;
1266 enter_e = EDGE_SUCC (guard_bb, 0);
1267 enter_e->flags &= ~EDGE_FALLTHRU;
1268 enter_e->flags |= EDGE_FALSE_VALUE;
1269 gsi = gsi_last_bb (guard_bb);
1271 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1272 is_gimple_condexpr_for_cond, NULL_TREE);
1273 if (gimplify_stmt_list)
1274 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1276 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1277 gsi = gsi_last_bb (guard_bb);
1278 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1280 /* Add new edge to connect guard block to the merge/loop-exit block. */
1281 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1283 new_e->probability = probability;
1284 if (irreducible_p)
1285 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1287 enter_e->probability = probability.invert ();
1288 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1290 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1291 if (enter_e->dest->loop_father->header == enter_e->dest)
1292 split_edge (enter_e);
1294 return new_e;
1298 /* This function verifies that the following restrictions apply to LOOP:
1299 (1) it consists of exactly 2 basic blocks (header and an empty latch)
1300 for an innermost loop, and of 5 basic blocks for an outer loop.
1301 (2) it is single entry, single exit
1302 (3) its exit condition is the last stmt in the header
1303 (4) E is the entry/exit edge of LOOP.
1306 bool
1307 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1309 edge exit_e = single_exit (loop);
1310 edge entry_e = loop_preheader_edge (loop);
1311 gcond *orig_cond = get_loop_exit_condition (loop);
1312 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1313 unsigned int num_bb = loop->inner? 5 : 2;
1315 /* All loops have an outer scope; the only case in which loop->outer is
1316 NULL is the function itself. */
1317 if (!loop_outer (loop)
1318 || loop->num_nodes != num_bb
1319 || !empty_block_p (loop->latch)
1320 || !single_exit (loop)
1321 /* Verify that new loop exit condition can be trivially modified. */
1322 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1323 || (e != exit_e && e != entry_e))
1324 return false;
1326 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1327 get_loop_body_with_size (loop, bbs, loop->num_nodes);
1328 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1329 free (bbs);
1330 return ret;
1333 /* Function vect_get_loop_location.
1335 Extract the location of the loop in the source code.
1336 If the loop is not well formed for vectorization, an estimated
1337 location is calculated.
1338 Return the loop location on success and NULL if not. */
1340 dump_user_location_t
1341 find_loop_location (class loop *loop)
1343 gimple *stmt = NULL;
1344 basic_block bb;
1345 gimple_stmt_iterator si;
1347 if (!loop)
1348 return dump_user_location_t ();
1350 stmt = get_loop_exit_condition (loop);
1352 if (stmt
1353 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1354 return stmt;
1356 /* If we got here the loop is probably not "well formed",
1357 try to estimate the loop location. */
1359 if (!loop->header)
1360 return dump_user_location_t ();
1362 bb = loop->header;
1364 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1366 stmt = gsi_stmt (si);
1367 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1368 return stmt;
1371 return dump_user_location_t ();
1374 /* Return true if the phi described by STMT_INFO defines an IV of the
1375 loop to be vectorized. */
1377 static bool
1378 iv_phi_p (stmt_vec_info stmt_info)
1380 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1381 if (virtual_operand_p (PHI_RESULT (phi)))
1382 return false;
1384 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1385 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1386 return false;
1388 return true;
1391 /* Return true if the vectorizer can peel for a nonlinear IV. */
1392 static bool
1393 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
1394 enum vect_induction_op_type induction_type)
1396 tree niters_skip;
1397 /* Init_expr will be updated by vect_update_ivs_after_vectorizer,
1398 if niters or vf is unknown:
1399 For shift, when the shift amount >= precision, the behavior is undefined.
1400 For mult, we don't know how to generate
1401 init_expr * pow (step, niters) for variable niters.
1402 For neg, it should be OK, since niters of the vectorized main loop
1403 will always be a multiple of 2. */
1404 if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1405 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
1406 && induction_type != vect_step_op_neg)
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "Peeling for epilogue is not supported"
1411 " for nonlinear induction except neg"
1412 " when iteration count is unknown.\n");
1413 return false;
1416 /* Also we don't support peeling for neg when niter is variable.
1417 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
1418 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
1419 if ((niters_skip != NULL_TREE
1420 && TREE_CODE (niters_skip) != INTEGER_CST)
1421 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
1422 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "Peeling for alignement is not supported"
1427 " for nonlinear induction when niters_skip"
1428 " is not constant.\n");
1429 return false;
1432 return true;
1435 /* Function vect_can_advance_ivs_p
1437 In case the number of iterations that LOOP iterates is unknown at compile
1438 time, an epilog loop will be generated, and the loop induction variables
1439 (IVs) will be "advanced" to the value they are supposed to take just before
1440 the epilog loop. Here we check that the access function of the loop IVs
1441 and the expression that represents the loop bound are simple enough.
1442 These restrictions will be relaxed in the future. */
1444 bool
1445 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1447 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1448 basic_block bb = loop->header;
1449 gphi_iterator gsi;
1451 /* Analyze phi functions of the loop header. */
1453 if (dump_enabled_p ())
1454 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1455 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1457 tree evolution_part;
1458 enum vect_induction_op_type induction_type;
1460 gphi *phi = gsi.phi ();
1461 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1464 phi_info->stmt);
1466 /* Skip virtual phi's. The data dependences that are associated with
1467 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1469 Skip reduction phis. */
1470 if (!iv_phi_p (phi_info))
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_NOTE, vect_location,
1474 "reduc or virtual phi. skip.\n");
1475 continue;
1478 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1479 if (induction_type != vect_step_op_add)
1481 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
1482 return false;
1484 continue;
1487 /* Analyze the evolution function. */
1489 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1490 if (evolution_part == NULL_TREE)
1492 if (dump_enabled_p ())
1493 dump_printf (MSG_MISSED_OPTIMIZATION,
1494 "No access function or evolution.\n");
1495 return false;
1498 /* FORNOW: We do not transform initial conditions of IVs
1499 which evolution functions are not invariants in the loop. */
1501 if (!expr_invariant_in_loop_p (loop, evolution_part))
1503 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505 "evolution not invariant in loop.\n");
1506 return false;
1509 /* FORNOW: We do not transform initial conditions of IVs
1510 which evolution functions are a polynomial of degree >= 2. */
1512 if (tree_is_chrec (evolution_part))
1514 if (dump_enabled_p ())
1515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1516 "evolution is chrec.\n");
1517 return false;
1521 return true;
1525 /* Function vect_update_ivs_after_vectorizer.
1527 "Advance" the induction variables of LOOP to the value they should take
1528 after the execution of LOOP. This is currently necessary because the
1529 vectorizer does not handle induction variables that are used after the
1530 loop. Such a situation occurs when the last iterations of LOOP are
1531 peeled, because:
1532 1. We introduced new uses after LOOP for IVs that were not originally used
1533 after LOOP: the IVs of LOOP are now used by an epilog loop.
1534 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1535 times, whereas the loop IVs should be bumped N times.
1537 Input:
1538 - LOOP - a loop that is going to be vectorized. The last few iterations
1539 of LOOP were peeled.
1540 - NITERS - the number of iterations that LOOP executes (before it is
1541 vectorized), i.e., the number of times the IVs should be bumped.
1542 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1543 coming out from LOOP on which there are uses of the LOOP ivs
1544 (this is the path from LOOP->exit to epilog_loop->preheader).
1546 The new definitions of the ivs are placed in LOOP->exit.
1547 The phi args associated with the edge UPDATE_E in the bb
1548 UPDATE_E->dest are updated accordingly.
1550 Assumption 1: Like the rest of the vectorizer, this function assumes
1551 a single loop exit that has a single predecessor.
1553 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1554 organized in the same order.
1556 Assumption 3: The access function of the ivs is simple enough (see
1557 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1559 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1560 coming out of LOOP on which the ivs of LOOP are used (this is the path
1561 that leads to the epilog loop; other paths skip the epilog loop). This
1562 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1563 needs to have its phis updated.
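   As an example, for a simple additive IV with init_expr 0 and step_expr 1
   that counts the scalar iterations, the code below computes
   ni = init_expr + niters * step_expr == niters, and that value becomes
   the PHI argument for edge UPDATE_E.  */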
1566 static void
1567 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1568 tree niters, edge update_e)
1570 gphi_iterator gsi, gsi1;
1571 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1572 basic_block update_bb = update_e->dest;
1573 basic_block exit_bb = single_exit (loop)->dest;
1575 /* Make sure there exists a single-predecessor exit bb: */
1576 gcc_assert (single_pred_p (exit_bb));
1577 gcc_assert (single_succ_edge (exit_bb) == update_e);
1579 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1580 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1581 gsi_next (&gsi), gsi_next (&gsi1))
1583 tree init_expr;
1584 tree step_expr, off;
1585 tree type;
1586 tree var, ni, ni_name;
1587 gimple_stmt_iterator last_gsi;
1589 gphi *phi = gsi.phi ();
1590 gphi *phi1 = gsi1.phi ();
1591 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "vect_update_ivs_after_vectorizer: phi: %G",
1595 (gimple *) phi);
1597 /* Skip reduction and virtual phis. */
1598 if (!iv_phi_p (phi_info))
1600 if (dump_enabled_p ())
1601 dump_printf_loc (MSG_NOTE, vect_location,
1602 "reduc or virtual phi. skip.\n");
1603 continue;
1606 type = TREE_TYPE (gimple_phi_result (phi));
1607 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1608 step_expr = unshare_expr (step_expr);
1610 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1611 of degree >= 2 or exponential. */
1612 gcc_assert (!tree_is_chrec (step_expr));
1614 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1615 gimple_seq stmts = NULL;
1616 enum vect_induction_op_type induction_type
1617 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1619 if (induction_type == vect_step_op_add)
1621 tree stype = TREE_TYPE (step_expr);
1622 off = fold_build2 (MULT_EXPR, stype,
1623 fold_convert (stype, niters), step_expr);
1624 if (POINTER_TYPE_P (type))
1625 ni = fold_build_pointer_plus (init_expr, off);
1626 else
1627 ni = fold_convert (type,
1628 fold_build2 (PLUS_EXPR, stype,
1629 fold_convert (stype, init_expr),
1630 off));
1632 /* Don't bother calling vect_peel_nonlinear_iv_init. */
1633 else if (induction_type == vect_step_op_neg)
1634 ni = init_expr;
1635 else
1636 ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
1637 niters, step_expr,
1638 induction_type);
1640 var = create_tmp_var (type, "tmp");
1642 last_gsi = gsi_last_bb (exit_bb);
1643 gimple_seq new_stmts = NULL;
1644 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1645 /* Exit_bb shouldn't be empty. */
1646 if (!gsi_end_p (last_gsi))
1648 gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
1649 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1651 else
1653 gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
1654 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1657 /* Fix phi expressions in the successor bb. */
1658 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1662 /* Return a gimple value containing the misalignment (measured in vector
1663 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1664 it is away from a perfectly aligned address. Add any new statements
1665 to SEQ. */
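/* For example (hypothetical values): with 4-byte vector elements, a
   16-byte target alignment and a start address of 0x1004, the code below
   computes misalign_in_bytes = 0x1004 & 15 == 4 and
   misalign_in_elems = 4 >> 2 == 1.  */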
1667 static tree
1668 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1670 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1671 stmt_vec_info stmt_info = dr_info->stmt;
1672 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1674 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1675 unsigned HOST_WIDE_INT target_align_c;
1676 tree target_align_minus_1;
1678 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1679 size_zero_node) < 0;
1680 tree offset = (negative
1681 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1682 * TREE_INT_CST_LOW
1683 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1684 : size_zero_node);
1685 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1686 stmt_info, seq,
1687 offset);
1688 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1689 if (target_align.is_constant (&target_align_c))
1690 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1691 else
1693 tree vla = build_int_cst (type, target_align);
1694 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1695 fold_build2 (MINUS_EXPR, type,
1696 build_int_cst (type, 0), vla));
1697 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1698 build_int_cst (type, 1));
1701 HOST_WIDE_INT elem_size
1702 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1703 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1705 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1706 tree int_start_addr = fold_convert (type, start_addr);
1707 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1708 target_align_minus_1);
1710 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1711 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1712 elem_size_log);
1714 return misalign_in_elems;
1717 /* Function vect_gen_prolog_loop_niters
1719 Generate the number of iterations which should be peeled as prolog for the
1720 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1721 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1722 As a result, after the execution of this loop, the data reference DR will
1723 refer to an aligned location. The following computation is generated:
1725 If the misalignment of DR is known at compile time:
1726      addr_mis = DR_MISALIGNMENT (dr);
1727 Else, compute address misalignment in bytes:
1728 addr_mis = addr & (target_align - 1)
1730 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1732 (elem_size = element type size; an element is the scalar element whose type
1733 is the inner type of the vectype)
1735 The computations will be emitted at the end of BB. We also compute and
1736    store the upper bound (inclusive) of the result in BOUND.
1738 When the step of the data-ref in the loop is not 1 (as in interleaved data
1739 and SLP), the number of iterations of the prolog must be divided by the step
1740 (which is equal to the size of interleaved group).
1742 The above formulas assume that VF == number of elements in the vector. This
1743 may not hold when there are multiple-types in the loop.
1744 In this case, for some data-references in the loop the VF does not represent
1745 the number of elements that fit in the vector. Therefore, instead of VF we
1746 use TYPE_VECTOR_SUBPARTS. */
1748 static tree
1749 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1750 basic_block bb, int *bound)
1752 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1753 tree var;
1754 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1755 gimple_seq stmts = NULL, new_stmts = NULL;
1756 tree iters, iters_name;
1757 stmt_vec_info stmt_info = dr_info->stmt;
1758 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1759 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1761 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1763 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1765 if (dump_enabled_p ())
1766 dump_printf_loc (MSG_NOTE, vect_location,
1767 "known peeling = %d.\n", npeel);
1769 iters = build_int_cst (niters_type, npeel);
1770 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1772 else
1774 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1775 tree type = TREE_TYPE (misalign_in_elems);
1776 HOST_WIDE_INT elem_size
1777 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1778 /* We only do prolog peeling if the target alignment is known at compile
1779 time. */
1780 poly_uint64 align_in_elems =
1781 exact_div (target_align, elem_size);
1782 tree align_in_elems_minus_1 =
1783 build_int_cst (type, align_in_elems - 1);
1784 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1786 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1787 & (align_in_elems - 1)). */
1788 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1789 size_zero_node) < 0;
1790 if (negative)
1791 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1792 align_in_elems_tree);
1793 else
1794 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1795 misalign_in_elems);
1796 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1797 iters = fold_convert (niters_type, iters);
1798 unsigned HOST_WIDE_INT align_in_elems_c;
1799 if (align_in_elems.is_constant (&align_in_elems_c))
1800 *bound = align_in_elems_c - 1;
1801 else
1802 *bound = -1;
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location,
1807 "niters for prolog loop: %T\n", iters);
1809 var = create_tmp_var (niters_type, "prolog_loop_niters");
1810 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1812 if (new_stmts)
1813 gimple_seq_add_seq (&stmts, new_stmts);
1814 if (stmts)
1816 gcc_assert (single_succ_p (bb));
1817 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1818 if (gsi_end_p (gsi))
1819 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1820 else
1821 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1823 return iters_name;
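/* Illustrative sketch, not part of the vectorizer: the runtime expression
   built above for a forward-stepping access.  With align_in_elems == 8
   (a power of two) a misalignment of 3 elements yields 5 peeled scalar
   iterations, and a misalignment of 0 yields 0.  */

static unsigned
prolog_niters_sketch (unsigned align_in_elems, unsigned misalign_in_elems)
{
  return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
}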
1827 /* Function vect_update_init_of_dr
1829 If CODE is PLUS, the vector loop starts NITERS iterations after the
1830 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1831 iterations before the scalar one (using masking to skip inactive
1832 elements). This function updates the information recorded in DR to
1833 account for the difference. Specifically, it updates the OFFSET
1834 field of DR_INFO. */
1836 static void
1837 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1839 struct data_reference *dr = dr_info->dr;
1840 tree offset = dr_info->offset;
1841 if (!offset)
1842 offset = build_zero_cst (sizetype);
1844 niters = fold_build2 (MULT_EXPR, sizetype,
1845 fold_convert (sizetype, niters),
1846 fold_convert (sizetype, DR_STEP (dr)));
1847 offset = fold_build2 (code, sizetype,
1848 fold_convert (sizetype, offset), niters);
1849 dr_info->offset = offset;
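/* Illustrative sketch, not part of the vectorizer: the offset adjustment
   above in plain integer arithmetic for the PLUS_EXPR case.  A data
   reference with a byte step of 4 that is peeled by 3 iterations has its
   recorded offset advanced by 12 bytes.  */

static long
dr_offset_after_peeling_sketch (long offset, long niters, long step)
{
  return offset + niters * step;
}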
1853 /* Function vect_update_inits_of_drs
1855    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1856    CODE and NITERS are as for vect_update_init_of_dr.  */
1858 void
1859 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1860 tree_code code)
1862 unsigned int i;
1863 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1864 struct data_reference *dr;
1866 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1868 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1869      here, but since we might use these niters to update the epilogue's niters
1870 and data references we can't insert them here as this definition might not
1871 always dominate its uses. */
1872 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1873 niters = fold_convert (sizetype, niters);
1875 FOR_EACH_VEC_ELT (datarefs, i, dr)
1877 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1878 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1879 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1880 vect_update_init_of_dr (dr_info, niters, code);
1884 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1885 by masking. This involves calculating the number of iterations to
1886 be peeled and then aligning all memory references appropriately. */
1888 void
1889 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1891 tree misalign_in_elems;
1892 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1894 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1896 /* From the information recorded in LOOP_VINFO get the number of iterations
1897 that need to be skipped via masking. */
1898 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1900 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1901 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1902 misalign_in_elems = build_int_cst (type, misalign);
1904 else
1906 gimple_seq seq1 = NULL, seq2 = NULL;
1907 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1908 misalign_in_elems = fold_convert (type, misalign_in_elems);
1909 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1910 &seq2, true, NULL_TREE);
1911 gimple_seq_add_seq (&seq1, seq2);
1912 if (seq1)
1914 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1915 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1916 gcc_assert (!new_bb);
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "misalignment for fully-masked loop: %T\n",
1923 misalign_in_elems);
1925 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1927 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
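/* Worked example (illustrative, not from the source): with VF == 8 and a
   known peeling amount of 3, the computation above records 8 - 3 == 5 as the
   number of initial elements to skip via masking, so the first vector
   iteration handles only the last 3 lanes and the access becomes aligned.  */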
1930 /* This function builds ni_name = number of iterations. Statements
1931 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1932    it to TRUE if a new ssa_var is generated.  */
1934 tree
1935 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1937 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1938 if (TREE_CODE (ni) == INTEGER_CST)
1939 return ni;
1940 else
1942 tree ni_name, var;
1943 gimple_seq stmts = NULL;
1944 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1946 var = create_tmp_var (TREE_TYPE (ni), "niters");
1947 ni_name = force_gimple_operand (ni, &stmts, false, var);
1948 if (stmts)
1950 gsi_insert_seq_on_edge_immediate (pe, stmts);
1951 if (new_var_p != NULL)
1952 *new_var_p = true;
1955 return ni_name;
1959 /* Calculate the number of iterations above which the vectorized loop will be
1960    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1961 of prolog loop. If it's integer const, the integer number is also passed
1962 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1963 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1964 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1965 threshold below which the scalar (rather than vectorized) loop will be
1966 executed. This function stores the upper bound (inclusive) of the result
1967 in BOUND_SCALAR. */
1969 static tree
1970 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1971 int bound_prolog, poly_int64 bound_epilog, int th,
1972 poly_uint64 *bound_scalar,
1973 bool check_profitability)
1975 tree type = TREE_TYPE (niters_prolog);
1976 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1977 build_int_cst (type, bound_epilog));
1979 *bound_scalar = bound_prolog + bound_epilog;
1980 if (check_profitability)
1982 /* TH indicates the minimum niters of vectorized loop, while we
1983 compute the maximum niters of scalar loop. */
1984 th--;
1985 /* Peeling for constant times. */
1986 if (int_niters_prolog >= 0)
1988 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1989 return build_int_cst (type, *bound_scalar);
1991 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1992 and BOUND_EPILOG are inclusive upper bounds. */
1993 if (known_ge (th, bound_prolog + bound_epilog))
1995 *bound_scalar = th;
1996 return build_int_cst (type, th);
1998 /* Need to do runtime comparison. */
1999 else if (maybe_gt (th, bound_epilog))
2001 *bound_scalar = upper_bound (*bound_scalar, th);
2002 return fold_build2 (MAX_EXPR, type,
2003 build_int_cst (type, th), niters);
2006 return niters;
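/* Illustrative sketch, not part of the vectorizer: the constant-prolog
   clamping above expressed as a hypothetical helper.  With a profitability
   threshold TH the scalar loop runs at most TH - 1 iterations, so e.g.
   niters_prolog == 2, bound_epilog == 7 and th == 12 give
   max (2 + 7, 11) == 11.  */

static long
scalar_niters_bound_sketch (long int_niters_prolog, long bound_epilog, long th)
{
  long bound = int_niters_prolog + bound_epilog;
  long th_bound = th - 1;
  return bound > th_bound ? bound : th_bound;
}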
2009 /* NITERS is the number of times that the original scalar loop executes
2010 after peeling. Work out the maximum number of iterations N that can
2011 be handled by the vectorized form of the loop and then either:
2013 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2015 niters_vector = N
2017 b) set *STEP_VECTOR_PTR to one and generate:
2019 niters_vector = N / vf
2021 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2022 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2023 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
2025 void
2026 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2027 tree *niters_vector_ptr, tree *step_vector_ptr,
2028 bool niters_no_overflow)
2030 tree ni_minus_gap, var;
2031 tree niters_vector, step_vector, type = TREE_TYPE (niters);
2032 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2033 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2034 tree log_vf = NULL_TREE;
2036 /* If epilogue loop is required because of data accesses with gaps, we
2037 subtract one iteration from the total number of iterations here for
2038 correct calculation of RATIO. */
2039 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2041 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2042 build_one_cst (type));
2043 if (!is_gimple_val (ni_minus_gap))
2045 var = create_tmp_var (type, "ni_gap");
2046 gimple *stmts = NULL;
2047 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2048 true, var);
2049 gsi_insert_seq_on_edge_immediate (pe, stmts);
2052 else
2053 ni_minus_gap = niters;
2055 /* To silence some unexpected warnings, simply initialize to 0. */
2056 unsigned HOST_WIDE_INT const_vf = 0;
2057 if (vf.is_constant (&const_vf)
2058 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2060 /* Create: niters >> log2(vf) */
2061 /* If it's known that niters == number of latch executions + 1 doesn't
2062 overflow, we can generate niters >> log2(vf); otherwise we generate
2063 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2064 will be at least one. */
2065 log_vf = build_int_cst (type, exact_log2 (const_vf));
2066 if (niters_no_overflow)
2067 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2068 else
2069 niters_vector
2070 = fold_build2 (PLUS_EXPR, type,
2071 fold_build2 (RSHIFT_EXPR, type,
2072 fold_build2 (MINUS_EXPR, type,
2073 ni_minus_gap,
2074 build_int_cst (type, vf)),
2075 log_vf),
2076 build_int_cst (type, 1));
2077 step_vector = build_one_cst (type);
2079 else
2081 niters_vector = ni_minus_gap;
2082 step_vector = build_int_cst (type, vf);
2085 if (!is_gimple_val (niters_vector))
2087 var = create_tmp_var (type, "bnd");
2088 gimple_seq stmts = NULL;
2089 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2090 gsi_insert_seq_on_edge_immediate (pe, stmts);
2091       /* The peeling algorithm guarantees that the vector loop bound is at least
2092 	 ONE; we set range information to make the niters analyzer's life easier.
2093 	 Note the number of latch iterations can be TYPE_MAX_VALUE so
2094 	 we have to represent the vector niter (TYPE_MAX_VALUE + 1) >> log_vf.  */
2095 if (stmts != NULL && log_vf)
2097 if (niters_no_overflow)
2099 value_range vr (type,
2100 wi::one (TYPE_PRECISION (type)),
2101 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2102 TYPE_SIGN (type)),
2103 exact_log2 (const_vf),
2104 TYPE_SIGN (type)));
2105 set_range_info (niters_vector, vr);
2107 /* For VF == 1 the vector IV might also overflow so we cannot
2108 assert a minimum value of 1. */
2109 else if (const_vf > 1)
2111 value_range vr (type,
2112 wi::one (TYPE_PRECISION (type)),
2113 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2114 TYPE_SIGN (type))
2115 - (const_vf - 1),
2116 exact_log2 (const_vf), TYPE_SIGN (type))
2117 + 1);
2118 set_range_info (niters_vector, vr);
2122 *niters_vector_ptr = niters_vector;
2123 *step_vector_ptr = step_vector;
2125 return;
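/* Illustrative sketch, not part of the vectorizer: the two shift-based
   forms of the vector iteration count used above, assuming a constant
   power-of-two VF.  Both give 4 for niters == 17 and vf == 4; the second
   form additionally tolerates NITERS wrapping to 0, relying only on the
   guarantee that at least VF iterations remain.  */

static unsigned long
vector_niters_sketch (unsigned long niters, unsigned long vf,
		      int niters_no_overflow)
{
  unsigned long log_vf = __builtin_ctzl (vf);
  if (niters_no_overflow)
    return niters >> log_vf;
  return ((niters - vf) >> log_vf) + 1;
}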
2128 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2129 loop specified by LOOP_VINFO after vectorization, compute the number
2130 of iterations before vectorization (niters_vector * vf) and store it
2131 to NITERS_VECTOR_MULT_VF_PTR. */
2133 static void
2134 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2135 tree niters_vector,
2136 tree *niters_vector_mult_vf_ptr)
2138 /* We should be using a step_vector of VF if VF is variable. */
2139 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2140 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2141 tree type = TREE_TYPE (niters_vector);
2142 tree log_vf = build_int_cst (type, exact_log2 (vf));
2143 basic_block exit_bb = single_exit (loop)->dest;
2145 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2146 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2147 niters_vector, log_vf);
2148 if (!is_gimple_val (niters_vector_mult_vf))
2150 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2151 gimple_seq stmts = NULL;
2152 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2153 &stmts, true, var);
2154 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2155 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2157 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2160 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2161    This function searches for the corresponding lcssa phi node in the exit
2162    bb of LOOP.  If it is found, return the phi result; otherwise return
2163    NULL.  */
2165 static tree
2166 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2167 gphi *lcssa_phi)
2169 gphi_iterator gsi;
2170 edge e = single_exit (loop);
2172 gcc_assert (single_pred_p (e->dest));
2173 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2175 gphi *phi = gsi.phi ();
2176 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2177 PHI_ARG_DEF (lcssa_phi, 0), 0))
2178 return PHI_RESULT (phi);
2180 return NULL_TREE;
2183 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2184    from SECOND/FIRST and puts it at the original loop's preheader/exit
2185    edge; the two loops are arranged as below:
2187 preheader_a:
2188 first_loop:
2189 header_a:
2190 i_1 = PHI<i_0, i_2>;
2192 i_2 = i_1 + 1;
2193 if (cond_a)
2194 goto latch_a;
2195 else
2196 goto between_bb;
2197 latch_a:
2198 goto header_a;
2200 between_bb:
2201 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2203 second_loop:
2204 header_b:
2205 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2206 or with i_2 if no LCSSA phi is created
2207 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2209 i_4 = i_3 + 1;
2210 if (cond_b)
2211 goto latch_b;
2212 else
2213 goto exit_bb;
2214 latch_b:
2215 goto header_b;
2217 exit_bb:
2219    This function creates loop closed SSA for the first loop and updates the
2220    second loop's PHI nodes by replacing the argument on the incoming edge with
2221    the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2222    is false, loop closed ssa phis will only be created for non-iv phis of
2223    the first loop.
2225    This function assumes the exit bb of the first loop is the preheader bb of
2226    the second loop, i.e., between_bb in the example code.  With the PHIs
2227    updated, the second loop will execute the remaining iterations of the first.  */
2229 static void
2230 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2231 class loop *first, class loop *second,
2232 bool create_lcssa_for_iv_phis)
2234 gphi_iterator gsi_update, gsi_orig;
2235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2238 edge second_preheader_e = loop_preheader_edge (second);
2239 basic_block between_bb = single_exit (first)->dest;
2241 gcc_assert (between_bb == second_preheader_e->src);
2242 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2243 /* Either the first loop or the second is the loop to be vectorized. */
2244 gcc_assert (loop == first || loop == second);
2246 for (gsi_orig = gsi_start_phis (first->header),
2247 gsi_update = gsi_start_phis (second->header);
2248 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2249 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2251 gphi *orig_phi = gsi_orig.phi ();
2252 gphi *update_phi = gsi_update.phi ();
2254 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2255 /* Generate lcssa PHI node for the first loop. */
2256 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2257 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2258 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2260 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2261 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2262 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2263 arg = new_res;
2266 /* Update PHI node in the second loop by replacing arg on the loop's
2267 incoming edge. */
2268 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2271 /* For epilogue peeling we have to make sure to copy all LC PHIs
2272 for correct vectorization of live stmts. */
2273 if (loop == first)
2275 basic_block orig_exit = single_exit (second)->dest;
2276 for (gsi_orig = gsi_start_phis (orig_exit);
2277 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2279 gphi *orig_phi = gsi_orig.phi ();
2280 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2281 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2282 continue;
2284 /* Already created in the above loop. */
2285 if (find_guard_arg (first, second, orig_phi))
2286 continue;
2288 tree new_res = copy_ssa_name (orig_arg);
2289 gphi *lcphi = create_phi_node (new_res, between_bb);
2290 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2295 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2296 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2297 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2298 appear like below:
2300 guard_bb:
2301 if (cond)
2302 goto merge_bb;
2303 else
2304 goto skip_loop;
2306 skip_loop:
2307 header_a:
2308 i_1 = PHI<i_0, i_2>;
2310 i_2 = i_1 + 1;
2311 if (cond_a)
2312 goto latch_a;
2313 else
2314 goto exit_a;
2315 latch_a:
2316 goto header_a;
2318 exit_a:
2319 i_5 = PHI<i_2>;
2321 merge_bb:
2322 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2324 update_loop:
2325 header_b:
2326 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2328 i_4 = i_3 + 1;
2329 if (cond_b)
2330 goto latch_b;
2331 else
2332 goto exit_bb;
2333 latch_b:
2334 goto header_b;
2336 exit_bb:
2338 This function creates PHI nodes at merge_bb and replaces the use of i_5
2339    in the update_loop's PHI node with the result of the new PHI.  */
2341 static void
2342 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2343 class loop *update_loop,
2344 edge guard_edge, edge merge_edge)
2346 location_t merge_loc, guard_loc;
2347 edge orig_e = loop_preheader_edge (skip_loop);
2348 edge update_e = loop_preheader_edge (update_loop);
2349 gphi_iterator gsi_orig, gsi_update;
2351 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2352 gsi_update = gsi_start_phis (update_loop->header));
2353 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2354 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2356 gphi *orig_phi = gsi_orig.phi ();
2357 gphi *update_phi = gsi_update.phi ();
2359 /* Generate new phi node at merge bb of the guard. */
2360 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2361 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2363 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2364 args in NEW_PHI for these edges. */
2365 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2366 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2367 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2368 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2369 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2370 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2372 /* Update phi in UPDATE_PHI. */
2373 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2377 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2378 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2379 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2380 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2381 The CFG looks like:
2383 loop:
2384 header_a:
2385 i_1 = PHI<i_0, i_2>;
2387 i_2 = i_1 + 1;
2388 if (cond_a)
2389 goto latch_a;
2390 else
2391 goto exit_a;
2392 latch_a:
2393 goto header_a;
2395 exit_a:
2397 guard_bb:
2398 if (cond)
2399 goto merge_bb;
2400 else
2401 goto epilog_loop;
2403 ;; fall_through_bb
2405 epilog_loop:
2406 header_b:
2407 i_3 = PHI<i_2, i_4>;
2409 i_4 = i_3 + 1;
2410 if (cond_b)
2411 goto latch_b;
2412 else
2413 goto merge_bb;
2414 latch_b:
2415 goto header_b;
2417 merge_bb:
2418 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2420 exit_bb:
2421 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2423    For each name used outside EPILOG (i.e. for each name that has an lcssa
2424 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2425 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2426 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2427 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2428 in exit_bb will also be updated. */
2430 static void
2431 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2432 edge guard_edge, edge merge_edge)
2434 gphi_iterator gsi;
2435 basic_block merge_bb = guard_edge->dest;
2437 gcc_assert (single_succ_p (merge_bb));
2438 edge e = single_succ_edge (merge_bb);
2439 basic_block exit_bb = e->dest;
2440 gcc_assert (single_pred_p (exit_bb));
2441 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2443 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2445 gphi *update_phi = gsi.phi ();
2446 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2448 tree merge_arg = NULL_TREE;
2450 /* If the old argument is a SSA_NAME use its current_def. */
2451 if (TREE_CODE (old_arg) == SSA_NAME)
2452 merge_arg = get_current_def (old_arg);
2453 /* If it's a constant or doesn't have a current_def, just use the old
2454 argument. */
2455 if (!merge_arg)
2456 merge_arg = old_arg;
2458 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2459 /* If the var is live after loop but not a reduction, we simply
2460 use the old arg. */
2461 if (!guard_arg)
2462 guard_arg = old_arg;
2464 /* Create new phi node in MERGE_BB: */
2465 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2466 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2468       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2469 the two PHI args in merge_phi for these edges. */
2470 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2471 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2473 /* Update the original phi in exit_bb. */
2474 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2478 /* EPILOG loop is duplicated from the original loop for vectorizing,
2479 the arg of its loop closed ssa PHI needs to be updated. */
2481 static void
2482 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2484 gphi_iterator gsi;
2485 basic_block exit_bb = single_exit (epilog)->dest;
2487 gcc_assert (single_pred_p (exit_bb));
2488 edge e = EDGE_PRED (exit_bb, 0);
2489 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2490 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2493 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2494 iterate exactly CONST_NITERS times. Make a final decision about
2495 whether the epilogue loop should be used, returning true if so. */
2497 static bool
2498 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2499 unsigned HOST_WIDE_INT const_niters)
2501 /* Avoid wrap-around when computing const_niters - 1. Also reject
2502 using an epilogue loop for a single scalar iteration, even if
2503 we could in principle implement that using partial vectors. */
2504 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2505 if (const_niters <= gap_niters + 1)
2506 return false;
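    /* Worked example (illustrative, not from the source): with 1 gap iteration
       an epilogue of const_niters == 2 is rejected above (2 <= 1 + 1), since
       only one useful scalar iteration would remain; the same test also keeps
       the const_niters - 1 computation below from wrapping around when
       const_niters is 0.  */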
2508 /* Install the number of iterations. */
2509 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2510 tree niters_tree = build_int_cst (niters_type, const_niters);
2511 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2513 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2514 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2516 /* Decide what to do if the number of epilogue iterations is not
2517 a multiple of the epilogue loop's vectorization factor. */
2518 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2521 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2522 Return a value that equals:
2524 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2525 - SKIP_VALUE when the main loop is skipped. */
2527 tree
2528 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2529 tree skip_value)
2531 gcc_assert (loop_vinfo->main_loop_edge);
2533 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2534 basic_block bb = loop_vinfo->main_loop_edge->dest;
2535 gphi *new_phi = create_phi_node (phi_result, bb);
2536 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2537 UNKNOWN_LOCATION);
2538 add_phi_arg (new_phi, skip_value,
2539 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2540 return phi_result;
2543 /* Function vect_do_peeling.
2545 Input:
2546 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2548 preheader:
2549 LOOP:
2550 header_bb:
2551 loop_body
2552 if (exit_loop_cond) goto exit_bb
2553 else goto header_bb
2554 exit_bb:
2556 - NITERS: The number of iterations of the loop.
2557 - NITERSM1: The number of iterations of the loop's latch.
2558 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2559 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2560 CHECK_PROFITABILITY is true.
2561 Output:
2562 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2563 iterate after vectorization; see vect_set_loop_condition for details.
2564 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2565 should be set to the number of scalar iterations handled by the
2566 vector loop. The SSA name is only used on exit from the loop.
2568 This function peels prolog and epilog from the loop, adds guards skipping
2569 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2570 would look like:
2572 guard_bb_1:
2573 if (prefer_scalar_loop) goto merge_bb_1
2574 else goto guard_bb_2
2576 guard_bb_2:
2577 if (skip_prolog) goto merge_bb_2
2578 else goto prolog_preheader
2580 prolog_preheader:
2581 PROLOG:
2582 prolog_header_bb:
2583 prolog_body
2584 if (exit_prolog_cond) goto prolog_exit_bb
2585 else goto prolog_header_bb
2586 prolog_exit_bb:
2588 merge_bb_2:
2590 vector_preheader:
2591 VECTOR LOOP:
2592 vector_header_bb:
2593 vector_body
2594 if (exit_vector_cond) goto vector_exit_bb
2595 else goto vector_header_bb
2596 vector_exit_bb:
2598 guard_bb_3:
2599 if (skip_epilog) goto merge_bb_3
2600 else goto epilog_preheader
2602 merge_bb_1:
2604 epilog_preheader:
2605 EPILOG:
2606 epilog_header_bb:
2607 epilog_body
2608 if (exit_epilog_cond) goto merge_bb_3
2609 else goto epilog_header_bb
2611 merge_bb_3:
2613    Note that this function peels the prolog and epilog, and adds the
2614    corresponding guards, only when necessary.
2615 This function returns the epilogue loop if a decision was made to vectorize
2616 it, otherwise NULL.
2618 The analysis resulting in this epilogue loop's loop_vec_info was performed
2619 in the same vect_analyze_loop call as the main loop's. At that time
2620 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2621 vectorization factors than the main loop. This list is stored in the main
2622    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2623 vectorize the epilogue loop for a lower vectorization factor, the
2624 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2625 updated and linked to the epilogue loop. This is later used to vectorize
2626 the epilogue. The reason the loop_vec_info needs updating is that it was
2627 constructed based on the original main loop, and the epilogue loop is a
2628 copy of this loop, so all links pointing to statements in the original loop
2629 need updating. Furthermore, these loop_vec_infos share the
2630 data_reference's records, which will also need to be updated.
2632 TODO: Guard for prefer_scalar_loop should be emitted along with
2633 versioning conditions if loop versioning is needed. */
2636 class loop *
2637 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2638 tree *niters_vector, tree *step_vector,
2639 tree *niters_vector_mult_vf_var, int th,
2640 bool check_profitability, bool niters_no_overflow,
2641 tree *advance)
2643 edge e, guard_e;
2644 tree type = TREE_TYPE (niters), guard_cond;
2645 basic_block guard_bb, guard_to;
2646 profile_probability prob_prolog, prob_vector, prob_epilog;
2647 int estimated_vf;
2648 int prolog_peeling = 0;
2649 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2650 bool vect_epilogues_updated_niters = false;
2651 /* We currently do not support prolog peeling if the target alignment is not
2652 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2653 target alignment being constant. */
2654 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2655 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2656 return NULL;
2658 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2659 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2661 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2662 poly_uint64 bound_epilog = 0;
2663 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2664 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2665 bound_epilog += vf - 1;
2666 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2667 bound_epilog += 1;
2668 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2669 poly_uint64 bound_scalar = bound_epilog;
2671 if (!prolog_peeling && !epilog_peeling)
2672 return NULL;
2674 /* Before doing any peeling make sure to reset debug binds outside of
2675      the loop referring to defs not in LC SSA.  */
2676 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2677 for (unsigned i = 0; i < loop->num_nodes; ++i)
2679 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2680 imm_use_iterator ui;
2681 gimple *use_stmt;
2682 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2683 gsi_next (&gsi))
2685 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2686 if (gimple_debug_bind_p (use_stmt)
2687 && loop != gimple_bb (use_stmt)->loop_father
2688 && !flow_loop_nested_p (loop,
2689 gimple_bb (use_stmt)->loop_father))
2691 gimple_debug_bind_reset_value (use_stmt);
2692 update_stmt (use_stmt);
2695 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2696 gsi_next (&gsi))
2698 ssa_op_iter op_iter;
2699 def_operand_p def_p;
2700 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2701 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2702 if (gimple_debug_bind_p (use_stmt)
2703 && loop != gimple_bb (use_stmt)->loop_father
2704 && !flow_loop_nested_p (loop,
2705 gimple_bb (use_stmt)->loop_father))
2707 gimple_debug_bind_reset_value (use_stmt);
2708 update_stmt (use_stmt);
2713 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2714 estimated_vf = vect_vf_for_cost (loop_vinfo);
2715 if (estimated_vf == 2)
2716 estimated_vf = 3;
2717 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2718 .apply_scale (estimated_vf - 1, estimated_vf);
2720 class loop *prolog, *epilog = NULL;
2721 class loop *first_loop = loop;
2722 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2724 /* SSA form needs to be up-to-date since we are going to manually
2725 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2726      SSA update state after that, so we have to make sure not to lose any
2727 pending update needs. */
2728 gcc_assert (!need_ssa_update_p (cfun));
2730 /* If we're vectorizing an epilogue loop, we have ensured that the
2731 virtual operand is in SSA form throughout the vectorized main loop.
2732 Normally it is possible to trace the updated
2733 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2734 back to scalar-stmt vuses, meaning that the effect of the SSA update
2735 remains local to the main loop. However, there are rare cases in
2736 which the vectorized loop should have vdefs even when the original scalar
2737 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2738 introduces clobbers of the temporary vector array, which in turn
2739 needs new vdefs. If the scalar loop doesn't write to memory, these
2740 new vdefs will be the only ones in the vector loop.
2741      We are currently deferring the update of virtual SSA form and the creation
2742      of a virtual PHI for this case, so we do not have to make sure the
2743 newly introduced virtual def is in LCSSA form. */
2745 if (MAY_HAVE_DEBUG_BIND_STMTS)
2747 gcc_assert (!adjust_vec.exists ());
2748 adjust_vec.create (32);
2750 initialize_original_copy_tables ();
2752 /* Record the anchor bb at which the guard should be placed if the scalar
2753 loop might be preferred. */
2754 basic_block anchor = loop_preheader_edge (loop)->src;
2756 /* Generate the number of iterations for the prolog loop. We do this here
2757 so that we can also get the upper bound on the number of iterations. */
2758 tree niters_prolog;
2759 int bound_prolog = 0;
2760 if (prolog_peeling)
2761 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2762 &bound_prolog);
2763 else
2764 niters_prolog = build_int_cst (type, 0);
2766 loop_vec_info epilogue_vinfo = NULL;
2767 if (vect_epilogues)
2769 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2770 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2773 tree niters_vector_mult_vf = NULL_TREE;
2774   /* Save NITERS before the loop, as it may be changed by the prologue.  */
2775 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2776 edge update_e = NULL, skip_e = NULL;
2777 unsigned int lowest_vf = constant_lower_bound (vf);
2778 /* If we know the number of scalar iterations for the main loop we should
2779 check whether after the main loop there are enough iterations left over
2780 for the epilogue. */
2781 if (vect_epilogues
2782 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2783 && prolog_peeling >= 0
2784 && known_eq (vf, lowest_vf))
2786 unsigned HOST_WIDE_INT eiters
2787 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2788 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2790 eiters -= prolog_peeling;
2791 eiters
2792 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
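      /* Worked example (illustrative, not from the source): 100 scalar
	 iterations, 1 gap iteration, prolog_peeling == 3 and lowest_vf == 8
	 leave eiters = (100 - 1 - 3) % 8 + 1 == 1 iteration for the epilogue.  */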
2794 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2796 delete epilogue_vinfo;
2797 epilogue_vinfo = NULL;
2798 if (loop_vinfo->epilogue_vinfos.length () == 0)
2800 vect_epilogues = false;
2801 break;
2803 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2804 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2806 vect_epilogues_updated_niters = true;
2808 /* Prolog loop may be skipped. */
2809 bool skip_prolog = (prolog_peeling != 0);
2810   /* Skip to the epilog when there are not enough iterations to enter the
2811      vectorized loop.  If true we should perform runtime checks on the NITERS
2812      to decide whether the current vectorized loop should be skipped.  If we
2813      know the number of scalar iterations, we choose to add such a runtime
2814      check when this number may potentially be smaller than the number of
2815      iterations required to enter the vectorized loop; for this we use the
2816      upper bounds on the prolog and epilog peeling.  When we don't know the
2817      number of iterations and don't require versioning it is because we have
2818      asserted that there are enough scalar iterations to enter the main loop,
2819      so this skip is not necessary.  When we are versioning then we only add
2820      such a skip if we have chosen to vectorize the epilogue.  */
2822 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2823 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2824 bound_prolog + bound_epilog)
2825 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2826 || vect_epilogues));
2827 /* Epilog loop must be executed if the number of iterations for epilog
2828 loop is known at compile time, otherwise we need to add a check at
2829 the end of vector loop and skip to the end of epilog loop. */
2830 bool skip_epilog = (prolog_peeling < 0
2831 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2832 || !vf.is_constant ());
2833 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2834 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2835 skip_epilog = false;
2837 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2838 auto_vec<profile_count> original_counts;
2839 basic_block *original_bbs = NULL;
2841 if (skip_vector)
2843 split_edge (loop_preheader_edge (loop));
2845 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
2847 original_bbs = get_loop_body (loop);
2848 for (unsigned int i = 0; i < loop->num_nodes; i++)
2849 original_counts.safe_push(original_bbs[i]->count);
2852 /* Due to the order in which we peel prolog and epilog, we first
2853 propagate probability to the whole loop. The purpose is to
2854 avoid adjusting probabilities of both prolog and vector loops
2855 separately. Note in this case, the probability of epilog loop
2856 needs to be scaled back later. */
2857 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2858 if (prob_vector.initialized_p ())
2860 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2861 scale_loop_profile (loop, prob_vector, 0);
2865 if (vect_epilogues)
2866 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2867 use the original scalar loop as remaining epilogue if necessary. */
2868 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2869 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2871 if (prolog_peeling)
2873 e = loop_preheader_edge (loop);
2874 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
2876 /* Peel prolog and put it on preheader edge of loop. */
2877 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2878 gcc_assert (prolog);
2879 prolog->force_vectorize = false;
2880 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2881 first_loop = prolog;
2882 reset_original_copy_tables ();
2884 /* Update the number of iterations for prolog loop. */
2885 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2886 vect_set_loop_condition (prolog, NULL, niters_prolog,
2887 step_prolog, NULL_TREE, false);
2889 /* Skip the prolog loop. */
2890 if (skip_prolog)
2892 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2893 niters_prolog, build_int_cst (type, 0));
2894 guard_bb = loop_preheader_edge (prolog)->src;
2895 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2896 guard_to = split_edge (loop_preheader_edge (loop));
2897 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2898 guard_to, guard_bb,
2899 prob_prolog.invert (),
2900 irred_flag);
2901 e = EDGE_PRED (guard_to, 0);
2902 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2903 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2905 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2906 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2909 /* Update init address of DRs. */
2910 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2911 /* Update niters for vector loop. */
2912 LOOP_VINFO_NITERS (loop_vinfo)
2913 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2914 LOOP_VINFO_NITERSM1 (loop_vinfo)
2915 = fold_build2 (MINUS_EXPR, type,
2916 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2917 bool new_var_p = false;
2918 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2919 /* It's guaranteed that vector loop bound before vectorization is at
2920 least VF, so set range information for newly generated var. */
2921 if (new_var_p)
2923 value_range vr (type,
2924 wi::to_wide (build_int_cst (type, vf)),
2925 wi::to_wide (TYPE_MAX_VALUE (type)));
2926 set_range_info (niters, vr);
2929 /* Prolog iterates at most bound_prolog times, latch iterates at
2930 most bound_prolog - 1 times. */
2931 record_niter_bound (prolog, bound_prolog - 1, false, true);
2932 delete_update_ssa ();
2933 adjust_vec_debug_stmts ();
2934 scev_reset ();
2937 if (epilog_peeling)
2939 e = single_exit (loop);
2940 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
2942 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2943 said epilog then we should use a copy of the main loop as a starting
2944 point. This loop may have already had some preliminary transformations
2945 to allow for more optimal vectorization, for example if-conversion.
2946 If we are not vectorizing the epilog then we should use the scalar loop
2947 as the transformations mentioned above make less or no sense when not
2948 vectorizing. */
2949 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2950 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2951 gcc_assert (epilog);
2953 epilog->force_vectorize = false;
2954 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2956       /* The scalar version of the loop may be preferred.  In this case, add a guard
2957 and skip to epilog. Note this only happens when the number of
2958 iterations of loop is unknown at compile time, otherwise this
2959 won't be vectorized. */
2960 if (skip_vector)
2962 /* Additional epilogue iteration is peeled if gap exists. */
2963 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2964 bound_prolog, bound_epilog,
2965 th, &bound_scalar,
2966 check_profitability);
2967 /* Build guard against NITERSM1 since NITERS may overflow. */
2968 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2969 guard_bb = anchor;
2970 guard_to = split_edge (loop_preheader_edge (epilog));
2971 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2972 guard_to, guard_bb,
2973 prob_vector.invert (),
2974 irred_flag);
2975 skip_e = guard_e;
2976 e = EDGE_PRED (guard_to, 0);
2977 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2978 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2980 /* Simply propagate profile info from guard_bb to guard_to which is
2981 a merge point of control flow. */
2982 guard_to->count = guard_bb->count;
2984 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
2985 if (vect_epilogues || scalar_loop == NULL)
2987 gcc_assert(epilog->num_nodes == loop->num_nodes);
2988 basic_block *bbs = get_loop_body (epilog);
2989 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2991 gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
2992 bbs[i]->count = original_counts[i];
2994 free (bbs);
2995 free (original_bbs);
2999 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
3000       /* If the loop is peeled a non-zero constant number of times, niters now
3001 	 refers to orig_niters - prolog_peeling; it won't overflow even if
3002 	 orig_niters overflows.  */
3003 niters_no_overflow |= (prolog_peeling > 0);
3004 vect_gen_vector_loop_niters (loop_vinfo, niters,
3005 niters_vector, step_vector,
3006 niters_no_overflow);
3007 if (!integer_onep (*step_vector))
3009 	  /* On exit from the loop we will have an easy way of calculating
3010 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3011 until then. */
3012 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3013 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3014 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3016 else
3017 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3018 &niters_vector_mult_vf);
3019 /* Update IVs of original loop as if they were advanced by
3020 niters_vector_mult_vf steps. */
3021 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3022 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3023 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3024 update_e);
3026 if (skip_epilog)
3028 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3029 niters, niters_vector_mult_vf);
3030 guard_bb = single_exit (loop)->dest;
3031 guard_to = split_edge (single_exit (epilog));
3032 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3033 skip_vector ? anchor : guard_bb,
3034 prob_epilog.invert (),
3035 irred_flag);
3036 if (vect_epilogues)
3037 epilogue_vinfo->skip_this_loop_edge = guard_e;
3038 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3039 single_exit (epilog));
3040 /* Only need to handle basic block before epilog loop if it's not
3041 the guard_bb, which is the case when skip_vector is true. */
3042 if (guard_bb != bb_before_epilog)
3044 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3046 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3048 scale_loop_profile (epilog, prob_epilog, 0);
3050 else
3051 slpeel_update_phi_nodes_for_lcssa (epilog);
3053 unsigned HOST_WIDE_INT bound;
3054 if (bound_scalar.is_constant (&bound))
3056 gcc_assert (bound != 0);
3057 /* -1 to convert loop iterations to latch iterations. */
3058 record_niter_bound (epilog, bound - 1, false, true);
3061 delete_update_ssa ();
3062 adjust_vec_debug_stmts ();
3063 scev_reset ();
3066 if (vect_epilogues)
3068 epilog->aux = epilogue_vinfo;
3069 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3071 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3073       /* We must now calculate the number of iterations (NITERS) performed by the
3074 	 previous loop and the number (EPILOGUE_NITERS) to be performed by the epilogue.  */
3075 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3076 niters_prolog, niters_vector_mult_vf);
3078       /* If skip_vector is set we may skip the previous loop; insert a phi node to
3079 	 determine whether we are coming from the previous vectorized loop
3080 	 via the update_e edge or from the skip_vector basic block via the
3081 	 skip_e edge.  */
3082 if (skip_vector)
3084 gcc_assert (update_e != NULL
3085 && skip_e != NULL
3086 && !vect_epilogues_updated_niters);
3087 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3088 update_e->dest);
3089 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3090 gimple *stmt = gimple_build_assign (new_ssa, niters);
3091 gimple_stmt_iterator gsi;
3092 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3093 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3095 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3096 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3098 else
3100 gsi = gsi_last_bb (update_e->src);
3101 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3104 niters = new_ssa;
3105 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3106 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3107 UNKNOWN_LOCATION);
3108 niters = PHI_RESULT (new_phi);
3109 epilogue_vinfo->main_loop_edge = update_e;
3110 epilogue_vinfo->skip_main_loop_edge = skip_e;
3113 /* Set ADVANCE to the number of iterations performed by the previous
3114 loop and its prologue. */
3115 *advance = niters;
3117 if (!vect_epilogues_updated_niters)
3119 /* Subtract the number of iterations performed by the vectorized loop
3120 from the number of total iterations. */
3121 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3122 before_loop_niters,
3123 niters);
3125 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3126 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3127 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3128 epilogue_niters,
3129 build_one_cst (TREE_TYPE (epilogue_niters)));
3131 /* Decide what to do if the number of epilogue iterations is not
3132 a multiple of the epilogue loop's vectorization factor.
3133 We should have rejected the loop during the analysis phase
3134 if this fails. */
3135 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3136 true))
3137 gcc_unreachable ();
3141 adjust_vec.release ();
3142 free_original_copy_tables ();
3144 return vect_epilogues ? epilog : NULL;
3147 /* Function vect_create_cond_for_niters_checks.
3149 Create a conditional expression that represents the run-time checks for
3150    the loop's niter.  The loop is guaranteed to terminate if the run-time
3151 checks hold.
3153 Input:
3154 COND_EXPR - input conditional expression. New conditions will be chained
3155                with logical AND operation.
3157    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
3158                assumptions to be checked.
3160 Output:
3161 COND_EXPR - conditional expression.
3163 The returned COND_EXPR is the conditional expression to be used in the
3164 if statement that controls which version of the loop gets executed at
3165 runtime. */
3167 static void
3168 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3170 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3172 if (*cond_expr)
3173 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3174 *cond_expr, part_cond_expr);
3175 else
3176 *cond_expr = part_cond_expr;
3179 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3180 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3182 static void
3183 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3185 if (*cond_expr)
3186 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3187 *cond_expr, part_cond_expr);
3188 else
3189 *cond_expr = part_cond_expr;
3192 /* Function vect_create_cond_for_align_checks.
3194 Create a conditional expression that represents the alignment checks for
3195    all of the data references (array element references) whose alignment must be
3196 checked at runtime.
3198 Input:
3199 COND_EXPR - input conditional expression. New conditions will be chained
3200 with logical AND operation.
3201 LOOP_VINFO - two fields of the loop information are used.
3202 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3203 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3205 Output:
3206 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3207 expression.
3208 The returned value is the conditional expression to be used in the if
3209 statement that controls which version of the loop gets executed at runtime.
3211 The algorithm makes two assumptions:
3212 1) The number of bytes "n" in a vector is a power of 2.
3213 2) An address "a" is aligned if a%n is zero and that this
3214 test can be done as a&(n-1) == 0. For example, for 16
3215 byte vectors the test is a&0xf == 0. */
3217 static void
3218 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3219 tree *cond_expr,
3220 gimple_seq *cond_expr_stmt_list)
3222 const vec<stmt_vec_info> &may_misalign_stmts
3223 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3224 stmt_vec_info stmt_info;
3225 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3226 tree mask_cst;
3227 unsigned int i;
3228 tree int_ptrsize_type;
3229 char tmp_name[20];
3230 tree or_tmp_name = NULL_TREE;
3231 tree and_tmp_name;
3232 gimple *and_stmt;
3233 tree ptrsize_zero;
3234 tree part_cond_expr;
3236 /* Check that mask is one less than a power of 2, i.e., mask is
3237 all zeros followed by all ones. */
3238 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3240 int_ptrsize_type = signed_type_for (ptr_type_node);
3242 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3243 of the first vector of the i'th data reference. */
3245 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3247 gimple_seq new_stmt_list = NULL;
3248 tree addr_base;
3249 tree addr_tmp_name;
3250 tree new_or_tmp_name;
3251 gimple *addr_stmt, *or_stmt;
3252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3253 bool negative = tree_int_cst_compare
3254 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3255 tree offset = negative
3256 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3257 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3258 : size_zero_node;
3260 /* create: addr_tmp = (int)(address_of_first_vector) */
3261 addr_base =
3262 vect_create_addr_base_for_vector_ref (loop_vinfo,
3263 stmt_info, &new_stmt_list,
3264 offset);
3265 if (new_stmt_list != NULL)
3266 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3268 sprintf (tmp_name, "addr2int%d", i);
3269 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3270 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3271 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3273       /* The addresses are ORed together.  */
3275 if (or_tmp_name != NULL_TREE)
3277 /* create: or_tmp = or_tmp | addr_tmp */
3278 sprintf (tmp_name, "orptrs%d", i);
3279 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3280 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3281 or_tmp_name, addr_tmp_name);
3282 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3283 or_tmp_name = new_or_tmp_name;
3285 else
3286 or_tmp_name = addr_tmp_name;
3288 } /* end for i */
3290 mask_cst = build_int_cst (int_ptrsize_type, mask);
3292 /* create: and_tmp = or_tmp & mask */
3293 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3295 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3296 or_tmp_name, mask_cst);
3297 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3299 /* Make and_tmp the left operand of the conditional test against zero.
3300      If and_tmp has a nonzero bit then some address is unaligned.  */
3301 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3302 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3303 and_tmp_name, ptrsize_zero);
3304 chain_cond_expr (cond_expr, part_cond_expr);
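/* Illustrative sketch, not part of the vectorizer: the runtime condition
   built above, for two data references and 16-byte vectors (mask == 0xf).
   The vectorized version is entered only when the OR of all start addresses
   has no bits set under the mask.  */

static int
all_addrs_aligned_sketch (unsigned long addr1, unsigned long addr2,
			  unsigned long mask)
{
  return ((addr1 | addr2) & mask) == 0;
}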
3307 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3308 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3309 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3310 and this new condition are true. Treat a null *COND_EXPR as "true". */
3312 static void
3313 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3315 const vec<vec_object_pair> &pairs
3316 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3317 unsigned int i;
3318 vec_object_pair *pair;
3319 FOR_EACH_VEC_ELT (pairs, i, pair)
3321 tree addr1 = build_fold_addr_expr (pair->first);
3322 tree addr2 = build_fold_addr_expr (pair->second);
3323 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3324 addr1, addr2);
3325 chain_cond_expr (cond_expr, part_cond_expr);
3329 /* Create an expression that is true when all lower-bound conditions for
3330 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3332 static void
3333 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3335 const vec<vec_lower_bound> &lower_bounds
3336 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3337 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3339 tree expr = lower_bounds[i].expr;
3340 tree type = unsigned_type_for (TREE_TYPE (expr));
3341 expr = fold_convert (type, expr);
3342 poly_uint64 bound = lower_bounds[i].min_value;
3343 if (!lower_bounds[i].unsigned_p)
3345 expr = fold_build2 (PLUS_EXPR, type, expr,
3346 build_int_cstu (type, bound - 1));
3347 bound += bound - 1;
3349 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3350 build_int_cstu (type, bound));
3351 chain_cond_expr (cond_expr, part_cond_expr);
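/* Illustrative sketch of the signed (!unsigned_p) case above: with an
   8-bit value X and bound == 4, the code adds bound - 1 in the unsigned
   type and compares against 2 * bound - 1, i.e. at source level

     (unsigned char) (X + 3) >= 7

   which is true exactly when X <= -4 or X >= 4:
     X == 4  -> 7   >= 7 (true),   X == 3  -> 6 >= 7 (false),
     X == -4 -> 255 >= 7 (true),   X == -3 -> 0 >= 7 (false).
   The 8-bit type and the bound of 4 are example values only.  */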
3355 /* Function vect_create_cond_for_alias_checks.
3357 Create a conditional expression that represents the run-time checks for
3358 overlapping of address ranges represented by a list of data reference
3359 relations passed as input.
3361 Input:
3362 COND_EXPR - input conditional expression. New conditions will be chained
3363 with a logical AND operation; a null *COND_EXPR is treated
3364 as "true".
3365 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3366 to be checked.
3368 Output:
3369 COND_EXPR - conditional expression.
3371 The resulting COND_EXPR is the conditional expression to be used in the if
3372 statement that controls which version of the loop gets executed at runtime.
3375 void
3376 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3378 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3379 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3381 if (comp_alias_ddrs.is_empty ())
3382 return;
3384 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3385 &comp_alias_ddrs, cond_expr);
3386 if (dump_enabled_p ())
3387 dump_printf_loc (MSG_NOTE, vect_location,
3388 "created %u versioning for alias checks.\n",
3389 comp_alias_ddrs.length ());
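/* Illustrative sketch: for a single pair of data references a and b with
   segment lengths LEN_A and LEN_B, the conditions produced by
   create_runtime_alias_checks are typically of the form

     (uintptr_t) &a + LEN_A <= (uintptr_t) &b
     || (uintptr_t) &b + LEN_B <= (uintptr_t) &a

   i.e. the accessed segments do not overlap.  The exact shape, and any
   merging of adjacent checks, is decided in create_runtime_alias_checks;
   the names above are purely illustrative.  */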
3393 /* Function vect_loop_versioning.
3395 If the loop has data references that may or may not be aligned and/or
3396 has data reference relations whose independence was not proven, then
3397 two versions of the loop need to be generated, one which is vectorized
3398 and one which isn't. A test is then generated to control which of the
3399 loops is executed. The test checks for the alignment of all of the
3400 data references that may or may not be aligned. An additional
3401 sequence of runtime tests is generated for each pair of DDRs whose
3402 independence was not proven. The vectorized version of the loop is
3403 executed only if both the alias and alignment tests pass.
3405 The test generated to check which version of the loop is executed
3406 is modified to also check for profitability as indicated by the
3407 cost model threshold TH.
3409 The versioning precondition(s) are placed in *COND_EXPR and
3410 *COND_EXPR_STMT_LIST. */
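/* A minimal sketch of the control flow produced, assuming a single
   innermost loop and no if-conversion copy to re-use:

     if (<cost-model check> && <alignment/alias/niters/simd-if checks>)
       <loop to be vectorized>
     else
       <scalar copy of the loop>

   The scalar copy is the NLOOP returned below; the checks are collected
   into COND_EXPR and COND_EXPR_STMT_LIST by the helpers called below.  */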
3412 class loop *
3413 vect_loop_versioning (loop_vec_info loop_vinfo,
3414 gimple *loop_vectorized_call)
3416 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3417 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3418 basic_block condition_bb;
3419 gphi_iterator gsi;
3420 gimple_stmt_iterator cond_exp_gsi;
3421 basic_block merge_bb;
3422 basic_block new_exit_bb;
3423 edge new_exit_e, e;
3424 gphi *orig_phi, *new_phi;
3425 tree cond_expr = NULL_TREE;
3426 gimple_seq cond_expr_stmt_list = NULL;
3427 tree arg;
3428 profile_probability prob = profile_probability::likely ();
3429 gimple_seq gimplify_stmt_list = NULL;
3430 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3431 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3432 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3433 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3434 poly_uint64 versioning_threshold
3435 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3436 tree version_simd_if_cond
3437 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3438 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3440 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3441 && !ordered_p (th, versioning_threshold))
3442 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3443 build_int_cst (TREE_TYPE (scalar_loop_iters),
3444 th - 1));
3445 if (maybe_ne (versioning_threshold, 0U))
3447 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3448 build_int_cst (TREE_TYPE (scalar_loop_iters),
3449 versioning_threshold - 1));
3450 if (cond_expr)
3451 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3452 expr, cond_expr);
3453 else
3454 cond_expr = expr;
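/* Worked example of the checks built above: SCALAR_LOOP_ITERS is
   LOOP_VINFO_NITERSM1, the iteration count minus one, so with a
   cost-model threshold TH of, say, 4 the condition reads

     scalar_loop_iters >= 4 - 1

   which is true exactly when the loop runs at least 4 iterations.  The
   same minus-one adjustment is applied to the versioning threshold
   check when it is emitted.  */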
3457 tree cost_name = NULL_TREE;
3458 profile_probability prob2 = profile_probability::uninitialized ();
3459 if (cond_expr
3460 && EXPR_P (cond_expr)
3461 && (version_niter
3462 || version_align
3463 || version_alias
3464 || version_simd_if_cond))
3466 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3467 &cond_expr_stmt_list,
3468 is_gimple_val, NULL_TREE);
3469 /* Split prob in two so that the overall probability of passing
3470 both the cost-model and versioning checks is the original prob. */
3471 prob2 = prob.split (prob);
3474 if (version_niter)
3475 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3477 if (cond_expr)
3479 gimple_seq tem = NULL;
3480 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3481 &tem, is_gimple_condexpr_for_cond,
3482 NULL_TREE);
3483 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3486 if (version_align)
3487 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3488 &cond_expr_stmt_list);
3490 if (version_alias)
3492 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3493 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3494 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3497 if (version_simd_if_cond)
3499 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3500 if (flag_checking)
3501 if (basic_block bb
3502 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3503 gcc_assert (bb != loop->header
3504 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3505 && (scalar_loop == NULL
3506 || (bb != scalar_loop->header
3507 && dominated_by_p (CDI_DOMINATORS,
3508 scalar_loop->header, bb))));
3509 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3510 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3511 version_simd_if_cond, zero);
3512 if (cond_expr)
3513 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3514 c, cond_expr);
3515 else
3516 cond_expr = c;
3517 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE, vect_location,
3519 "created versioning for simd if condition check.\n");
3522 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3523 &gimplify_stmt_list,
3524 is_gimple_condexpr_for_cond, NULL_TREE);
3525 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3527 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3528 invariant in. */
3529 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3530 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3531 !gsi_end_p (gsi); gsi_next (&gsi))
3533 gimple *stmt = gsi_stmt (gsi);
3534 update_stmt (stmt);
3535 ssa_op_iter iter;
3536 use_operand_p use_p;
3537 basic_block def_bb;
3538 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3539 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3540 && flow_bb_inside_loop_p (outermost, def_bb))
3541 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3544 /* Search for the outermost loop we can version. Avoid versioning of
3545 non-perfect nests but allow if-conversion versioned loops inside. */
3546 class loop *loop_to_version = loop;
3547 if (flow_loop_nested_p (outermost, loop))
3549 if (dump_enabled_p ())
3550 dump_printf_loc (MSG_NOTE, vect_location,
3551 "trying to apply versioning to outer loop %d\n",
3552 outermost->num);
3553 if (outermost->num == 0)
3554 outermost = superloop_at_depth (loop, 1);
3555 /* And avoid applying versioning on non-perfect nests. */
3556 while (loop_to_version != outermost
3557 && (e = single_exit (loop_outer (loop_to_version)))
3558 && !(e->flags & EDGE_COMPLEX)
3559 && (!loop_outer (loop_to_version)->inner->next
3560 || vect_loop_vectorized_call (loop_to_version))
3561 && (!loop_outer (loop_to_version)->inner->next
3562 || !loop_outer (loop_to_version)->inner->next->next))
3563 loop_to_version = loop_outer (loop_to_version);
3566 /* Apply versioning. If there is already a scalar version created by
3567 if-conversion, re-use that. Note we cannot re-use the copy of
3568 an if-converted outer-loop when vectorizing the inner loop only. */
3569 gcond *cond;
3570 if ((!loop_to_version->inner || loop == loop_to_version)
3571 && loop_vectorized_call)
3573 gcc_assert (scalar_loop);
3574 condition_bb = gimple_bb (loop_vectorized_call);
3575 cond = as_a <gcond *> (last_stmt (condition_bb));
3576 gimple_cond_set_condition_from_tree (cond, cond_expr);
3577 update_stmt (cond);
3579 if (cond_expr_stmt_list)
3581 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3582 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3583 GSI_SAME_STMT);
3586 /* if-conversion uses profile_probability::always () for both paths,
3587 reset the probabilities of both paths appropriately. */
3588 edge te, fe;
3589 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3590 te->probability = prob;
3591 fe->probability = prob.invert ();
3592 /* We can scale loop counts immediately but have to postpone
3593 scaling the scalar loop because we re-use it during peeling. */
3594 scale_loop_frequencies (loop_to_version, te->probability);
3595 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3597 nloop = scalar_loop;
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_NOTE, vect_location,
3600 "reusing %sloop version created by if conversion\n",
3601 loop_to_version != loop ? "outer " : "");
3603 else
3605 if (loop_to_version != loop
3606 && dump_enabled_p ())
3607 dump_printf_loc (MSG_NOTE, vect_location,
3608 "applying loop versioning to outer loop %d\n",
3609 loop_to_version->num);
3611 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3613 initialize_original_copy_tables ();
3614 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3615 prob, prob.invert (), prob, prob.invert (), true);
3616 gcc_assert (nloop);
3617 nloop = get_loop_copy (loop);
3619 /* For cycle vectorization with SLP we rely on the PHI arguments
3620 appearing in the same order as the SLP node operands, which for the
3621 loop PHI nodes means the preheader edge dest index needs to remain
3622 the same for the analyzed loop, which also becomes the vectorized one.
3623 If the state after versioning differs, make it so by redirecting
3624 the first edge into the header to the same destination, which moves
3625 it last. */
3626 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3628 edge e = EDGE_PRED (loop->header, 0);
3629 ssa_redirect_edge (e, e->dest);
3630 flush_pending_stmts (e);
3632 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3634 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3635 reap those otherwise, and they also refer to the original
3636 loops. */
3637 class loop *l = loop;
3638 while (gimple *call = vect_loop_vectorized_call (l))
3640 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3641 fold_loop_internal_call (call, boolean_false_node);
3642 l = loop_outer (l);
3644 free_original_copy_tables ();
3646 if (cond_expr_stmt_list)
3648 cond_exp_gsi = gsi_last_bb (condition_bb);
3649 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3650 GSI_SAME_STMT);
3653 /* Loop versioning violates an assumption we try to maintain during
3654 vectorization - that the loop exit block has a single predecessor.
3655 After versioning, the exit block of both loop versions is the same
3656 basic block (i.e. it has two predecessors). Just in order to simplify
3657 following transformations in the vectorizer, we fix this situation
3658 here by adding a new (empty) block on the exit-edge of the loop,
3659 with the proper loop-exit phis to maintain loop-closed-form.
3660 If loop versioning wasn't done from loop, but scalar_loop instead,
3661 merge_bb will already have just a single predecessor. */
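/* Sketch of the fix-up, with LOOP_TO_VERSION the loop that will be
   vectorized and NLOOP its scalar copy:

     before:  loop_to_version exit --\
                                      +--> merge_bb
              nloop exit ------------/

     after:   loop_to_version exit --> new_exit_bb --\
                                                      +--> merge_bb
              nloop exit ----------------------------/

   new_exit_bb receives the loop-closed PHIs for loop_to_version.  */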
3663 merge_bb = single_exit (loop_to_version)->dest;
3664 if (EDGE_COUNT (merge_bb->preds) >= 2)
3666 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3667 new_exit_bb = split_edge (single_exit (loop_to_version));
3668 new_exit_e = single_exit (loop_to_version);
3669 e = EDGE_SUCC (new_exit_bb, 0);
3671 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3672 gsi_next (&gsi))
3674 tree new_res;
3675 orig_phi = gsi.phi ();
3676 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3677 new_phi = create_phi_node (new_res, new_exit_bb);
3678 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3679 add_phi_arg (new_phi, arg, new_exit_e,
3680 gimple_phi_arg_location_from_edge (orig_phi, e));
3681 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3685 update_ssa (TODO_update_ssa_no_phi);
3688 /* Split the cost model check off to a separate BB. Costing assumes
3689 this is the only thing we perform when we enter the scalar loop
3690 from a failed cost decision. */
3691 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3693 gimple *def = SSA_NAME_DEF_STMT (cost_name);
3694 gcc_assert (gimple_bb (def) == condition_bb);
3695 /* All uses of the cost check are 'true' after the check we
3696 are going to insert. */
3697 replace_uses_by (cost_name, boolean_true_node);
3698 /* And we're going to build the new single use of it. */
3699 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3700 NULL_TREE, NULL_TREE);
3701 edge e = split_block (gimple_bb (def), def);
3702 gimple_stmt_iterator gsi = gsi_for_stmt (def);
3703 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3704 edge true_e, false_e;
3705 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3706 e->flags &= ~EDGE_FALLTHRU;
3707 e->flags |= EDGE_TRUE_VALUE;
3708 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3709 e->probability = prob2;
3710 e2->probability = prob2.invert ();
3711 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3712 auto_vec<basic_block, 3> adj;
3713 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3714 son;
3715 son = next_dom_son (CDI_DOMINATORS, son))
3716 if (EDGE_COUNT (son->preds) > 1)
3717 adj.safe_push (son);
3718 for (auto son : adj)
3719 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
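/* Sketch of the resulting layout (hypothetical block names):

     cost_bb:     if (cost_name != false)   [runtime profitability test]
                    goto version_bb;
                  else
                    goto scalar_path;
     version_bb:  if (<versioning checks>)  [cond_expr, with cost_name
                    goto vector_path;        folded to true]
                  else
                    goto scalar_path;

   so entering the scalar loop after a failed cost decision executes
   nothing but the cost check itself.  */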
3722 if (version_niter)
3724 /* The versioned loop could be infinite; we need to clear existing
3725 niter information, which is copied from the original loop. */
3726 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3727 vect_free_loop_info_assumptions (nloop);
3730 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3731 && dump_enabled_p ())
3733 if (version_alias)
3734 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3735 vect_location,
3736 "loop versioned for vectorization because of "
3737 "possible aliasing\n");
3738 if (version_align)
3739 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3740 vect_location,
3741 "loop versioned for vectorization to enhance "
3742 "alignment\n");
3746 return nloop;