gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use, using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
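/* Illustrative example (vector size chosen arbitrarily): for 8-element
   vectors, OFFSET == 0 builds the interleave-low selector
   { 0, 8, 1, 9, 2, 10, 3, 11 } and OFFSET == 1 the interleave-high
   selector { 4, 12, 5, 13, 6, 14, 7, 15 }.  */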
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
316 *indices);
319 /* Try to use permutes to define the masks in DEST_RGM using the masks
320 in SRC_RGM, given that the former has twice as many masks as the
321 latter. Return true on success, adding any new statements to SEQ. */
323 static bool
324 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
325 rgroup_controls *src_rgm)
327 tree src_masktype = src_rgm->type;
328 tree dest_masktype = dest_rgm->type;
329 machine_mode src_mode = TYPE_MODE (src_masktype);
330 insn_code icode1, icode2;
331 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
332 && (icode1 = optab_handler (vec_unpacku_hi_optab,
333 src_mode)) != CODE_FOR_nothing
334 && (icode2 = optab_handler (vec_unpacku_lo_optab,
335 src_mode)) != CODE_FOR_nothing)
337 /* Unpacking the source masks gives at least as many mask bits as
338 we need. We can then VIEW_CONVERT any excess bits away. */
339 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
340 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
341 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
342 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
344 tree src = src_rgm->controls[i / 2];
345 tree dest = dest_rgm->controls[i];
346 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
347 ? VEC_UNPACK_HI_EXPR
348 : VEC_UNPACK_LO_EXPR);
349 gassign *stmt;
350 if (dest_masktype == unpack_masktype)
351 stmt = gimple_build_assign (dest, code, src);
352 else
354 tree temp = make_ssa_name (unpack_masktype);
355 stmt = gimple_build_assign (temp, code, src);
356 gimple_seq_add_stmt (seq, stmt);
357 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
358 build1 (VIEW_CONVERT_EXPR,
359 dest_masktype, temp));
361 gimple_seq_add_stmt (seq, stmt);
363 return true;
365 vec_perm_indices indices[2];
366 if (dest_masktype == src_masktype
367 && interleave_supported_p (&indices[0], src_masktype, 0)
368 && interleave_supported_p (&indices[1], src_masktype, 1))
370 /* The destination requires twice as many mask bits as the source, so
371 we can use interleaving permutes to double up the number of bits. */
372 tree masks[2];
373 for (unsigned int i = 0; i < 2; ++i)
374 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
375 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
377 tree src = src_rgm->controls[i / 2];
378 tree dest = dest_rgm->controls[i];
379 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
380 src, src, masks[i & 1]);
381 gimple_seq_add_stmt (seq, stmt);
383 return true;
385 return false;
388 /* Populate DEST_RGM->controls, given that they should add up to STEP.
390 STEP = MIN_EXPR <ivtmp_34, VF>;
392 First length (MIN (X, VF/N)):
393 loop_len_15 = MIN_EXPR <STEP, VF/N>;
395 Second length:
396 tmp = STEP - loop_len_15;
397 loop_len_16 = MIN (tmp, VF/N);
399 Third length:
400 tmp2 = tmp - loop_len_16;
401 loop_len_17 = MIN (tmp2, VF/N);
403 Last length:
404 loop_len_18 = tmp2 - loop_len_17;
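/* For illustration (arbitrary numbers): with VF/N == 4 and STEP == 10,
   a four-control group gets the lengths 4, 4, 2 and 0, which sum back
   to STEP.  */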
407 static void
408 vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
409 rgroup_controls *dest_rgm, tree step)
411 tree ctrl_type = dest_rgm->type;
412 poly_uint64 nitems_per_ctrl
413 = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
414 tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
416 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
418 tree ctrl = dest_rgm->controls[i];
419 if (i == 0)
421 /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
422 gassign *assign
423 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
424 gimple_seq_add_stmt (seq, assign);
426 else if (i == dest_rgm->controls.length () - 1)
428 /* Last iteration: the remainder, capped to the range [0, VF/N]. */
429 gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
430 dest_rgm->controls[i - 1]);
431 gimple_seq_add_stmt (seq, assign);
433 else
435 /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
436 step = gimple_build (seq, MINUS_EXPR, iv_type, step,
437 dest_rgm->controls[i - 1]);
438 gassign *assign
439 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
440 gimple_seq_add_stmt (seq, assign);
445 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
446 for all the rgroup controls in RGC and return a control that is nonzero
447 when the loop needs to iterate. Add any new preheader statements to
448 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
450 RGC belongs to loop LOOP. The loop originally iterated NITERS
451 times and has been vectorized according to LOOP_VINFO.
453 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
454 starts with NITERS_SKIP dummy iterations of the scalar loop before
455 the real work starts. The mask elements for these dummy iterations
456 must be 0, to ensure that the extra iterations do not have an effect.
458 It is known that:
460 NITERS * RGC->max_nscalars_per_iter * RGC->factor
462 does not overflow. However, MIGHT_WRAP_P says whether an induction
463 variable that starts at 0 and has step:
465 VF * RGC->max_nscalars_per_iter * RGC->factor
467 might overflow before hitting a value above:
469 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
471 This means that we cannot guarantee that such an induction variable
472 would ever hit a value that produces a set of all-false masks or zero
473 lengths for RGC.
475 Note: the cost of the code generated by this function is modeled
476 by vect_estimate_min_profitable_iters, so changes here may need
477 corresponding changes there. */
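/* As a hypothetical illustration of the wrapping concern: with a 16-bit
   IV type, a step of 16 and a limit of 65530, the IV takes the values
   0, 16, ..., 65520 and then wraps to 0 without ever reaching the
   limit, so MIGHT_WRAP_P would be true.  */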
479 static tree
480 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
481 gimple_seq *preheader_seq,
482 gimple_seq *header_seq,
483 gimple_stmt_iterator loop_cond_gsi,
484 rgroup_controls *rgc, tree niters,
485 tree niters_skip, bool might_wrap_p,
486 tree *iv_step, tree *compare_step)
488 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
489 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
490 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
492 tree ctrl_type = rgc->type;
493 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
494 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
495 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
496 tree length_limit = NULL_TREE;
497 /* For lengths, we need length_limit to keep each length in range. */
498 if (!use_masks_p)
499 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
501 /* Calculate the maximum number of item values that the rgroup
502 handles in total, the number that it handles for each iteration
503 of the vector loop, and the number that it should skip during the
504 first iteration of the vector loop. */
505 tree nitems_total = niters;
506 tree nitems_step = build_int_cst (iv_type, vf);
507 tree nitems_skip = niters_skip;
508 if (nitems_per_iter != 1)
510 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
511 these multiplications don't overflow. */
512 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
513 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
514 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
515 nitems_total, compare_factor);
516 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
517 nitems_step, iv_factor);
518 if (nitems_skip)
519 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
520 nitems_skip, compare_factor);
523 /* Create an induction variable that counts the number of items
524 processed. */
525 tree index_before_incr, index_after_incr;
526 gimple_stmt_iterator incr_gsi;
527 bool insert_after;
528 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
529 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
531 /* Create an IV that counts down from niters_total and whose step
532 is the (variable) amount processed in the current iteration:
534 _10 = (unsigned long) count_12(D);
536 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
537 _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
539 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
541 ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
543 if (ivtmp_9 > POLY_INT_CST [4, 4])
544 goto <bb 4>; [83.33%]
545 else
546 goto <bb 5>; [16.67%]
548 nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
549 tree step = rgc->controls.length () == 1 ? rgc->controls[0]
550 : make_ssa_name (iv_type);
551 /* Create decrement IV. */
552 create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
553 &incr_gsi, insert_after, &index_before_incr,
554 &index_after_incr);
555 gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
556 index_before_incr,
557 nitems_step));
558 *iv_step = step;
559 *compare_step = nitems_step;
560 return index_before_incr;
563 /* Create increment IV. */
564 create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
565 loop, &incr_gsi, insert_after, &index_before_incr,
566 &index_after_incr);
568 tree zero_index = build_int_cst (compare_type, 0);
569 tree test_index, test_limit, first_limit;
570 gimple_stmt_iterator *test_gsi;
571 if (might_wrap_p)
573 /* In principle the loop should stop iterating once the incremented
574 IV reaches a value greater than or equal to:
576 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
578 However, there's no guarantee that this addition doesn't overflow
579 the comparison type, or that the IV hits a value above it before
580 wrapping around. We therefore adjust the limit down by one
581 IV step:
583 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
584 -[infinite-prec] NITEMS_STEP
586 and compare the IV against this limit _before_ incrementing it.
587 Since the comparison type is unsigned, we actually want the
588 subtraction to saturate at zero:
590 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
591 -[sat] NITEMS_STEP
593 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
595 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
597 where the rightmost subtraction can be done directly in
598 COMPARE_TYPE. */
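/* Worked example (numbers chosen arbitrarily): with NITEMS_TOTAL = 100,
   NITEMS_SKIP = 3 and NITEMS_STEP = 16 this computes
   MAX (100, 16 - 3) - (16 - 3) = 87; with NITEMS_TOTAL = 5 the MAX
   makes the subtraction saturate at 0.  */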
599 test_index = index_before_incr;
600 tree adjust = gimple_convert (preheader_seq, compare_type,
601 nitems_step);
602 if (nitems_skip)
603 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
604 adjust, nitems_skip);
605 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
606 nitems_total, adjust);
607 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
608 test_limit, adjust);
609 test_gsi = &incr_gsi;
611 /* Get a safe limit for the first iteration. */
612 if (nitems_skip)
614 /* The first vector iteration can handle at most NITEMS_STEP
615 items. NITEMS_STEP <= CONST_LIMIT, and adding
616 NITEMS_SKIP to that cannot overflow. */
617 tree const_limit = build_int_cst (compare_type,
618 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
619 * nitems_per_iter);
620 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
621 nitems_total, const_limit);
622 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
623 first_limit, nitems_skip);
625 else
626 /* For the first iteration it doesn't matter whether the IV hits
627 a value above NITEMS_TOTAL. That only matters for the latch
628 condition. */
629 first_limit = nitems_total;
631 else
633 /* Test the incremented IV, which will always hit a value above
634 the bound before wrapping. */
635 test_index = index_after_incr;
636 test_limit = nitems_total;
637 if (nitems_skip)
638 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
639 test_limit, nitems_skip);
640 test_gsi = &loop_cond_gsi;
642 first_limit = test_limit;
645 /* Convert the IV value to the comparison type (either a no-op or
646 a demotion). */
647 gimple_seq test_seq = NULL;
648 test_index = gimple_convert (&test_seq, compare_type, test_index);
649 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
651 /* Provide a definition of each control in the group. */
652 tree next_ctrl = NULL_TREE;
653 tree ctrl;
654 unsigned int i;
655 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
657 /* Previous controls will cover BIAS items. This control covers the
658 next batch. */
659 poly_uint64 bias = nitems_per_ctrl * i;
660 tree bias_tree = build_int_cst (compare_type, bias);
662 /* See whether the first iteration of the vector loop is known
663 to have a full control. */
664 poly_uint64 const_limit;
665 bool first_iteration_full
666 = (poly_int_tree_p (first_limit, &const_limit)
667 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
669 /* Rather than have a new IV that starts at BIAS and goes up to
670 TEST_LIMIT, prefer to use the same 0-based IV for each control
671 and adjust the bound down by BIAS. */
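/* For illustration (arbitrary numbers): with NITEMS_PER_CTRL == 4 and
   I == 1, BIAS is 4, so a limit of 10 becomes MAX (10, 4) - 4 = 6 for
   this control.  */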
672 tree this_test_limit = test_limit;
673 if (i != 0)
675 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
676 compare_type, this_test_limit,
677 bias_tree);
678 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
679 compare_type, this_test_limit,
680 bias_tree);
683 /* Create the initial control. First include all items that
684 are within the loop limit. */
685 tree init_ctrl = NULL_TREE;
686 if (!first_iteration_full)
688 tree start, end;
689 if (first_limit == test_limit)
691 /* Use a natural test between zero (the initial IV value)
692 and the loop limit. The "else" block would be valid too,
693 but this choice can avoid the need to load BIAS_TREE into
694 a register. */
695 start = zero_index;
696 end = this_test_limit;
698 else
700 /* FIRST_LIMIT is the maximum number of items handled by the
701 first iteration of the vector loop. Test the portion
702 associated with this control. */
703 start = bias_tree;
704 end = first_limit;
707 if (use_masks_p)
708 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
709 start, end, "max_mask");
710 else
712 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
713 gimple_seq seq = vect_gen_len (init_ctrl, start,
714 end, length_limit);
715 gimple_seq_add_seq (preheader_seq, seq);
719 /* Now AND out the bits that are within the number of skipped
720 items. */
721 poly_uint64 const_skip;
722 if (nitems_skip
723 && !(poly_int_tree_p (nitems_skip, &const_skip)
724 && known_le (const_skip, bias)))
726 gcc_assert (use_masks_p);
727 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
728 bias_tree, nitems_skip);
729 if (init_ctrl)
730 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
731 init_ctrl, unskipped_mask);
732 else
733 init_ctrl = unskipped_mask;
736 if (!init_ctrl)
738 /* First iteration is full. */
739 if (use_masks_p)
740 init_ctrl = build_minus_one_cst (ctrl_type);
741 else
742 init_ctrl = length_limit;
745 /* Get the control value for the next iteration of the loop. */
746 if (use_masks_p)
748 gimple_seq stmts = NULL;
749 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
750 this_test_limit, "next_mask");
751 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
753 else
755 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
756 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
757 length_limit);
758 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
761 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
764 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
765 if (partial_load_bias != 0)
767 tree adjusted_len = rgc->bias_adjusted_ctrl;
768 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
769 rgc->controls[0],
770 build_int_cst
771 (TREE_TYPE (rgc->controls[0]),
772 partial_load_bias));
773 gimple_seq_add_stmt (header_seq, minus);
776 return next_ctrl;
779 /* Set up the iteration condition and rgroup controls for LOOP, given
780 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
781 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
782 the number of iterations of the original scalar loop that should be
783 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
784 for vect_set_loop_condition.
786 Insert the branch-back condition before LOOP_COND_GSI and return the
787 final gcond. */
789 static gcond *
790 vect_set_loop_condition_partial_vectors (class loop *loop,
791 loop_vec_info loop_vinfo, tree niters,
792 tree final_iv, bool niters_maybe_zero,
793 gimple_stmt_iterator loop_cond_gsi)
795 gimple_seq preheader_seq = NULL;
796 gimple_seq header_seq = NULL;
798 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
799 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
800 unsigned int compare_precision = TYPE_PRECISION (compare_type);
801 tree orig_niters = niters;
803 /* Type of the initial value of NITERS. */
804 tree ni_actual_type = TREE_TYPE (niters);
805 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
806 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
808 /* Convert NITERS to the same size as the compare. */
809 if (compare_precision > ni_actual_precision
810 && niters_maybe_zero)
812 /* We know that there is always at least one iteration, so if the
813 count is zero then it must have wrapped. Cope with this by
814 subtracting 1 before the conversion and adding 1 to the result. */
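/* Illustrative example (using an 8-bit NITERS type for brevity): an
   original count of 256 is represented as 0; adding the all-ones
   constant gives 255, the widening conversion keeps 255, and adding 1
   back in the compare type yields the intended 256.  */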
815 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
816 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
817 niters, build_minus_one_cst (ni_actual_type));
818 niters = gimple_convert (&preheader_seq, compare_type, niters);
819 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
820 niters, build_one_cst (compare_type));
822 else
823 niters = gimple_convert (&preheader_seq, compare_type, niters);
825 /* Iterate over all the rgroups and fill in their controls. We could use
826 the first control from any rgroup for the loop condition; here we
827 arbitrarily pick the last. */
828 tree test_ctrl = NULL_TREE;
829 tree iv_step = NULL_TREE;
830 tree compare_step = NULL_TREE;
831 rgroup_controls *rgc;
832 rgroup_controls *iv_rgc = nullptr;
833 unsigned int i;
834 auto_vec<rgroup_controls> *controls = use_masks_p
835 ? &LOOP_VINFO_MASKS (loop_vinfo)
836 : &LOOP_VINFO_LENS (loop_vinfo);
837 FOR_EACH_VEC_ELT (*controls, i, rgc)
838 if (!rgc->controls.is_empty ())
840 /* First try using permutes. This adds a single vector
841 instruction to the loop for each mask, but needs no extra
842 loop invariants or IVs. */
843 unsigned int nmasks = i + 1;
844 if (use_masks_p && (nmasks & 1) == 0)
846 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
847 if (!half_rgc->controls.is_empty ()
848 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
849 continue;
852 if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
853 || !iv_rgc
854 || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
855 != rgc->max_nscalars_per_iter * rgc->factor))
857 /* See whether zero-based IV would ever generate all-false masks
858 or zero length before wrapping around. */
859 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
861 /* Set up all controls for this group. */
862 test_ctrl
863 = vect_set_loop_controls_directly (loop, loop_vinfo,
864 &preheader_seq, &header_seq,
865 loop_cond_gsi, rgc, niters,
866 niters_skip, might_wrap_p,
867 &iv_step, &compare_step);
869 iv_rgc = rgc;
872 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
873 && rgc->controls.length () > 1)
875 /* vect_set_loop_controls_directly creates an IV whose step
876 is equal to the expected sum of RGC->controls. Use that
877 information to populate RGC->controls. */
878 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
879 gcc_assert (iv_step);
880 vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
884 /* Emit all accumulated statements. */
885 add_preheader_seq (loop, preheader_seq);
886 add_header_seq (loop, header_seq);
888 /* Get a boolean result that tells us whether to iterate. */
889 edge exit_edge = single_exit (loop);
890 gcond *cond_stmt;
891 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
893 gcc_assert (compare_step);
894 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
895 cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
896 NULL_TREE);
898 else
900 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
901 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
902 cond_stmt
903 = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
905 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
907 /* The loop iterates (NITERS - 1) / VF + 1 times.
908 Subtract one from this to get the latch count. */
909 tree step = build_int_cst (compare_type,
910 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
911 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
912 build_minus_one_cst (compare_type));
913 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
914 niters_minus_one, step);
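/* Worked example (arbitrary values): NITERS = 10 and VF = 4 give
   (10 - 1) / 4 = 2 latch iterations, i.e. three iterations of the
   vector loop.  */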
916 if (final_iv)
918 gassign *assign = gimple_build_assign (final_iv, orig_niters);
919 gsi_insert_on_edge_immediate (single_exit (loop), assign);
922 return cond_stmt;
925 /* Like vect_set_loop_condition, but handle the case in which the vector
926 loop handles exactly VF scalars per iteration. */
928 static gcond *
929 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
930 tree final_iv, bool niters_maybe_zero,
931 gimple_stmt_iterator loop_cond_gsi)
933 tree indx_before_incr, indx_after_incr;
934 gcond *cond_stmt;
935 gcond *orig_cond;
936 edge pe = loop_preheader_edge (loop);
937 edge exit_edge = single_exit (loop);
938 gimple_stmt_iterator incr_gsi;
939 bool insert_after;
940 enum tree_code code;
941 tree niters_type = TREE_TYPE (niters);
943 orig_cond = get_loop_exit_condition (loop);
944 gcc_assert (orig_cond);
945 loop_cond_gsi = gsi_for_stmt (orig_cond);
947 tree init, limit;
948 if (!niters_maybe_zero && integer_onep (step))
950 /* In this case we can use a simple 0-based IV:
953 x = 0;
957 x += 1;
959 while (x < NITERS); */
960 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
961 init = build_zero_cst (niters_type);
962 limit = niters;
964 else
966 /* The following works for all values of NITERS except 0:
969 x = 0;
973 x += STEP;
975 while (x <= NITERS - STEP);
977 so that the loop continues to iterate if x + STEP - 1 < NITERS
978 but stops if x + STEP - 1 >= NITERS.
980 However, if NITERS is zero, x never hits a value above NITERS - STEP
981 before wrapping around. There are two obvious ways of dealing with
982 this:
984 - start at STEP - 1 and compare x before incrementing it
985 - start at -1 and compare x after incrementing it
987 The latter is simpler and is what we use. The loop in this case
988 looks like:
991 x = -1;
995 x += STEP;
997 while (x < NITERS - STEP);
999 In both cases the loop limit is NITERS - STEP. */
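/* Illustrative walk-through of case B (arbitrary numbers): with
   NITERS = 12 and STEP = 4 the limit is 8; x takes the values 4, 8, 12
   and the test x <= 8 first fails at 12, so the loop runs three
   times.  */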
1000 gimple_seq seq = NULL;
1001 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1002 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1003 if (seq)
1005 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1006 gcc_assert (!new_bb);
1008 if (niters_maybe_zero)
1010 /* Case C. */
1011 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1012 init = build_all_ones_cst (niters_type);
1014 else
1016 /* Case B. */
1017 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1018 init = build_zero_cst (niters_type);
1022 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1023 create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1024 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1025 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1026 true, NULL_TREE, true,
1027 GSI_SAME_STMT);
1028 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1029 true, GSI_SAME_STMT);
1031 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1032 NULL_TREE);
1034 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1036 /* Record the number of latch iterations. */
1037 if (limit == niters)
1038 /* Case A: the loop iterates NITERS times. Subtract one to get the
1039 latch count. */
1040 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1041 build_int_cst (niters_type, 1));
1042 else
1043 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1044 Subtract one from this to get the latch count. */
1045 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1046 limit, step);
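/* For case B/C, as an illustration (arbitrary numbers): NITERS = 20 and
   STEP = 4 give LIMIT = 16 and a latch count of 16 / 4 = 4, i.e. five
   iterations in total.  */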
1048 if (final_iv)
1050 gassign *assign;
1051 edge exit = single_exit (loop);
1052 gcc_assert (single_pred_p (exit->dest));
1053 tree phi_dest
1054 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1055 /* Make sure to maintain LC SSA form here and elide the subtraction
1056 if the value is zero. */
1057 gphi *phi = create_phi_node (phi_dest, exit->dest);
1058 add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
1059 if (!integer_zerop (init))
1061 assign = gimple_build_assign (final_iv, MINUS_EXPR,
1062 phi_dest, init);
1063 gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
1064 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1068 return cond_stmt;
1071 /* If we're using fully-masked loops, make LOOP iterate:
1073 N == (NITERS - 1) / STEP + 1
1075 times. When NITERS is zero, this is equivalent to making the loop
1076 execute (1 << M) / STEP times, where M is the precision of NITERS.
1077 NITERS_MAYBE_ZERO is true if this last case might occur.
1079 If we're not using fully-masked loops, make LOOP iterate:
1081 N == (NITERS - STEP) / STEP + 1
1083 times, where NITERS is known to be outside the range [1, STEP - 1].
1084 This is equivalent to making the loop execute NITERS / STEP times
1085 when NITERS is nonzero and (1 << M) / STEP times otherwise.
1086 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1088 If FINAL_IV is nonnull, it is an SSA name that should be set to
1089 N * STEP on exit from the loop.
1091 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
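/* Hypothetical examples of the two formulas: a fully-masked loop with
   NITERS = 7 and STEP = 4 runs (7 - 1) / 4 + 1 = 2 times; a normal loop
   with NITERS = 8 and STEP = 4 runs (8 - 4) / 4 + 1 = 2 times.  */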
1093 void
1094 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
1095 tree niters, tree step, tree final_iv,
1096 bool niters_maybe_zero)
1098 gcond *cond_stmt;
1099 gcond *orig_cond = get_loop_exit_condition (loop);
1100 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1102 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1103 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
1104 niters, final_iv,
1105 niters_maybe_zero,
1106 loop_cond_gsi);
1107 else
1108 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
1109 niters_maybe_zero,
1110 loop_cond_gsi);
1112 /* Remove old loop exit test. */
1113 stmt_vec_info orig_cond_info;
1114 if (loop_vinfo
1115 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1116 loop_vinfo->remove_stmt (orig_cond_info);
1117 else
1118 gsi_remove (&loop_cond_gsi, true);
1120 if (dump_enabled_p ())
1121 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1122 (gimple *) cond_stmt);
1125 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
1126 For all PHI arguments in FROM->dest and TO->dest from those
1127 edges, ensure that the TO->dest PHI arguments have their current_def
1128 set to that of the corresponding FROM->dest argument. */
1130 static void
1131 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1133 gimple_stmt_iterator gsi_from, gsi_to;
1135 for (gsi_from = gsi_start_phis (from->dest),
1136 gsi_to = gsi_start_phis (to->dest);
1137 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1139 gimple *from_phi = gsi_stmt (gsi_from);
1140 gimple *to_phi = gsi_stmt (gsi_to);
1141 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1142 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1143 if (virtual_operand_p (from_arg))
1145 gsi_next (&gsi_from);
1146 continue;
1148 if (virtual_operand_p (to_arg))
1150 gsi_next (&gsi_to);
1151 continue;
1153 if (TREE_CODE (from_arg) != SSA_NAME)
1154 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1155 else if (TREE_CODE (to_arg) == SSA_NAME
1156 && from_arg != to_arg)
1158 if (get_current_def (to_arg) == NULL_TREE)
1160 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1161 TREE_TYPE (get_current_def
1162 (from_arg))));
1163 set_current_def (to_arg, get_current_def (from_arg));
1166 gsi_next (&gsi_from);
1167 gsi_next (&gsi_to);
1170 gphi *from_phi = get_virtual_phi (from->dest);
1171 gphi *to_phi = get_virtual_phi (to->dest);
1172 if (from_phi)
1173 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1174 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1178 /* Given LOOP this function generates a new copy of it and puts it
1179 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1180 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1181 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1182 entry or exit of LOOP. */
1184 class loop *
1185 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1186 class loop *scalar_loop, edge e)
1188 class loop *new_loop;
1189 basic_block *new_bbs, *bbs, *pbbs;
1190 bool at_exit;
1191 bool was_imm_dom;
1192 basic_block exit_dest;
1193 edge exit, new_exit;
1194 bool duplicate_outer_loop = false;
1196 exit = single_exit (loop);
1197 at_exit = (e == exit);
1198 if (!at_exit && e != loop_preheader_edge (loop))
1199 return NULL;
1201 if (scalar_loop == NULL)
1202 scalar_loop = loop;
1204 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1205 pbbs = bbs + 1;
1206 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1207 /* Allow duplication of outer loops. */
1208 if (scalar_loop->inner)
1209 duplicate_outer_loop = true;
1211 /* Generate new loop structure. */
1212 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1213 duplicate_subloops (scalar_loop, new_loop);
1215 exit_dest = exit->dest;
1216 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1217 exit_dest) == loop->header ?
1218 true : false);
1220 /* Also copy the pre-header, this avoids jumping through hoops to
1221 duplicate the loop entry PHI arguments. Create an empty
1222 pre-header unconditionally for this. */
1223 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1224 edge entry_e = single_pred_edge (preheader);
1225 bbs[0] = preheader;
1226 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1228 exit = single_exit (scalar_loop);
1229 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1230 &exit, 1, &new_exit, NULL,
1231 at_exit ? loop->latch : e->src, true);
1232 exit = single_exit (loop);
1233 basic_block new_preheader = new_bbs[0];
1235 /* Before installing PHI arguments make sure that the edges
1236 into them match that of the scalar loop we analyzed. This
1237 makes sure the SLP tree matches up between the main vectorized
1238 loop and the epilogue vectorized copies. */
1239 if (single_succ_edge (preheader)->dest_idx
1240 != single_succ_edge (new_bbs[0])->dest_idx)
1242 basic_block swap_bb = new_bbs[1];
1243 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1244 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1245 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1246 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1248 if (duplicate_outer_loop)
1250 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1251 if (loop_preheader_edge (scalar_loop)->dest_idx
1252 != loop_preheader_edge (new_inner_loop)->dest_idx)
1254 basic_block swap_bb = new_inner_loop->header;
1255 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1256 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1257 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1258 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1262 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1264 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1265 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1266 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1268 if (scalar_loop != loop)
1270 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1271 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1272 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1273 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1274 header) to have current_def set, so copy them over. */
1275 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1276 exit);
1277 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1279 EDGE_SUCC (loop->latch, 0));
1282 if (at_exit) /* Add the loop copy at exit. */
1284 if (scalar_loop != loop)
1286 gphi_iterator gsi;
1287 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1289 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1290 gsi_next (&gsi))
1292 gphi *phi = gsi.phi ();
1293 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1294 location_t orig_locus
1295 = gimple_phi_arg_location_from_edge (phi, e);
1297 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1300 redirect_edge_and_branch_force (e, new_preheader);
1301 flush_pending_stmts (e);
1302 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1303 if (was_imm_dom || duplicate_outer_loop)
1304 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1306 /* And remove the unnecessary forwarder again. Keep the other
1307 one so we have a proper pre-header for the loop at the exit edge. */
1308 redirect_edge_pred (single_succ_edge (preheader),
1309 single_pred (preheader));
1310 delete_basic_block (preheader);
1311 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1312 loop_preheader_edge (scalar_loop)->src);
1314 else /* Add the copy at entry. */
1316 if (scalar_loop != loop)
1318 /* Remove the unnecessary forwarder of scalar_loop again. */
1319 redirect_edge_pred (single_succ_edge (preheader),
1320 single_pred (preheader));
1321 delete_basic_block (preheader);
1322 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1323 loop_preheader_edge (scalar_loop)->src);
1324 preheader = split_edge (loop_preheader_edge (loop));
1325 entry_e = single_pred_edge (preheader);
1328 redirect_edge_and_branch_force (entry_e, new_preheader);
1329 flush_pending_stmts (entry_e);
1330 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1332 redirect_edge_and_branch_force (new_exit, preheader);
1333 flush_pending_stmts (new_exit);
1334 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1336 /* And remove the unnecessary forwarder again. Keep the other
1337 one so we have a proper pre-header for the loop at the exit edge. */
1338 redirect_edge_pred (single_succ_edge (new_preheader),
1339 single_pred (new_preheader));
1340 delete_basic_block (new_preheader);
1341 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1342 loop_preheader_edge (new_loop)->src);
1345 if (scalar_loop != loop)
1347 /* Update new_loop->header PHIs, so that on the preheader
1348 edge they are the ones from loop rather than scalar_loop. */
1349 gphi_iterator gsi_orig, gsi_new;
1350 edge orig_e = loop_preheader_edge (loop);
1351 edge new_e = loop_preheader_edge (new_loop);
1353 for (gsi_orig = gsi_start_phis (loop->header),
1354 gsi_new = gsi_start_phis (new_loop->header);
1355 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1356 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1358 gphi *orig_phi = gsi_orig.phi ();
1359 gphi *new_phi = gsi_new.phi ();
1360 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1361 location_t orig_locus
1362 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1364 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1368 free (new_bbs);
1369 free (bbs);
1371 checking_verify_dominators (CDI_DOMINATORS);
1373 return new_loop;
1377 /* Given the condition expression COND, put it as the last statement of
1378 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1379 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1380 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1381 new edge as irreducible if IRREDUCIBLE_P is true. */
1383 static edge
1384 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1385 basic_block guard_to, basic_block dom_bb,
1386 profile_probability probability, bool irreducible_p)
1388 gimple_stmt_iterator gsi;
1389 edge new_e, enter_e;
1390 gcond *cond_stmt;
1391 gimple_seq gimplify_stmt_list = NULL;
1393 enter_e = EDGE_SUCC (guard_bb, 0);
1394 enter_e->flags &= ~EDGE_FALLTHRU;
1395 enter_e->flags |= EDGE_FALSE_VALUE;
1396 gsi = gsi_last_bb (guard_bb);
1398 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1399 is_gimple_condexpr_for_cond, NULL_TREE);
1400 if (gimplify_stmt_list)
1401 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1403 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1404 gsi = gsi_last_bb (guard_bb);
1405 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1407 /* Add new edge to connect guard block to the merge/loop-exit block. */
1408 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1410 new_e->probability = probability;
1411 if (irreducible_p)
1412 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1414 enter_e->probability = probability.invert ();
1415 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1417 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1418 if (enter_e->dest->loop_father->header == enter_e->dest)
1419 split_edge (enter_e);
1421 return new_e;
1425 /* This function verifies that the following restrictions apply to LOOP:
1426 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1427 for an innermost loop, and of 5 basic blocks for an outer loop.
1428 (2) it is single entry, single exit
1429 (3) its exit condition is the last stmt in the header
1430 (4) E is the entry/exit edge of LOOP.
1433 bool
1434 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1436 edge exit_e = single_exit (loop);
1437 edge entry_e = loop_preheader_edge (loop);
1438 gcond *orig_cond = get_loop_exit_condition (loop);
1439 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1440 unsigned int num_bb = loop->inner? 5 : 2;
1442 /* All loops have an outer scope; the only case loop->outer is NULL is for
1443 the function itself. */
1444 if (!loop_outer (loop)
1445 || loop->num_nodes != num_bb
1446 || !empty_block_p (loop->latch)
1447 || !single_exit (loop)
1448 /* Verify that new loop exit condition can be trivially modified. */
1449 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1450 || (e != exit_e && e != entry_e))
1451 return false;
1453 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1454 get_loop_body_with_size (loop, bbs, loop->num_nodes);
1455 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1456 free (bbs);
1457 return ret;
1460 /* Function vect_get_loop_location.
1462 Extract the location of the loop in the source code.
1463 If the loop is not well formed for vectorization, an estimated
1464 location is calculated.
1465 Return the loop location on success and NULL if not. */
1467 dump_user_location_t
1468 find_loop_location (class loop *loop)
1470 gimple *stmt = NULL;
1471 basic_block bb;
1472 gimple_stmt_iterator si;
1474 if (!loop)
1475 return dump_user_location_t ();
1477 stmt = get_loop_exit_condition (loop);
1479 if (stmt
1480 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1481 return stmt;
1483 /* If we got here the loop is probably not "well formed";
1484 try to estimate the loop location. */
1486 if (!loop->header)
1487 return dump_user_location_t ();
1489 bb = loop->header;
1491 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1493 stmt = gsi_stmt (si);
1494 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1495 return stmt;
1498 return dump_user_location_t ();
1501 /* Return true if the phi described by STMT_INFO defines an IV of the
1502 loop to be vectorized. */
1504 static bool
1505 iv_phi_p (stmt_vec_info stmt_info)
1507 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1508 if (virtual_operand_p (PHI_RESULT (phi)))
1509 return false;
1511 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1512 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1513 return false;
1515 return true;
1518 /* Return true if vectorizer can peel for nonlinear iv. */
1519 static bool
1520 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
1521 enum vect_induction_op_type induction_type)
1523 tree niters_skip;
1524 /* Init_expr will be updated by vect_update_ivs_after_vectorizer.
1525 If niters or vf is unknown:
1526 For shift, when the shift amount >= precision, the behavior is undefined.
1527 For mult, we don't know how to generate
1528 init_expr * pow (step, niters) for variable niters.
1529 For neg, it should be ok, since niters of the vectorized main loop
1530 will always be a multiple of 2. */
1531 if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1532 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
1533 && induction_type != vect_step_op_neg)
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "Peeling for epilogue is not supported"
1538 " for nonlinear induction except neg"
1539 " when iteration count is unknown.\n");
1540 return false;
1543 /* Also doesn't support peeling for neg when niter is variable.
1544 ??? Generate something like niter_expr & 1 ? init_expr : -init_expr? */
1545 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
1546 if ((niters_skip != NULL_TREE
1547 && TREE_CODE (niters_skip) != INTEGER_CST)
1548 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
1549 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 "Peeling for alignement is not supported"
1554 " for nonlinear induction when niters_skip"
1555 " is not constant.\n");
1556 return false;
1559 return true;
1562 /* Function vect_can_advance_ivs_p
1564 In case the number of iterations that LOOP iterates is unknown at compile
1565 time, an epilog loop will be generated, and the loop induction variables
1566 (IVs) will be "advanced" to the value they are supposed to take just before
1567 the epilog loop. Here we check that the access function of the loop IVs
1568 and the expression that represents the loop bound are simple enough.
1569 These restrictions will be relaxed in the future. */
1571 bool
1572 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1574 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1575 basic_block bb = loop->header;
1576 gphi_iterator gsi;
1578 /* Analyze phi functions of the loop header. */
1580 if (dump_enabled_p ())
1581 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1582 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1584 tree evolution_part;
1585 enum vect_induction_op_type induction_type;
1587 gphi *phi = gsi.phi ();
1588 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1591 phi_info->stmt);
1593 /* Skip virtual phi's. The data dependences that are associated with
1594 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1596 Skip reduction phis. */
1597 if (!iv_phi_p (phi_info))
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_NOTE, vect_location,
1601 "reduc or virtual phi. skip.\n");
1602 continue;
1605 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1606 if (induction_type != vect_step_op_add)
1608 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
1609 return false;
1611 continue;
1614 /* Analyze the evolution function. */
1616 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1617 if (evolution_part == NULL_TREE)
1619 if (dump_enabled_p ())
1620 dump_printf (MSG_MISSED_OPTIMIZATION,
1621 "No access function or evolution.\n");
1622 return false;
1625 /* FORNOW: We do not transform initial conditions of IVs
1626 which evolution functions are not invariants in the loop. */
1628 if (!expr_invariant_in_loop_p (loop, evolution_part))
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1632 "evolution not invariant in loop.\n");
1633 return false;
1636 /* FORNOW: We do not transform initial conditions of IVs
1637 which evolution functions are a polynomial of degree >= 2. */
1639 if (tree_is_chrec (evolution_part))
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "evolution is chrec.\n");
1644 return false;
1648 return true;
1652 /* Function vect_update_ivs_after_vectorizer.
1654 "Advance" the induction variables of LOOP to the value they should take
1655 after the execution of LOOP. This is currently necessary because the
1656 vectorizer does not handle induction variables that are used after the
1657 loop. Such a situation occurs when the last iterations of LOOP are
1658 peeled, because:
1659 1. We introduced new uses after LOOP for IVs that were not originally used
1660 after LOOP: the IVs of LOOP are now used by an epilog loop.
1661 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1662 times, whereas the loop IVs should be bumped N times.
1664 Input:
1665 - LOOP - a loop that is going to be vectorized. The last few iterations
1666 of LOOP were peeled.
1667 - NITERS - the number of iterations that LOOP executes (before it is
1668 vectorized), i.e., the number of times the ivs should be bumped.
1669 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1670 coming out from LOOP on which there are uses of the LOOP ivs
1671 (this is the path from LOOP->exit to epilog_loop->preheader).
1673 The new definitions of the ivs are placed in LOOP->exit.
1674 The phi args associated with the edge UPDATE_E in the bb
1675 UPDATE_E->dest are updated accordingly.
1677 Assumption 1: Like the rest of the vectorizer, this function assumes
1678 a single loop exit that has a single predecessor.
1680 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1681 organized in the same order.
1683 Assumption 3: The access function of the ivs is simple enough (see
1684 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1686 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1687 coming out of LOOP on which the ivs of LOOP are used (this is the path
1688 that leads to the epilog loop; other paths skip the epilog loop). This
1689 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1690 needs to have its phis updated.
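/* Illustrative example (arbitrary values): for a simple additive IV with
   INIT = 0, STEP = 2 and NITERS = 100, the value the epilog loop should
   start from is 0 + 100 * 2 = 200, which is what the vect_step_op_add
   case below computes as NI.  */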
1693 static void
1694 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1695 tree niters, edge update_e)
1697 gphi_iterator gsi, gsi1;
1698 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1699 basic_block update_bb = update_e->dest;
1700 basic_block exit_bb = single_exit (loop)->dest;
1702 /* Make sure there exists a single-predecessor exit bb: */
1703 gcc_assert (single_pred_p (exit_bb));
1704 gcc_assert (single_succ_edge (exit_bb) == update_e);
1706 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1707 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1708 gsi_next (&gsi), gsi_next (&gsi1))
1710 tree init_expr;
1711 tree step_expr, off;
1712 tree type;
1713 tree var, ni, ni_name;
1714 gimple_stmt_iterator last_gsi;
1716 gphi *phi = gsi.phi ();
1717 gphi *phi1 = gsi1.phi ();
1718 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "vect_update_ivs_after_vectorizer: phi: %G",
1722 (gimple *) phi);
1724 /* Skip reduction and virtual phis. */
1725 if (!iv_phi_p (phi_info))
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "reduc or virtual phi. skip.\n");
1730 continue;
1733 type = TREE_TYPE (gimple_phi_result (phi));
1734 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1735 step_expr = unshare_expr (step_expr);
1737 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1738 of degree >= 2 or exponential. */
1739 gcc_assert (!tree_is_chrec (step_expr));
1741 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1742 gimple_seq stmts = NULL;
1743 enum vect_induction_op_type induction_type
1744 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1746 if (induction_type == vect_step_op_add)
1748 tree stype = TREE_TYPE (step_expr);
1749 off = fold_build2 (MULT_EXPR, stype,
1750 fold_convert (stype, niters), step_expr);
1751 if (POINTER_TYPE_P (type))
1752 ni = fold_build_pointer_plus (init_expr, off);
1753 else
1754 ni = fold_convert (type,
1755 fold_build2 (PLUS_EXPR, stype,
1756 fold_convert (stype, init_expr),
1757 off));
1759 /* Don't bother calling vect_peel_nonlinear_iv_init. */
1760 else if (induction_type == vect_step_op_neg)
1761 ni = init_expr;
1762 else
1763 ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
1764 niters, step_expr,
1765 induction_type);
1767 var = create_tmp_var (type, "tmp");
1769 last_gsi = gsi_last_bb (exit_bb);
1770 gimple_seq new_stmts = NULL;
1771 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1772 /* Exit_bb shouldn't be empty. */
1773 if (!gsi_end_p (last_gsi))
1775 gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
1776 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1778 else
1780 gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
1781 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1784 /* Fix phi expressions in the successor bb. */
1785 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1789 /* Return a gimple value containing the misalignment (measured in vector
1790 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1791 it is away from a perfectly aligned address. Add any new statements
1792 to SEQ. */
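/* For example (illustrative numbers only): with a 16-byte target alignment,
   4-byte elements and a start address ending in 0x8, the result is
   (0x8 & 0xf) >> log2 (4) = 2 elements.  */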
1794 static tree
1795 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1797 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1798 stmt_vec_info stmt_info = dr_info->stmt;
1799 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1801 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1802 unsigned HOST_WIDE_INT target_align_c;
1803 tree target_align_minus_1;
1805 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1806 size_zero_node) < 0;
1807 tree offset = (negative
1808 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1809 * TREE_INT_CST_LOW
1810 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1811 : size_zero_node);
1812 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1813 stmt_info, seq,
1814 offset);
1815 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1816 if (target_align.is_constant (&target_align_c))
1817 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1818 else
1820 tree vla = build_int_cst (type, target_align);
1821 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1822 fold_build2 (MINUS_EXPR, type,
1823 build_int_cst (type, 0), vla));
1824 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1825 build_int_cst (type, 1));
1828 HOST_WIDE_INT elem_size
1829 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1830 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1832 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1833 tree int_start_addr = fold_convert (type, start_addr);
1834 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1835 target_align_minus_1);
1837 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1838 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1839 elem_size_log);
1841 return misalign_in_elems;
1844 /* Function vect_gen_prolog_loop_niters
1846 Generate the number of iterations which should be peeled as prolog for the
1847 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1848 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1849 As a result, after the execution of this loop, the data reference DR will
1850 refer to an aligned location. The following computation is generated:
1852 If the misalignment of DR is known at compile time:
1853 addr_mis = int mis = DR_MISALIGNMENT (dr);
1854 Else, compute address misalignment in bytes:
1855 addr_mis = addr & (target_align - 1)
1857 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1859 (elem_size = element type size; an element is the scalar element whose type
1860 is the inner type of the vectype)
1862 The computations will be emitted at the end of BB. We also compute and
1863 store the upper bound (inclusive) of the result in BOUND.
1865 When the step of the data-ref in the loop is not 1 (as in interleaved data
1866 and SLP), the number of iterations of the prolog must be divided by the step
1867 (which is equal to the size of the interleaved group).
1869 The above formulas assume that VF == number of elements in the vector. This
1870 may not hold when there are multiple types in the loop.
1871 In this case, for some data-references in the loop the VF does not represent
1872 the number of elements that fit in the vector. Therefore, instead of VF we
1873 use TYPE_VECTOR_SUBPARTS. */
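/* As a worked illustration (numbers invented): for V16QI accesses with a
   16-byte target alignment, elem_size = 1 and step = 1, a data reference
   with addr_mis = 3 gives

     prolog_niters = ((16 - 3/1) & (16 - 1)) / 1 = 13

   i.e. peeling 13 scalar iterations brings DR to a 16-byte boundary.  */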
1875 static tree
1876 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1877 basic_block bb, int *bound)
1879 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1880 tree var;
1881 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1882 gimple_seq stmts = NULL, new_stmts = NULL;
1883 tree iters, iters_name;
1884 stmt_vec_info stmt_info = dr_info->stmt;
1885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1886 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1888 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1890 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE, vect_location,
1894 "known peeling = %d.\n", npeel);
1896 iters = build_int_cst (niters_type, npeel);
1897 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1899 else
1901 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1902 tree type = TREE_TYPE (misalign_in_elems);
1903 HOST_WIDE_INT elem_size
1904 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1905 /* We only do prolog peeling if the target alignment is known at compile
1906 time. */
1907 poly_uint64 align_in_elems =
1908 exact_div (target_align, elem_size);
1909 tree align_in_elems_minus_1 =
1910 build_int_cst (type, align_in_elems - 1);
1911 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1913 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1914 & (align_in_elems - 1)). */
1915 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1916 size_zero_node) < 0;
1917 if (negative)
1918 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1919 align_in_elems_tree);
1920 else
1921 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1922 misalign_in_elems);
1923 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1924 iters = fold_convert (niters_type, iters);
1925 unsigned HOST_WIDE_INT align_in_elems_c;
1926 if (align_in_elems.is_constant (&align_in_elems_c))
1927 *bound = align_in_elems_c - 1;
1928 else
1929 *bound = -1;
1932 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_NOTE, vect_location,
1934 "niters for prolog loop: %T\n", iters);
1936 var = create_tmp_var (niters_type, "prolog_loop_niters");
1937 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1939 if (new_stmts)
1940 gimple_seq_add_seq (&stmts, new_stmts);
1941 if (stmts)
1943 gcc_assert (single_succ_p (bb));
1944 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1945 if (gsi_end_p (gsi))
1946 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1947 else
1948 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1950 return iters_name;
1954 /* Function vect_update_init_of_dr
1956 If CODE is PLUS, the vector loop starts NITERS iterations after the
1957 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1958 iterations before the scalar one (using masking to skip inactive
1959 elements). This function updates the information recorded in DR to
1960 account for the difference. Specifically, it updates the OFFSET
1961 field of DR_INFO. */
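/* For instance (illustrative values): a DR with DR_STEP 8 and a current
   OFFSET of 0, updated with CODE == PLUS_EXPR and NITERS = 3, ends up with
   OFFSET = 0 + 3 * 8 = 24 bytes.  */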
1963 static void
1964 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1966 struct data_reference *dr = dr_info->dr;
1967 tree offset = dr_info->offset;
1968 if (!offset)
1969 offset = build_zero_cst (sizetype);
1971 niters = fold_build2 (MULT_EXPR, sizetype,
1972 fold_convert (sizetype, niters),
1973 fold_convert (sizetype, DR_STEP (dr)));
1974 offset = fold_build2 (code, sizetype,
1975 fold_convert (sizetype, offset), niters);
1976 dr_info->offset = offset;
1980 /* Function vect_update_inits_of_drs
1982 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1983 CODE and NITERS are as for vect_update_inits_of_dr. */
1985 void
1986 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1987 tree_code code)
1989 unsigned int i;
1990 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1991 struct data_reference *dr;
1993 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1995 /* Adjust niters to sizetype. We used to insert the stmts on the loop
1996 preheader here, but since we might use these niters to update the epilogue's
1997 niters and data references we can't insert them here, as this definition
1998 might not always dominate its uses. */
1999 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2000 niters = fold_convert (sizetype, niters);
2002 FOR_EACH_VEC_ELT (datarefs, i, dr)
2004 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2005 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2006 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2007 vect_update_init_of_dr (dr_info, niters, code);
2011 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2012 by masking. This involves calculating the number of iterations to
2013 be peeled and then aligning all memory references appropriately. */
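/* For instance (illustrative): with VF = 8 and a compile-time peeling amount
   of 3 iterations, the code below records MASK_SKIP_NITERS = 8 - 3 = 5 and
   moves each DR's start 5 iterations backwards, so the first 5 lanes of the
   first vector iteration are inactive.  */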
2015 void
2016 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2018 tree misalign_in_elems;
2019 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
2021 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2023 /* From the information recorded in LOOP_VINFO get the number of iterations
2024 that need to be skipped via masking. */
2025 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2027 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2028 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2029 misalign_in_elems = build_int_cst (type, misalign);
2031 else
2033 gimple_seq seq1 = NULL, seq2 = NULL;
2034 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2035 misalign_in_elems = fold_convert (type, misalign_in_elems);
2036 misalign_in_elems = force_gimple_operand (misalign_in_elems,
2037 &seq2, true, NULL_TREE);
2038 gimple_seq_add_seq (&seq1, seq2);
2039 if (seq1)
2041 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2042 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2043 gcc_assert (!new_bb);
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "misalignment for fully-masked loop: %T\n",
2050 misalign_in_elems);
2052 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2054 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2057 /* This function builds ni_name = number of iterations. Statements
2058 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2059 it to TRUE if a new ssa_var is generated. */
2061 tree
2062 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2064 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2065 if (TREE_CODE (ni) == INTEGER_CST)
2066 return ni;
2067 else
2069 tree ni_name, var;
2070 gimple_seq stmts = NULL;
2071 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2073 var = create_tmp_var (TREE_TYPE (ni), "niters");
2074 ni_name = force_gimple_operand (ni, &stmts, false, var);
2075 if (stmts)
2077 gsi_insert_seq_on_edge_immediate (pe, stmts);
2078 if (new_var_p != NULL)
2079 *new_var_p = true;
2082 return ni_name;
2086 /* Calculate the number of iterations above which the vectorized loop will be
2087 preferred to the scalar loop. NITERS_PROLOG is the number of iterations
2088 of the prolog loop. If it's an integer constant, the value is also passed
2089 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2090 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2091 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2092 threshold below which the scalar (rather than vectorized) loop will be
2093 executed. This function stores the upper bound (inclusive) of the result
2094 in BOUND_SCALAR. */
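/* A small illustration (numbers invented): with CHECK_PROFITABILITY, TH = 16
   (so at most 15 scalar iterations), BOUND_PROLOG = 7 and BOUND_EPILOG = 4,
   an unknown prolog count takes the known_ge branch below since 15 >= 7 + 4,
   so the function returns 15 and sets *BOUND_SCALAR to 15.  */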
2096 static tree
2097 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2098 int bound_prolog, poly_int64 bound_epilog, int th,
2099 poly_uint64 *bound_scalar,
2100 bool check_profitability)
2102 tree type = TREE_TYPE (niters_prolog);
2103 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2104 build_int_cst (type, bound_epilog));
2106 *bound_scalar = bound_prolog + bound_epilog;
2107 if (check_profitability)
2109 /* TH indicates the minimum niters of the vectorized loop, while we
2110 compute the maximum niters of the scalar loop. */
2111 th--;
2112 /* Peeling a constant number of times. */
2113 if (int_niters_prolog >= 0)
2115 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2116 return build_int_cst (type, *bound_scalar);
2118 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2119 and BOUND_EPILOG are inclusive upper bounds. */
2120 if (known_ge (th, bound_prolog + bound_epilog))
2122 *bound_scalar = th;
2123 return build_int_cst (type, th);
2125 /* Need to do runtime comparison. */
2126 else if (maybe_gt (th, bound_epilog))
2128 *bound_scalar = upper_bound (*bound_scalar, th);
2129 return fold_build2 (MAX_EXPR, type,
2130 build_int_cst (type, th), niters);
2133 return niters;
2136 /* NITERS is the number of times that the original scalar loop executes
2137 after peeling. Work out the maximum number of iterations N that can
2138 be handled by the vectorized form of the loop and then either:
2140 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2142 niters_vector = N
2144 b) set *STEP_VECTOR_PTR to one and generate:
2146 niters_vector = N / vf
2148 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2149 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2150 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
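/* As a concrete illustration (values invented, ignoring PEELING_FOR_GAPS):
   for case b) with a constant VF of 4, the code below emits

     niters_vector = niters >> 2;

   when NITERS is known not to overflow, and

     niters_vector = ((niters - 4) >> 2) + 1;

   otherwise, in both cases with *STEP_VECTOR_PTR set to 1.  */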
2152 void
2153 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2154 tree *niters_vector_ptr, tree *step_vector_ptr,
2155 bool niters_no_overflow)
2157 tree ni_minus_gap, var;
2158 tree niters_vector, step_vector, type = TREE_TYPE (niters);
2159 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2160 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2161 tree log_vf = NULL_TREE;
2163 /* If epilogue loop is required because of data accesses with gaps, we
2164 subtract one iteration from the total number of iterations here for
2165 correct calculation of RATIO. */
2166 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2168 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2169 build_one_cst (type));
2170 if (!is_gimple_val (ni_minus_gap))
2172 var = create_tmp_var (type, "ni_gap");
2173 gimple *stmts = NULL;
2174 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2175 true, var);
2176 gsi_insert_seq_on_edge_immediate (pe, stmts);
2179 else
2180 ni_minus_gap = niters;
2182 /* To silence some unexpected warnings, simply initialize to 0. */
2183 unsigned HOST_WIDE_INT const_vf = 0;
2184 if (vf.is_constant (&const_vf)
2185 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2187 /* Create: niters >> log2(vf) */
2188 /* If it's known that niters == number of latch executions + 1 doesn't
2189 overflow, we can generate niters >> log2(vf); otherwise we generate
2190 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2191 will be at least one. */
2192 log_vf = build_int_cst (type, exact_log2 (const_vf));
2193 if (niters_no_overflow)
2194 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2195 else
2196 niters_vector
2197 = fold_build2 (PLUS_EXPR, type,
2198 fold_build2 (RSHIFT_EXPR, type,
2199 fold_build2 (MINUS_EXPR, type,
2200 ni_minus_gap,
2201 build_int_cst (type, vf)),
2202 log_vf),
2203 build_int_cst (type, 1));
2204 step_vector = build_one_cst (type);
2206 else
2208 niters_vector = ni_minus_gap;
2209 step_vector = build_int_cst (type, vf);
2212 if (!is_gimple_val (niters_vector))
2214 var = create_tmp_var (type, "bnd");
2215 gimple_seq stmts = NULL;
2216 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2217 gsi_insert_seq_on_edge_immediate (pe, stmts);
2218 /* The peeling algorithm guarantees that the vector loop bound is at least
2219 ONE, so we set range information to make the niters analyzer's life easier.
2220 Note the number of latch iterations can be TYPE_MAX_VALUE, so we have to
2221 represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf. */
2222 if (stmts != NULL && log_vf)
2224 if (niters_no_overflow)
2226 value_range vr (type,
2227 wi::one (TYPE_PRECISION (type)),
2228 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2229 TYPE_SIGN (type)),
2230 exact_log2 (const_vf),
2231 TYPE_SIGN (type)));
2232 set_range_info (niters_vector, vr);
2234 /* For VF == 1 the vector IV might also overflow so we cannot
2235 assert a minimum value of 1. */
2236 else if (const_vf > 1)
2238 value_range vr (type,
2239 wi::one (TYPE_PRECISION (type)),
2240 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2241 TYPE_SIGN (type))
2242 - (const_vf - 1),
2243 exact_log2 (const_vf), TYPE_SIGN (type))
2244 + 1);
2245 set_range_info (niters_vector, vr);
2249 *niters_vector_ptr = niters_vector;
2250 *step_vector_ptr = step_vector;
2252 return;
2255 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2256 loop specified by LOOP_VINFO after vectorization, compute the number
2257 of iterations before vectorization (niters_vector * vf) and store it
2258 to NITERS_VECTOR_MULT_VF_PTR. */
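/* E.g. (illustrative): with a constant VF of 8 the code below emits
   niters_vector_mult_vf = niters_vector << 3 at the start of the loop's
   exit block.  */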
2260 static void
2261 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2262 tree niters_vector,
2263 tree *niters_vector_mult_vf_ptr)
2265 /* We should be using a step_vector of VF if VF is variable. */
2266 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2267 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2268 tree type = TREE_TYPE (niters_vector);
2269 tree log_vf = build_int_cst (type, exact_log2 (vf));
2270 basic_block exit_bb = single_exit (loop)->dest;
2272 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2273 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2274 niters_vector, log_vf);
2275 if (!is_gimple_val (niters_vector_mult_vf))
2277 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2278 gimple_seq stmts = NULL;
2279 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2280 &stmts, true, var);
2281 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2282 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2284 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2287 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2288 this function searches for the corresponding lcssa phi node in exit
2289 bb of LOOP. If it is found, return the phi result; otherwise return
2290 NULL. */
2292 static tree
2293 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2294 gphi *lcssa_phi)
2296 gphi_iterator gsi;
2297 edge e = single_exit (loop);
2299 gcc_assert (single_pred_p (e->dest));
2300 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2302 gphi *phi = gsi.phi ();
2303 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2304 PHI_ARG_DEF (lcssa_phi, 0), 0))
2305 return PHI_RESULT (phi);
2307 return NULL_TREE;
2310 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2311 from SECOND/FIRST and puts it at the original loop's preheader/exit
2312 edge, the two loops are arranged as below:
2314 preheader_a:
2315 first_loop:
2316 header_a:
2317 i_1 = PHI<i_0, i_2>;
2319 i_2 = i_1 + 1;
2320 if (cond_a)
2321 goto latch_a;
2322 else
2323 goto between_bb;
2324 latch_a:
2325 goto header_a;
2327 between_bb:
2328 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2330 second_loop:
2331 header_b:
2332 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2333 or with i_2 if no LCSSA phi is created
2334 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2336 i_4 = i_3 + 1;
2337 if (cond_b)
2338 goto latch_b;
2339 else
2340 goto exit_bb;
2341 latch_b:
2342 goto header_b;
2344 exit_bb:
2346 This function creates loop closed SSA for the first loop; it updates the
2347 second loop's PHI nodes by replacing the argument on the incoming edge with
2348 the result of newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2349 is false, loop closed ssa phis will only be created for non-iv phis of
2350 the first loop.
2352 This function assumes the exit bb of the first loop is the preheader bb of
2353 the second loop, i.e., between_bb in the example code. With PHIs updated,
2354 the second loop will execute the remaining iterations of the first. */
2356 static void
2357 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2358 class loop *first, class loop *second,
2359 bool create_lcssa_for_iv_phis)
2361 gphi_iterator gsi_update, gsi_orig;
2362 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2364 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2365 edge second_preheader_e = loop_preheader_edge (second);
2366 basic_block between_bb = single_exit (first)->dest;
2368 gcc_assert (between_bb == second_preheader_e->src);
2369 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2370 /* Either the first loop or the second is the loop to be vectorized. */
2371 gcc_assert (loop == first || loop == second);
2373 for (gsi_orig = gsi_start_phis (first->header),
2374 gsi_update = gsi_start_phis (second->header);
2375 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2376 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2378 gphi *orig_phi = gsi_orig.phi ();
2379 gphi *update_phi = gsi_update.phi ();
2381 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2382 /* Generate lcssa PHI node for the first loop. */
2383 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2384 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2385 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2387 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2388 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2389 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2390 arg = new_res;
2393 /* Update PHI node in the second loop by replacing arg on the loop's
2394 incoming edge. */
2395 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2398 /* For epilogue peeling we have to make sure to copy all LC PHIs
2399 for correct vectorization of live stmts. */
2400 if (loop == first)
2402 basic_block orig_exit = single_exit (second)->dest;
2403 for (gsi_orig = gsi_start_phis (orig_exit);
2404 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2406 gphi *orig_phi = gsi_orig.phi ();
2407 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2408 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2409 continue;
2411 /* Already created in the above loop. */
2412 if (find_guard_arg (first, second, orig_phi))
2413 continue;
2415 tree new_res = copy_ssa_name (orig_arg);
2416 gphi *lcphi = create_phi_node (new_res, between_bb);
2417 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2422 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2423 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2424 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2425 appear like below:
2427 guard_bb:
2428 if (cond)
2429 goto merge_bb;
2430 else
2431 goto skip_loop;
2433 skip_loop:
2434 header_a:
2435 i_1 = PHI<i_0, i_2>;
2437 i_2 = i_1 + 1;
2438 if (cond_a)
2439 goto latch_a;
2440 else
2441 goto exit_a;
2442 latch_a:
2443 goto header_a;
2445 exit_a:
2446 i_5 = PHI<i_2>;
2448 merge_bb:
2449 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2451 update_loop:
2452 header_b:
2453 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2455 i_4 = i_3 + 1;
2456 if (cond_b)
2457 goto latch_b;
2458 else
2459 goto exit_bb;
2460 latch_b:
2461 goto header_b;
2463 exit_bb:
2465 This function creates PHI nodes at merge_bb and replaces the use of i_5
2466 in the update_loop's PHI node with the result of the new PHI node. */
2468 static void
2469 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2470 class loop *update_loop,
2471 edge guard_edge, edge merge_edge)
2473 location_t merge_loc, guard_loc;
2474 edge orig_e = loop_preheader_edge (skip_loop);
2475 edge update_e = loop_preheader_edge (update_loop);
2476 gphi_iterator gsi_orig, gsi_update;
2478 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2479 gsi_update = gsi_start_phis (update_loop->header));
2480 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2481 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2483 gphi *orig_phi = gsi_orig.phi ();
2484 gphi *update_phi = gsi_update.phi ();
2486 /* Generate new phi node at merge bb of the guard. */
2487 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2488 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2490 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2491 args in NEW_PHI for these edges. */
2492 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2493 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2494 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2495 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2496 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2497 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2499 /* Update phi in UPDATE_PHI. */
2500 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2504 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2505 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2506 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2507 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2508 The CFG looks like:
2510 loop:
2511 header_a:
2512 i_1 = PHI<i_0, i_2>;
2514 i_2 = i_1 + 1;
2515 if (cond_a)
2516 goto latch_a;
2517 else
2518 goto exit_a;
2519 latch_a:
2520 goto header_a;
2522 exit_a:
2524 guard_bb:
2525 if (cond)
2526 goto merge_bb;
2527 else
2528 goto epilog_loop;
2530 ;; fall_through_bb
2532 epilog_loop:
2533 header_b:
2534 i_3 = PHI<i_2, i_4>;
2536 i_4 = i_3 + 1;
2537 if (cond_b)
2538 goto latch_b;
2539 else
2540 goto merge_bb;
2541 latch_b:
2542 goto header_b;
2544 merge_bb:
2545 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2547 exit_bb:
2548 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2550 For each name used outside EPILOG (i.e., for each name that has an lcssa
2551 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2552 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2553 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2554 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2555 in exit_bb will also be updated. */
2557 static void
2558 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2559 edge guard_edge, edge merge_edge)
2561 gphi_iterator gsi;
2562 basic_block merge_bb = guard_edge->dest;
2564 gcc_assert (single_succ_p (merge_bb));
2565 edge e = single_succ_edge (merge_bb);
2566 basic_block exit_bb = e->dest;
2567 gcc_assert (single_pred_p (exit_bb));
2568 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2570 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2572 gphi *update_phi = gsi.phi ();
2573 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2575 tree merge_arg = NULL_TREE;
2577 /* If the old argument is a SSA_NAME use its current_def. */
2578 if (TREE_CODE (old_arg) == SSA_NAME)
2579 merge_arg = get_current_def (old_arg);
2580 /* If it's a constant or doesn't have a current_def, just use the old
2581 argument. */
2582 if (!merge_arg)
2583 merge_arg = old_arg;
2585 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2586 /* If the var is live after loop but not a reduction, we simply
2587 use the old arg. */
2588 if (!guard_arg)
2589 guard_arg = old_arg;
2591 /* Create new phi node in MERGE_BB: */
2592 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2593 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2595 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2596 the two PHI args in merge_phi for these edges. */
2597 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2598 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2600 /* Update the original phi in exit_bb. */
2601 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2605 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2606 the arg of each of its loop closed ssa PHIs needs to be updated. */
2608 static void
2609 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2611 gphi_iterator gsi;
2612 basic_block exit_bb = single_exit (epilog)->dest;
2614 gcc_assert (single_pred_p (exit_bb));
2615 edge e = EDGE_PRED (exit_bb, 0);
2616 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2617 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2620 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2621 iterate exactly CONST_NITERS times. Make a final decision about
2622 whether the epilogue loop should be used, returning true if so. */
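/* For example (illustrative): with CONST_NITERS = 5 and no peeling for gaps,
   the code below installs NITERS = 5 and NITERSM1 = 4 and lets
   vect_determine_partial_vectors_and_peeling make the final call, whereas
   CONST_NITERS = 1 is rejected up front.  */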
2624 static bool
2625 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2626 unsigned HOST_WIDE_INT const_niters)
2628 /* Avoid wrap-around when computing const_niters - 1. Also reject
2629 using an epilogue loop for a single scalar iteration, even if
2630 we could in principle implement that using partial vectors. */
2631 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2632 if (const_niters <= gap_niters + 1)
2633 return false;
2635 /* Install the number of iterations. */
2636 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2637 tree niters_tree = build_int_cst (niters_type, const_niters);
2638 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2640 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2641 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2643 /* Decide what to do if the number of epilogue iterations is not
2644 a multiple of the epilogue loop's vectorization factor. */
2645 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2648 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2649 Return a value that equals:
2651 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2652 - SKIP_VALUE when the main loop is skipped. */
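/* E.g. (illustrative) the resulting merge looks like

     result = PHI <main_loop_value (main_loop_edge),
                   skip_value (skip_main_loop_edge)>

   in the destination block of main_loop_edge.  */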
2654 tree
2655 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2656 tree skip_value)
2658 gcc_assert (loop_vinfo->main_loop_edge);
2660 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2661 basic_block bb = loop_vinfo->main_loop_edge->dest;
2662 gphi *new_phi = create_phi_node (phi_result, bb);
2663 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2664 UNKNOWN_LOCATION);
2665 add_phi_arg (new_phi, skip_value,
2666 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2667 return phi_result;
2670 /* Function vect_do_peeling.
2672 Input:
2673 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2675 preheader:
2676 LOOP:
2677 header_bb:
2678 loop_body
2679 if (exit_loop_cond) goto exit_bb
2680 else goto header_bb
2681 exit_bb:
2683 - NITERS: The number of iterations of the loop.
2684 - NITERSM1: The number of iterations of the loop's latch.
2685 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2686 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2687 CHECK_PROFITABILITY is true.
2688 Output:
2689 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2690 iterate after vectorization; see vect_set_loop_condition for details.
2691 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2692 should be set to the number of scalar iterations handled by the
2693 vector loop. The SSA name is only used on exit from the loop.
2695 This function peels prolog and epilog from the loop, adds guards skipping
2696 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2697 would look like:
2699 guard_bb_1:
2700 if (prefer_scalar_loop) goto merge_bb_1
2701 else goto guard_bb_2
2703 guard_bb_2:
2704 if (skip_prolog) goto merge_bb_2
2705 else goto prolog_preheader
2707 prolog_preheader:
2708 PROLOG:
2709 prolog_header_bb:
2710 prolog_body
2711 if (exit_prolog_cond) goto prolog_exit_bb
2712 else goto prolog_header_bb
2713 prolog_exit_bb:
2715 merge_bb_2:
2717 vector_preheader:
2718 VECTOR LOOP:
2719 vector_header_bb:
2720 vector_body
2721 if (exit_vector_cond) goto vector_exit_bb
2722 else goto vector_header_bb
2723 vector_exit_bb:
2725 guard_bb_3:
2726 if (skip_epilog) goto merge_bb_3
2727 else goto epilog_preheader
2729 merge_bb_1:
2731 epilog_preheader:
2732 EPILOG:
2733 epilog_header_bb:
2734 epilog_body
2735 if (exit_epilog_cond) goto merge_bb_3
2736 else goto epilog_header_bb
2738 merge_bb_3:
2740 Note this function peels the prolog and epilog, and adds the guards,
2741 only when necessary.
2742 This function returns the epilogue loop if a decision was made to vectorize
2743 it, otherwise NULL.
2745 The analysis resulting in this epilogue loop's loop_vec_info was performed
2746 in the same vect_analyze_loop call as the main loop's. At that time
2747 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2748 vectorization factors than the main loop. This list is stored in the main
2749 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2750 vectorize the epilogue loop for a lower vectorization factor, the
2751 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2752 updated and linked to the epilogue loop. This is later used to vectorize
2753 the epilogue. The reason the loop_vec_info needs updating is that it was
2754 constructed based on the original main loop, and the epilogue loop is a
2755 copy of this loop, so all links pointing to statements in the original loop
2756 need updating. Furthermore, these loop_vec_infos share the
2757 data_reference's records, which will also need to be updated.
2759 TODO: Guard for prefer_scalar_loop should be emitted along with
2760 versioning conditions if loop versioning is needed. */
2763 class loop *
2764 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2765 tree *niters_vector, tree *step_vector,
2766 tree *niters_vector_mult_vf_var, int th,
2767 bool check_profitability, bool niters_no_overflow,
2768 tree *advance)
2770 edge e, guard_e;
2771 tree type = TREE_TYPE (niters), guard_cond;
2772 basic_block guard_bb, guard_to;
2773 profile_probability prob_prolog, prob_vector, prob_epilog;
2774 int estimated_vf;
2775 int prolog_peeling = 0;
2776 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2777 bool vect_epilogues_updated_niters = false;
2778 /* We currently do not support prolog peeling if the target alignment is not
2779 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2780 target alignment being constant. */
2781 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2782 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2783 return NULL;
2785 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2786 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2788 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2789 poly_uint64 bound_epilog = 0;
2790 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2791 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2792 bound_epilog += vf - 1;
2793 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2794 bound_epilog += 1;
2795 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2796 poly_uint64 bound_scalar = bound_epilog;
2798 if (!prolog_peeling && !epilog_peeling)
2799 return NULL;
2801 /* Before doing any peeling make sure to reset debug binds outside of
2802 the loop referring to defs not in LC SSA. */
2803 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2804 for (unsigned i = 0; i < loop->num_nodes; ++i)
2806 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2807 imm_use_iterator ui;
2808 gimple *use_stmt;
2809 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2810 gsi_next (&gsi))
2812 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2813 if (gimple_debug_bind_p (use_stmt)
2814 && loop != gimple_bb (use_stmt)->loop_father
2815 && !flow_loop_nested_p (loop,
2816 gimple_bb (use_stmt)->loop_father))
2818 gimple_debug_bind_reset_value (use_stmt);
2819 update_stmt (use_stmt);
2822 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2823 gsi_next (&gsi))
2825 ssa_op_iter op_iter;
2826 def_operand_p def_p;
2827 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2828 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2829 if (gimple_debug_bind_p (use_stmt)
2830 && loop != gimple_bb (use_stmt)->loop_father
2831 && !flow_loop_nested_p (loop,
2832 gimple_bb (use_stmt)->loop_father))
2834 gimple_debug_bind_reset_value (use_stmt);
2835 update_stmt (use_stmt);
2840 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2841 estimated_vf = vect_vf_for_cost (loop_vinfo);
2842 if (estimated_vf == 2)
2843 estimated_vf = 3;
2844 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2845 .apply_scale (estimated_vf - 1, estimated_vf);
2847 class loop *prolog, *epilog = NULL;
2848 class loop *first_loop = loop;
2849 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2851 /* SSA form needs to be up-to-date since we are going to manually
2852 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2853 SSA update state after that, so we have to make sure not to lose any
2854 pending update needs. */
2855 gcc_assert (!need_ssa_update_p (cfun));
2857 /* If we're vectorizing an epilogue loop, we have ensured that the
2858 virtual operand is in SSA form throughout the vectorized main loop.
2859 Normally it is possible to trace the updated
2860 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2861 back to scalar-stmt vuses, meaning that the effect of the SSA update
2862 remains local to the main loop. However, there are rare cases in
2863 which the vectorized loop should have vdefs even when the original scalar
2864 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2865 introduces clobbers of the temporary vector array, which in turn
2866 needs new vdefs. If the scalar loop doesn't write to memory, these
2867 new vdefs will be the only ones in the vector loop.
2868 We are currently deferring the update of virtual SSA form and the creation
2869 of a virtual PHI for this case, so we do not have to make sure the
2870 newly introduced virtual def is in LCSSA form. */
2872 if (MAY_HAVE_DEBUG_BIND_STMTS)
2874 gcc_assert (!adjust_vec.exists ());
2875 adjust_vec.create (32);
2877 initialize_original_copy_tables ();
2879 /* Record the anchor bb at which the guard should be placed if the scalar
2880 loop might be preferred. */
2881 basic_block anchor = loop_preheader_edge (loop)->src;
2883 /* Generate the number of iterations for the prolog loop. We do this here
2884 so that we can also get the upper bound on the number of iterations. */
2885 tree niters_prolog;
2886 int bound_prolog = 0;
2887 if (prolog_peeling)
2888 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2889 &bound_prolog);
2890 else
2891 niters_prolog = build_int_cst (type, 0);
2893 loop_vec_info epilogue_vinfo = NULL;
2894 if (vect_epilogues)
2896 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2897 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2900 tree niters_vector_mult_vf = NULL_TREE;
2901 /* Save NITERS before the loop, as it may be changed by the prologue. */
2902 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2903 edge update_e = NULL, skip_e = NULL;
2904 unsigned int lowest_vf = constant_lower_bound (vf);
2905 /* If we know the number of scalar iterations for the main loop we should
2906 check whether after the main loop there are enough iterations left over
2907 for the epilogue. */
2908 if (vect_epilogues
2909 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2910 && prolog_peeling >= 0
2911 && known_eq (vf, lowest_vf))
2913 unsigned HOST_WIDE_INT eiters
2914 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2915 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2917 eiters -= prolog_peeling;
2918 eiters
2919 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2921 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2923 delete epilogue_vinfo;
2924 epilogue_vinfo = NULL;
2925 if (loop_vinfo->epilogue_vinfos.length () == 0)
2927 vect_epilogues = false;
2928 break;
2930 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2931 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2933 vect_epilogues_updated_niters = true;
2935 /* Prolog loop may be skipped. */
2936 bool skip_prolog = (prolog_peeling != 0);
2937 /* Skip this loop to the epilog when there are not enough iterations to
2938 enter the vectorized loop. If true we should perform runtime checks on the
2939 NITERS to check whether we should skip the current vectorized loop. If we
2940 know the number of scalar iterations, we may choose to add a runtime check
2941 when this number may potentially be smaller than the number of iterations
2942 required to enter the vectorized loop; for this we use the upper bounds on
2943 the prolog and epilog peeling. When we
2944 don't know the number of iterations and don't require versioning it is
2945 because we have asserted that there are enough scalar iterations to enter
2946 the main loop, so this skip is not necessary. When we are versioning then
2947 we only add such a skip if we have chosen to vectorize the epilogue. */
2949 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2950 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2951 bound_prolog + bound_epilog)
2952 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2953 || vect_epilogues));
2954 /* Epilog loop must be executed if the number of iterations for epilog
2955 loop is known at compile time, otherwise we need to add a check at
2956 the end of vector loop and skip to the end of epilog loop. */
2957 bool skip_epilog = (prolog_peeling < 0
2958 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2959 || !vf.is_constant ());
2960 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2961 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2962 skip_epilog = false;
2964 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2965 auto_vec<profile_count> original_counts;
2966 basic_block *original_bbs = NULL;
2968 if (skip_vector)
2970 split_edge (loop_preheader_edge (loop));
2972 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
2974 original_bbs = get_loop_body (loop);
2975 for (unsigned int i = 0; i < loop->num_nodes; i++)
2976 original_counts.safe_push(original_bbs[i]->count);
2979 /* Due to the order in which we peel prolog and epilog, we first
2980 propagate probability to the whole loop. The purpose is to
2981 avoid adjusting probabilities of both prolog and vector loops
2982 separately. Note in this case, the probability of epilog loop
2983 needs to be scaled back later. */
2984 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2985 if (prob_vector.initialized_p ())
2987 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2988 scale_loop_profile (loop, prob_vector, 0);
2992 if (vect_epilogues)
2993 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2994 use the original scalar loop as remaining epilogue if necessary. */
2995 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2996 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2998 if (prolog_peeling)
3000 e = loop_preheader_edge (loop);
3001 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
3003 /* Peel prolog and put it on preheader edge of loop. */
3004 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
3005 gcc_assert (prolog);
3006 prolog->force_vectorize = false;
3007 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
3008 first_loop = prolog;
3009 reset_original_copy_tables ();
3011 /* Update the number of iterations for prolog loop. */
3012 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3013 vect_set_loop_condition (prolog, NULL, niters_prolog,
3014 step_prolog, NULL_TREE, false);
3016 /* Skip the prolog loop. */
3017 if (skip_prolog)
3019 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3020 niters_prolog, build_int_cst (type, 0));
3021 guard_bb = loop_preheader_edge (prolog)->src;
3022 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3023 guard_to = split_edge (loop_preheader_edge (loop));
3024 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3025 guard_to, guard_bb,
3026 prob_prolog.invert (),
3027 irred_flag);
3028 e = EDGE_PRED (guard_to, 0);
3029 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3030 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3032 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3033 scale_loop_profile (prolog, prob_prolog, bound_prolog);
3036 /* Update init address of DRs. */
3037 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3038 /* Update niters for vector loop. */
3039 LOOP_VINFO_NITERS (loop_vinfo)
3040 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3041 LOOP_VINFO_NITERSM1 (loop_vinfo)
3042 = fold_build2 (MINUS_EXPR, type,
3043 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3044 bool new_var_p = false;
3045 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3046 /* It's guaranteed that vector loop bound before vectorization is at
3047 least VF, so set range information for newly generated var. */
3048 if (new_var_p)
3050 value_range vr (type,
3051 wi::to_wide (build_int_cst (type, lowest_vf)),
3052 wi::to_wide (TYPE_MAX_VALUE (type)));
3053 set_range_info (niters, vr);
3056 /* Prolog iterates at most bound_prolog times, latch iterates at
3057 most bound_prolog - 1 times. */
3058 record_niter_bound (prolog, bound_prolog - 1, false, true);
3059 delete_update_ssa ();
3060 adjust_vec_debug_stmts ();
3061 scev_reset ();
3064 if (epilog_peeling)
3066 e = single_exit (loop);
3067 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
3069 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3070 said epilog then we should use a copy of the main loop as a starting
3071 point. This loop may have already had some preliminary transformations
3072 to allow for more optimal vectorization, for example if-conversion.
3073 If we are not vectorizing the epilog then we should use the scalar loop
3074 as the transformations mentioned above make little or no sense when not
3075 vectorizing. */
3076 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3077 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
3078 gcc_assert (epilog);
3080 epilog->force_vectorize = false;
3081 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
3083 /* The scalar version of the loop may be preferred. In this case, add a
3084 guard and skip to the epilog. Note this only happens when the number of
3085 iterations of the loop is unknown at compile time, otherwise this
3086 won't be vectorized. */
3087 if (skip_vector)
3089 /* Additional epilogue iteration is peeled if gap exists. */
3090 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3091 bound_prolog, bound_epilog,
3092 th, &bound_scalar,
3093 check_profitability);
3094 /* Build guard against NITERSM1 since NITERS may overflow. */
3095 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3096 guard_bb = anchor;
3097 guard_to = split_edge (loop_preheader_edge (epilog));
3098 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3099 guard_to, guard_bb,
3100 prob_vector.invert (),
3101 irred_flag);
3102 skip_e = guard_e;
3103 e = EDGE_PRED (guard_to, 0);
3104 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3105 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3107 /* Simply propagate profile info from guard_bb to guard_to which is
3108 a merge point of control flow. */
3109 guard_to->count = guard_bb->count;
3111 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3112 if (vect_epilogues || scalar_loop == NULL)
3114 gcc_assert(epilog->num_nodes == loop->num_nodes);
3115 basic_block *bbs = get_loop_body (epilog);
3116 for (unsigned int i = 0; i < epilog->num_nodes; i++)
3118 gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3119 bbs[i]->count = original_counts[i];
3121 free (bbs);
3122 free (original_bbs);
3126 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
3127 /* If the loop is peeled a non-zero constant number of times, niters now
3128 refers to orig_niters - prolog_peeling; it won't overflow even if
3129 orig_niters overflows. */
3130 niters_no_overflow |= (prolog_peeling > 0);
3131 vect_gen_vector_loop_niters (loop_vinfo, niters,
3132 niters_vector, step_vector,
3133 niters_no_overflow);
3134 if (!integer_onep (*step_vector))
3136 /* On exit from the loop we will have an easy way of calculating
3137 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3138 until then. */
3139 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3140 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3141 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3143 else
3144 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3145 &niters_vector_mult_vf);
3146 /* Update IVs of original loop as if they were advanced by
3147 niters_vector_mult_vf steps. */
3148 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3149 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3150 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3151 update_e);
3153 if (skip_epilog)
3155 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3156 niters, niters_vector_mult_vf);
3157 guard_bb = single_exit (loop)->dest;
3158 guard_to = split_edge (single_exit (epilog));
3159 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3160 skip_vector ? anchor : guard_bb,
3161 prob_epilog.invert (),
3162 irred_flag);
3163 if (vect_epilogues)
3164 epilogue_vinfo->skip_this_loop_edge = guard_e;
3165 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3166 single_exit (epilog));
3167 /* Only need to handle basic block before epilog loop if it's not
3168 the guard_bb, which is the case when skip_vector is true. */
3169 if (guard_bb != bb_before_epilog)
3171 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3173 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3175 scale_loop_profile (epilog, prob_epilog, 0);
3177 else
3178 slpeel_update_phi_nodes_for_lcssa (epilog);
3180 unsigned HOST_WIDE_INT bound;
3181 if (bound_scalar.is_constant (&bound))
3183 gcc_assert (bound != 0);
3184 /* -1 to convert loop iterations to latch iterations. */
3185 record_niter_bound (epilog, bound - 1, false, true);
3188 delete_update_ssa ();
3189 adjust_vec_debug_stmts ();
3190 scev_reset ();
3193 if (vect_epilogues)
3195 epilog->aux = epilogue_vinfo;
3196 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3198 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3200 /* We now must calculate the number of NITERS performed by the previous
3201 loop and EPILOGUE_NITERS to be performed by the epilogue. */
3202 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3203 niters_prolog, niters_vector_mult_vf);
3205 /* If skip_vector, we may skip the previous loop; we insert a phi-node to
3206 determine whether we are coming from the previous vectorized loop
3207 using the update_e edge or from the skip_vector basic block using the
3208 skip_e edge. */
3209 if (skip_vector)
3211 gcc_assert (update_e != NULL
3212 && skip_e != NULL
3213 && !vect_epilogues_updated_niters);
3214 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3215 update_e->dest);
3216 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3217 gimple *stmt = gimple_build_assign (new_ssa, niters);
3218 gimple_stmt_iterator gsi;
3219 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3220 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3222 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3223 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3225 else
3227 gsi = gsi_last_bb (update_e->src);
3228 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3231 niters = new_ssa;
3232 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3233 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3234 UNKNOWN_LOCATION);
3235 niters = PHI_RESULT (new_phi);
3236 epilogue_vinfo->main_loop_edge = update_e;
3237 epilogue_vinfo->skip_main_loop_edge = skip_e;
3240 /* Set ADVANCE to the number of iterations performed by the previous
3241 loop and its prologue. */
3242 *advance = niters;
3244 if (!vect_epilogues_updated_niters)
3246 /* Subtract the number of iterations performed by the vectorized loop
3247 from the number of total iterations. */
3248 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3249 before_loop_niters,
3250 niters);
3252 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3253 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3254 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3255 epilogue_niters,
3256 build_one_cst (TREE_TYPE (epilogue_niters)));
3258 /* Decide what to do if the number of epilogue iterations is not
3259 a multiple of the epilogue loop's vectorization factor.
3260 We should have rejected the loop during the analysis phase
3261 if this fails. */
3262 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3263 true))
3264 gcc_unreachable ();
3268 adjust_vec.release ();
3269 free_original_copy_tables ();
3271 return vect_epilogues ? epilog : NULL;
3274 /* Function vect_create_cond_for_niters_checks.
3276 Create a conditional expression that represents the run-time checks for
3277 the loop's niters. The loop is guaranteed to terminate if the run-time
3278 checks hold.
3280 Input:
3281 COND_EXPR - input conditional expression. New conditions will be chained
3282 with logical AND operation. If it is NULL, the niters
3283 assumptions are used as the condition on their own.
3284 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3285 to be checked.
3287 Output:
3288 COND_EXPR - conditional expression.
3290 The returned COND_EXPR is the conditional expression to be used in the
3291 if statement that controls which version of the loop gets executed at
3292 runtime. */
3294 static void
3295 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3297 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3299 if (*cond_expr)
3300 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3301 *cond_expr, part_cond_expr);
3302 else
3303 *cond_expr = part_cond_expr;
3306 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3307 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3309 static void
3310 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3312 if (*cond_expr)
3313 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3314 *cond_expr, part_cond_expr);
3315 else
3316 *cond_expr = part_cond_expr;
3319 /* Function vect_create_cond_for_align_checks.
3321 Create a conditional expression that represents the alignment checks for
3322 all of data references (array element references) whose alignment must be
3323 checked at runtime.
3325 Input:
3326 COND_EXPR - input conditional expression. New conditions will be chained
3327 with logical AND operation.
3328 LOOP_VINFO - two fields of the loop information are used.
3329 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3330 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3332 Output:
3333 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3334 expression.
3335 The returned value is the conditional expression to be used in the if
3336 statement that controls which version of the loop gets executed at runtime.
3338 The algorithm makes two assumptions:
3339 1) The number of bytes "n" in a vector is a power of 2.
3340 2) An address "a" is aligned if a%n is zero, and this
3341 test can be done as a&(n-1) == 0. For example, for 16
3342 byte vectors the test is a&0xf == 0. */
3344 static void
3345 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3346 tree *cond_expr,
3347 gimple_seq *cond_expr_stmt_list)
3349 const vec<stmt_vec_info> &may_misalign_stmts
3350 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3351 stmt_vec_info stmt_info;
3352 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3353 tree mask_cst;
3354 unsigned int i;
3355 tree int_ptrsize_type;
3356 char tmp_name[20];
3357 tree or_tmp_name = NULL_TREE;
3358 tree and_tmp_name;
3359 gimple *and_stmt;
3360 tree ptrsize_zero;
3361 tree part_cond_expr;
3363 /* Check that mask is one less than a power of 2, i.e., mask is
3364 all zeros followed by all ones. */
3365 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3367 int_ptrsize_type = signed_type_for (ptr_type_node);
3369 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3370 of the first vector of the i'th data reference. */
3372 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3374 gimple_seq new_stmt_list = NULL;
3375 tree addr_base;
3376 tree addr_tmp_name;
3377 tree new_or_tmp_name;
3378 gimple *addr_stmt, *or_stmt;
3379 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3380 bool negative = tree_int_cst_compare
3381 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3382 tree offset = negative
3383 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3384 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3385 : size_zero_node;
3387 /* create: addr_tmp = (int)(address_of_first_vector) */
3388 addr_base =
3389 vect_create_addr_base_for_vector_ref (loop_vinfo,
3390 stmt_info, &new_stmt_list,
3391 offset);
3392 if (new_stmt_list != NULL)
3393 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3395 sprintf (tmp_name, "addr2int%d", i);
3396 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3397 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3398 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3400 /* The addresses are ORed together. */
3402 if (or_tmp_name != NULL_TREE)
3404 /* create: or_tmp = or_tmp | addr_tmp */
3405 sprintf (tmp_name, "orptrs%d", i);
3406 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3407 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3408 or_tmp_name, addr_tmp_name);
3409 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3410 or_tmp_name = new_or_tmp_name;
3412 else
3413 or_tmp_name = addr_tmp_name;
3415 } /* end for i */
3417 mask_cst = build_int_cst (int_ptrsize_type, mask);
3419 /* create: and_tmp = or_tmp & mask */
3420 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3422 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3423 or_tmp_name, mask_cst);
3424 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3426 /* Make and_tmp the left operand of the conditional test against zero.
3427 If and_tmp has a nonzero bit then some address is unaligned. */
3428 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3429 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3430 and_tmp_name, ptrsize_zero);
3431 chain_cond_expr (cond_expr, part_cond_expr);
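/* Illustrative sketch (added for exposition, not part of the vectorizer):
   the run-time test built above, written as plain C.  All candidate
   addresses are ORed together and then masked; every address is aligned
   iff no low bit survives.  The 16-byte vector size here is an assumption
   for the example only (LOOP_VINFO_PTR_MASK would then be 0xf).  */
#if 0
#include <stdint.h>

static int
all_aligned_p (const void *a, const void *b, const void *c)
{
  const uintptr_t mask = 16 - 1;	/* one less than a power of 2 */
  uintptr_t or_tmp = (uintptr_t) a | (uintptr_t) b | (uintptr_t) c;
  /* Equivalent to a % 16 == 0 && b % 16 == 0 && c % 16 == 0.  */
  return (or_tmp & mask) == 0;
}
#endif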
3434 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3435 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3436 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3437 and this new condition are true. Treat a null *COND_EXPR as "true". */
3439 static void
3440 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3442 const vec<vec_object_pair> &pairs
3443 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3444 unsigned int i;
3445 vec_object_pair *pair;
3446 FOR_EACH_VEC_ELT (pairs, i, pair)
3448 tree addr1 = build_fold_addr_expr (pair->first);
3449 tree addr2 = build_fold_addr_expr (pair->second);
3450 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3451 addr1, addr2);
3452 chain_cond_expr (cond_expr, part_cond_expr);
3456 /* Create an expression that is true when all lower-bound conditions for
3457 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3459 static void
3460 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3462 const vec<vec_lower_bound> &lower_bounds
3463 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3464 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3466 tree expr = lower_bounds[i].expr;
3467 tree type = unsigned_type_for (TREE_TYPE (expr));
3468 expr = fold_convert (type, expr);
3469 poly_uint64 bound = lower_bounds[i].min_value;
3470 if (!lower_bounds[i].unsigned_p)
3472 expr = fold_build2 (PLUS_EXPR, type, expr,
3473 build_int_cstu (type, bound - 1));
3474 bound += bound - 1;
3476 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3477 build_int_cstu (type, bound));
3478 chain_cond_expr (cond_expr, part_cond_expr);
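/* Illustrative sketch (added for exposition, not part of the vectorizer):
   the unsigned trick used above, written as plain C for a 64-bit
   expression.  For a possibly-negative EXPR the recorded lower bound is
   effectively a bound on abs (expr), and the loop above builds that test
   branchlessly as "(unsigned) expr + (min - 1) >= 2 * min - 1".  MIN is
   assumed to be small relative to the type, as the vectorizer's bounds are.  */
#if 0
#include <stdint.h>

static int
abs_ge_p (int64_t expr, uint64_t min)
{
  return (uint64_t) expr + (min - 1) >= 2 * min - 1;
}

/* abs_ge_p (16, 16) and abs_ge_p (-16, 16) hold;
   abs_ge_p (15, 16) and abs_ge_p (-15, 16) do not.  */
#endif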
3482 /* Function vect_create_cond_for_alias_checks.
3484 Create a conditional expression that represents the run-time checks for
3485 overlapping of address ranges represented by a list of data references
3486 relations passed as input.
3488 Input:
3489 COND_EXPR - input conditional expression. New conditions will be chained
3490 with logical AND operation. If it is NULL, the alias checks
3491 become the condition on their own.
3492 LOOP_VINFO - field LOOP_VINFO_COMP_ALIAS_DDRS contains the list of ddr
3493 pairs to be checked.
3495 Output:
3496 COND_EXPR - conditional expression.
3498 The returned COND_EXPR is the conditional expression to be used in the if
3499 statement that controls which version of the loop gets executed at runtime.
3502 void
3503 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3505 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3506 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3508 if (comp_alias_ddrs.is_empty ())
3509 return;
3511 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3512 &comp_alias_ddrs, cond_expr);
3513 if (dump_enabled_p ())
3514 dump_printf_loc (MSG_NOTE, vect_location,
3515 "created %u versioning for alias checks.\n",
3516 comp_alias_ddrs.length ());
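/* Illustrative sketch (added for exposition, not part of the vectorizer):
   the rough shape of one run-time alias check, written as plain C.  Two
   accesses covering SEG_A and SEG_B bytes starting at A and B are
   independent if one segment ends before the other begins.
   create_runtime_alias_checks emits several specialised variants of this
   idea; the sketch below only shows the generic form and ignores
   address-space wrap-around.  */
#if 0
#include <stdint.h>

static int
segments_independent_p (const char *a, uint64_t seg_a,
			const char *b, uint64_t seg_b)
{
  return (uintptr_t) a + seg_a <= (uintptr_t) b
	 || (uintptr_t) b + seg_b <= (uintptr_t) a;
}
#endif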
3520 /* Function vect_loop_versioning.
3522 If the loop has data references that may or may not be aligned and/or
3523 has data reference relations whose independence was not proven, then
3524 two versions of the loop need to be generated, one which is vectorized
3525 and one which isn't. A test is then generated to control which of the
3526 loops is executed. The test checks for the alignment of all of the
3527 data references that may or may not be aligned. An additional
3528 sequence of runtime tests is generated for each pair of DDRs whose
3529 independence was not proven. The vectorized version of the loop is
3530 executed only if both the alias and alignment tests pass.
3532 The test generated to check which version of loop is executed
3533 is modified to also check for profitability as indicated by the
3534 cost model threshold TH.
3536 The versioning precondition(s) are placed in *COND_EXPR and
3537 *COND_EXPR_STMT_LIST. */
3539 class loop *
3540 vect_loop_versioning (loop_vec_info loop_vinfo,
3541 gimple *loop_vectorized_call)
3543 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3544 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3545 basic_block condition_bb;
3546 gphi_iterator gsi;
3547 gimple_stmt_iterator cond_exp_gsi;
3548 basic_block merge_bb;
3549 basic_block new_exit_bb;
3550 edge new_exit_e, e;
3551 gphi *orig_phi, *new_phi;
3552 tree cond_expr = NULL_TREE;
3553 gimple_seq cond_expr_stmt_list = NULL;
3554 tree arg;
3555 profile_probability prob = profile_probability::likely ();
3556 gimple_seq gimplify_stmt_list = NULL;
3557 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3558 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3559 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3560 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3561 poly_uint64 versioning_threshold
3562 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3563 tree version_simd_if_cond
3564 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3565 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
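  /* Note: SCALAR_LOOP_ITERS above is LOOP_VINFO_NITERSM1, i.e. the
     iteration count minus one, so the ">= th - 1" and
     ">= versioning_threshold - 1" comparisons below implement the checks
     "niters >= th" and "niters >= versioning_threshold".  */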
3567 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3568 && !ordered_p (th, versioning_threshold))
3569 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3570 build_int_cst (TREE_TYPE (scalar_loop_iters),
3571 th - 1));
3572 if (maybe_ne (versioning_threshold, 0U))
3574 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3575 build_int_cst (TREE_TYPE (scalar_loop_iters),
3576 versioning_threshold - 1));
3577 if (cond_expr)
3578 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3579 expr, cond_expr);
3580 else
3581 cond_expr = expr;
3584 tree cost_name = NULL_TREE;
3585 profile_probability prob2 = profile_probability::uninitialized ();
3586 if (cond_expr
3587 && EXPR_P (cond_expr)
3588 && (version_niter
3589 || version_align
3590 || version_alias
3591 || version_simd_if_cond))
3593 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3594 &cond_expr_stmt_list,
3595 is_gimple_val, NULL_TREE);
3596 /* Split prob () into two so that the overall probability of passing
3597 both the cost-model and versioning checks is the orig prob. */
3598 prob2 = prob.split (prob);
3601 if (version_niter)
3602 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3604 if (cond_expr)
3606 gimple_seq tem = NULL;
3607 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3608 &tem, is_gimple_condexpr_for_cond,
3609 NULL_TREE);
3610 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3613 if (version_align)
3614 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3615 &cond_expr_stmt_list);
3617 if (version_alias)
3619 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3620 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3621 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3624 if (version_simd_if_cond)
3626 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3627 if (flag_checking)
3628 if (basic_block bb
3629 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3630 gcc_assert (bb != loop->header
3631 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3632 && (scalar_loop == NULL
3633 || (bb != scalar_loop->header
3634 && dominated_by_p (CDI_DOMINATORS,
3635 scalar_loop->header, bb))));
3636 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3637 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3638 version_simd_if_cond, zero);
3639 if (cond_expr)
3640 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3641 c, cond_expr);
3642 else
3643 cond_expr = c;
3644 if (dump_enabled_p ())
3645 dump_printf_loc (MSG_NOTE, vect_location,
3646 "created versioning for simd if condition check.\n");
3649 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3650 &gimplify_stmt_list,
3651 is_gimple_condexpr_for_cond, NULL_TREE);
3652 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3654 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3655 invariant in. */
3656 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3657 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3658 !gsi_end_p (gsi); gsi_next (&gsi))
3660 gimple *stmt = gsi_stmt (gsi);
3661 update_stmt (stmt);
3662 ssa_op_iter iter;
3663 use_operand_p use_p;
3664 basic_block def_bb;
3665 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3666 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3667 && flow_bb_inside_loop_p (outermost, def_bb))
3668 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3671 /* Search for the outermost loop we can version. Avoid versioning of
3672 non-perfect nests but allow if-conversion versioned loops inside. */
3673 class loop *loop_to_version = loop;
3674 if (flow_loop_nested_p (outermost, loop))
3676 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_NOTE, vect_location,
3678 "trying to apply versioning to outer loop %d\n",
3679 outermost->num);
3680 if (outermost->num == 0)
3681 outermost = superloop_at_depth (loop, 1);
3682 /* And avoid applying versioning on non-perfect nests. */
3683 while (loop_to_version != outermost
3684 && (e = single_exit (loop_outer (loop_to_version)))
3685 && !(e->flags & EDGE_COMPLEX)
3686 && (!loop_outer (loop_to_version)->inner->next
3687 || vect_loop_vectorized_call (loop_to_version))
3688 && (!loop_outer (loop_to_version)->inner->next
3689 || !loop_outer (loop_to_version)->inner->next->next))
3690 loop_to_version = loop_outer (loop_to_version);
3693 /* Apply versioning. If there is already a scalar version created by
3694 if-conversion, re-use that. Note we cannot re-use the copy of
3695 an if-converted outer-loop when vectorizing the inner loop only. */
3696 gcond *cond;
3697 if ((!loop_to_version->inner || loop == loop_to_version)
3698 && loop_vectorized_call)
3700 gcc_assert (scalar_loop);
3701 condition_bb = gimple_bb (loop_vectorized_call);
3702 cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
3703 gimple_cond_set_condition_from_tree (cond, cond_expr);
3704 update_stmt (cond);
3706 if (cond_expr_stmt_list)
3708 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3709 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3710 GSI_SAME_STMT);
3713 /* if-conversion uses profile_probability::always () for both paths;
3714 reset the probabilities of the two paths appropriately. */
3715 edge te, fe;
3716 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3717 te->probability = prob;
3718 fe->probability = prob.invert ();
3719 /* We can scale the loop counts immediately but have to postpone
3720 scaling the scalar loop because we re-use it during peeling. */
3721 scale_loop_frequencies (loop_to_version, te->probability);
3722 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3724 nloop = scalar_loop;
3725 if (dump_enabled_p ())
3726 dump_printf_loc (MSG_NOTE, vect_location,
3727 "reusing %sloop version created by if conversion\n",
3728 loop_to_version != loop ? "outer " : "");
3730 else
3732 if (loop_to_version != loop
3733 && dump_enabled_p ())
3734 dump_printf_loc (MSG_NOTE, vect_location,
3735 "applying loop versioning to outer loop %d\n",
3736 loop_to_version->num);
3738 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3740 initialize_original_copy_tables ();
3741 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3742 prob, prob.invert (), prob, prob.invert (), true);
3743 gcc_assert (nloop);
3744 nloop = get_loop_copy (loop);
3746 /* For cycle vectorization with SLP we rely on the PHI arguments
3747 appearing in the same order as the SLP node operands, which for the
3748 loop PHI nodes means the preheader edge dest index needs to remain
3749 the same for the analyzed loop, which also becomes the vectorized one.
3750 Make it so in case the state after versioning differs, by redirecting
3751 the first edge into the header to the same destination, which moves
3752 it last. */
3753 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3755 edge e = EDGE_PRED (loop->header, 0);
3756 ssa_redirect_edge (e, e->dest);
3757 flush_pending_stmts (e);
3759 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3761 /* Kill off IFN_LOOP_VECTORIZED calls in the copy; nobody will
3762 reap those otherwise, and they also refer to the original
3763 loops. */
3764 class loop *l = loop;
3765 while (gimple *call = vect_loop_vectorized_call (l))
3767 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3768 fold_loop_internal_call (call, boolean_false_node);
3769 l = loop_outer (l);
3771 free_original_copy_tables ();
3773 if (cond_expr_stmt_list)
3775 cond_exp_gsi = gsi_last_bb (condition_bb);
3776 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3777 GSI_SAME_STMT);
3780 /* Loop versioning violates an assumption we try to maintain during
3781 vectorization - that the loop exit block has a single predecessor.
3782 After versioning, the exit block of both loop versions is the same
3783 basic block (i.e. it has two predecessors). Just in order to simplify
3784 following transformations in the vectorizer, we fix this situation
3785 here by adding a new (empty) block on the exit-edge of the loop,
3786 with the proper loop-exit phis to maintain loop-closed-form.
3787 If loop versioning wasn't done from loop, but from scalar_loop instead,
3788 merge_bb will already have just a single predecessor. */
3790 merge_bb = single_exit (loop_to_version)->dest;
3791 if (EDGE_COUNT (merge_bb->preds) >= 2)
3793 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3794 new_exit_bb = split_edge (single_exit (loop_to_version));
3795 new_exit_e = single_exit (loop_to_version);
3796 e = EDGE_SUCC (new_exit_bb, 0);
3798 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3799 gsi_next (&gsi))
3801 tree new_res;
3802 orig_phi = gsi.phi ();
3803 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3804 new_phi = create_phi_node (new_res, new_exit_bb);
3805 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3806 add_phi_arg (new_phi, arg, new_exit_e,
3807 gimple_phi_arg_location_from_edge (orig_phi, e));
3808 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3812 update_ssa (TODO_update_ssa_no_phi);
3815 /* Split the cost model check off to a separate BB. Costing assumes
3816 this is the only thing we perform when we enter the scalar loop
3817 from a failed cost decision. */
3818 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3820 gimple *def = SSA_NAME_DEF_STMT (cost_name);
3821 gcc_assert (gimple_bb (def) == condition_bb);
3822 /* All uses of the cost check are 'true' after the check we
3823 are going to insert. */
3824 replace_uses_by (cost_name, boolean_true_node);
3825 /* And we're going to build the new single use of it. */
3826 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3827 NULL_TREE, NULL_TREE);
3828 edge e = split_block (gimple_bb (def), def);
3829 gimple_stmt_iterator gsi = gsi_for_stmt (def);
3830 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3831 edge true_e, false_e;
3832 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3833 e->flags &= ~EDGE_FALLTHRU;
3834 e->flags |= EDGE_TRUE_VALUE;
3835 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3836 e->probability = prob2;
3837 e2->probability = prob2.invert ();
3838 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3839 auto_vec<basic_block, 3> adj;
3840 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3841 son;
3842 son = next_dom_son (CDI_DOMINATORS, son))
3843 if (EDGE_COUNT (son->preds) > 1)
3844 adj.safe_push (son);
3845 for (auto son : adj)
3846 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
3849 if (version_niter)
3851 /* The versioned loop could be infinite, so we need to clear the existing
3852 niter information, which was copied from the original loop. */
3853 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3854 vect_free_loop_info_assumptions (nloop);
3857 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3858 && dump_enabled_p ())
3860 if (version_alias)
3861 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3862 vect_location,
3863 "loop versioned for vectorization because of "
3864 "possible aliasing\n");
3865 if (version_align)
3866 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3867 vect_location,
3868 "loop versioned for vectorization to enhance "
3869 "alignment\n");
3873 return nloop;
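/* Illustrative sketch (added for exposition, not part of the vectorizer):
   at the source level, vect_loop_versioning amounts to the shape below.
   COND stands for the AND of whichever cost-model, alignment, alias,
   niter-assumption and simd-if checks were built above; the guarded copy
   is the one that will later be vectorized, and the other copy ("nloop",
   the return value) stays scalar.  */
#if 0
void
example_versioned_form (int *a, int *b, int n, bool cond)
{
  if (cond)
    for (int i = 0; i < n; i++)	/* version that gets vectorized */
      a[i] = b[i] + 1;
  else
    for (int i = 0; i < n; i++)	/* scalar fallback version */
      a[i] = b[i] + 1;
}
#endif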