gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53 #include "langhooks.h"
54 #include "tree-vector-builder.h"
55 #include "optabs-tree.h"
57 /*************************************************************************
58 Simple Loop Peeling Utilities
60 Utilities to support loop peeling for vectorization purposes.
61 *************************************************************************/
64 /* Renames the use *OP_P. */
66 static void
67 rename_use_op (use_operand_p op_p)
69 tree new_name;
71 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
72 return;
74 new_name = get_current_def (USE_FROM_PTR (op_p));
76 /* Something defined outside of the loop. */
77 if (!new_name)
78 return;
80 /* An ordinary ssa name defined in the loop. */
82 SET_USE (op_p, new_name);
86 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
87 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
88 true. */
90 static void
91 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
93 gimple *stmt;
94 use_operand_p use_p;
95 ssa_op_iter iter;
96 edge e;
97 edge_iterator ei;
98 class loop *loop = bb->loop_father;
99 class loop *outer_loop = NULL;
101 if (rename_from_outer_loop)
103 gcc_assert (loop);
104 outer_loop = loop_outer (loop);
107 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
108 gsi_next (&gsi))
110 stmt = gsi_stmt (gsi);
111 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
112 rename_use_op (use_p);
115 FOR_EACH_EDGE (e, ei, bb->preds)
117 if (!flow_bb_inside_loop_p (loop, e->src))
119 if (!rename_from_outer_loop)
120 continue;
121 if (e->src != outer_loop->header)
123 if (outer_loop->inner->next)
125 /* If outer_loop has 2 inner loops, allow there to
126 be an extra basic block which decides which of the
127 two loops to use using LOOP_VECTORIZED. */
128 if (!single_pred_p (e->src)
129 || single_pred (e->src) != outer_loop->header)
130 continue;
134 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
135 gsi_next (&gsi))
136 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
141 struct adjust_info
143 tree from, to;
144 basic_block bb;
147 /* A stack of values to be adjusted in debug stmts. We have to
148 process them LIFO, so that the closest substitution applies. If we
149 processed them FIFO, without the stack, we might substitute uses
150 with a PHI DEF that would soon become non-dominant, and when we got
151 to the suitable one, it wouldn't have anything to substitute any
152 more. */
153 static vec<adjust_info, va_heap> adjust_vec;
155 /* Adjust any debug stmts that referenced AI->from values to use the
156 loop-closed AI->to, if the references are dominated by AI->bb and
157 not by the definition of AI->from. */
159 static void
160 adjust_debug_stmts_now (adjust_info *ai)
162 basic_block bbphi = ai->bb;
163 tree orig_def = ai->from;
164 tree new_def = ai->to;
165 imm_use_iterator imm_iter;
166 gimple *stmt;
167 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
169 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
171 /* Adjust any debug stmts that held onto non-loop-closed
172 references. */
173 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
175 use_operand_p use_p;
176 basic_block bbuse;
178 if (!is_gimple_debug (stmt))
179 continue;
181 gcc_assert (gimple_debug_bind_p (stmt));
183 bbuse = gimple_bb (stmt);
185 if ((bbuse == bbphi
186 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
187 && !(bbuse == bbdef
188 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
190 if (new_def)
191 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
192 SET_USE (use_p, new_def);
193 else
195 gimple_debug_bind_reset_value (stmt);
196 update_stmt (stmt);
202 /* Adjust debug stmts as scheduled before. */
204 static void
205 adjust_vec_debug_stmts (void)
207 if (!MAY_HAVE_DEBUG_BIND_STMTS)
208 return;
210 gcc_assert (adjust_vec.exists ());
212 while (!adjust_vec.is_empty ())
214 adjust_debug_stmts_now (&adjust_vec.last ());
215 adjust_vec.pop ();
219 /* Adjust any debug stmts that referenced FROM values to use the
220 loop-closed TO, if the references are dominated by BB and not by
221 the definition of FROM. If adjust_vec is non-NULL, adjustments
222 will be postponed until adjust_vec_debug_stmts is called. */
224 static void
225 adjust_debug_stmts (tree from, tree to, basic_block bb)
227 adjust_info ai;
229 if (MAY_HAVE_DEBUG_BIND_STMTS
230 && TREE_CODE (from) == SSA_NAME
231 && ! SSA_NAME_IS_DEFAULT_DEF (from)
232 && ! virtual_operand_p (from))
234 ai.from = from;
235 ai.to = to;
236 ai.bb = bb;
238 if (adjust_vec.exists ())
239 adjust_vec.safe_push (ai);
240 else
241 adjust_debug_stmts_now (&ai);
245 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
246 to adjust any debug stmts that referenced the old phi arg,
247 presumably non-loop-closed references left over from other
248 transformations. */
250 static void
251 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
253 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
255 gcc_assert (TREE_CODE (orig_def) != SSA_NAME
256 || orig_def != new_def);
258 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
260 if (MAY_HAVE_DEBUG_BIND_STMTS)
261 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
262 gimple_bb (update_phi));
265 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
266 that the control should have during the first iteration and NEXT_CTRL is the
267 value that it should have on subsequent iterations. */
269 static void
270 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
271 tree next_ctrl)
273 gphi *phi = create_phi_node (ctrl, loop->header);
274 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
275 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
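/* Illustrative sketch (not from the original sources): for a mask rgroup
   the function above leaves a header PHI of the form

     mask_1 = PHI <init_mask (preheader), next_mask (latch)>

   so the first vector iteration uses INIT_CTRL and every later iteration
   uses the NEXT_CTRL value computed near the exit test.  */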
278 /* Add SEQ to the end of LOOP's preheader block. */
280 static void
281 add_preheader_seq (class loop *loop, gimple_seq seq)
283 if (seq)
285 edge pe = loop_preheader_edge (loop);
286 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
287 gcc_assert (!new_bb);
291 /* Add SEQ to the beginning of LOOP's header block. */
293 static void
294 add_header_seq (class loop *loop, gimple_seq seq)
296 if (seq)
298 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
299 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
303 /* Return true if the target can interleave elements of two vectors.
304 OFFSET is 0 if the first half of the vectors should be interleaved
305 or 1 if the second half should. When returning true, store the
306 associated permutation in INDICES. */
308 static bool
309 interleave_supported_p (vec_perm_indices *indices, tree vectype,
310 unsigned int offset)
312 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
313 poly_uint64 base = exact_div (nelts, 2) * offset;
314 vec_perm_builder sel (nelts, 2, 3);
315 for (unsigned int i = 0; i < 3; ++i)
317 sel.quick_push (base + i);
318 sel.quick_push (base + i + nelts);
320 indices->new_vector (sel, 2, nelts);
321 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
322 *indices);
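/* Worked instance for the permutation above (values assumed): with 8
   elements and OFFSET == 0 the encoded series expands to
   { 0, 8, 1, 9, 2, 10, 3, 11 }, i.e. the low-half interleave of the two
   inputs; OFFSET == 1 starts at BASE == 4 and gives
   { 4, 12, 5, 13, 6, 14, 7, 15 } for the high halves.  */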
325 /* Try to use permutes to define the masks in DEST_RGM using the masks
326 in SRC_RGM, given that the former has twice as many masks as the
327 latter. Return true on success, adding any new statements to SEQ. */
329 static bool
330 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
331 rgroup_controls *src_rgm)
333 tree src_masktype = src_rgm->type;
334 tree dest_masktype = dest_rgm->type;
335 machine_mode src_mode = TYPE_MODE (src_masktype);
336 insn_code icode1, icode2;
337 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
338 && (icode1 = optab_handler (vec_unpacku_hi_optab,
339 src_mode)) != CODE_FOR_nothing
340 && (icode2 = optab_handler (vec_unpacku_lo_optab,
341 src_mode)) != CODE_FOR_nothing)
343 /* Unpacking the source masks gives at least as many mask bits as
344 we need. We can then VIEW_CONVERT any excess bits away. */
345 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
346 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
347 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
348 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
350 tree src = src_rgm->controls[i / 2];
351 tree dest = dest_rgm->controls[i];
352 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
353 ? VEC_UNPACK_HI_EXPR
354 : VEC_UNPACK_LO_EXPR);
355 gassign *stmt;
356 if (dest_masktype == unpack_masktype)
357 stmt = gimple_build_assign (dest, code, src);
358 else
360 tree temp = make_ssa_name (unpack_masktype);
361 stmt = gimple_build_assign (temp, code, src);
362 gimple_seq_add_stmt (seq, stmt);
363 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
364 build1 (VIEW_CONVERT_EXPR,
365 dest_masktype, temp));
367 gimple_seq_add_stmt (seq, stmt);
369 return true;
371 vec_perm_indices indices[2];
372 if (dest_masktype == src_masktype
373 && interleave_supported_p (&indices[0], src_masktype, 0)
374 && interleave_supported_p (&indices[1], src_masktype, 1))
376 /* The destination requires twice as many mask bits as the source, so
377 we can use interleaving permutes to double up the number of bits. */
378 tree masks[2];
379 for (unsigned int i = 0; i < 2; ++i)
380 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
381 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
383 tree src = src_rgm->controls[i / 2];
384 tree dest = dest_rgm->controls[i];
385 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
386 src, src, masks[i & 1]);
387 gimple_seq_add_stmt (seq, stmt);
389 return true;
391 return false;
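/* Illustrative effect of the interleave path above (sizes assumed): if
   SRC_RGM has a single 8-element mask m and DEST_RGM needs two masks of
   the same type, the permutes produce

     dest0 = VEC_PERM <m, m, { 0, 8, 1, 9, 2, 10, 3, 11 }>
	   = { m0, m0, m1, m1, m2, m2, m3, m3 }
     dest1 = VEC_PERM <m, m, { 4, 12, 5, 13, 6, 14, 7, 15 }>
	   = { m4, m4, m5, m5, m6, m6, m7, m7 }

   so each source mask bit is duplicated into two adjacent destination
   bits.  */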
394 /* Populate DEST_RGM->controls, given that they should add up to STEP.
396 STEP = MIN_EXPR <ivtmp_34, VF>;
398 First length (MIN (X, VF/N)):
399 loop_len_15 = MIN_EXPR <STEP, VF/N>;
401 Second length:
402 tmp = STEP - loop_len_15;
403 loop_len_16 = MIN (tmp, VF/N);
405 Third length:
406 tmp2 = tmp - loop_len_16;
407 loop_len_17 = MIN (tmp2, VF/N);
409 Last length:
410 loop_len_18 = tmp2 - loop_len_17;
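/* Numeric illustration (values assumed, not from the original comment):
   with VF = 16, four controls (so VF/N = 4) and STEP = 13 the code below
   produces 4, 4, 4 and finally 13 - 4 - 4 - 4 = 1, which sums back to
   STEP.  */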
413 static void
414 vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
415 rgroup_controls *dest_rgm, tree step)
417 tree ctrl_type = dest_rgm->type;
418 poly_uint64 nitems_per_ctrl
419 = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
420 tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
422 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
424 tree ctrl = dest_rgm->controls[i];
425 if (i == 0)
427 /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
428 gassign *assign
429 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
430 gimple_seq_add_stmt (seq, assign);
432 else if (i == dest_rgm->controls.length () - 1)
434 /* Last iteration: Remain capped to the range [0, VF/N]. */
435 gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
436 dest_rgm->controls[i - 1]);
437 gimple_seq_add_stmt (seq, assign);
439 else
441 /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
442 step = gimple_build (seq, MINUS_EXPR, iv_type, step,
443 dest_rgm->controls[i - 1]);
444 gassign *assign
445 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
446 gimple_seq_add_stmt (seq, assign);
451 /* Stores to BSI the standard position for the induction variable increment
452    belonging to LOOP_EXIT (just before the exit condition of the given exit).
453    INSERT_AFTER is set to true if the increment should be inserted after
454    *BSI.  */
456 void
457 vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
458 bool *insert_after)
460 basic_block bb = loop_exit->src;
461 *bsi = gsi_last_bb (bb);
462 *insert_after = false;
465 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
466 for all the rgroup controls in RGC and return a control that is nonzero
467 when the loop needs to iterate. Add any new preheader statements to
468 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
470 RGC belongs to loop LOOP. The loop originally iterated NITERS
471 times and has been vectorized according to LOOP_VINFO.
473 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
474 starts with NITERS_SKIP dummy iterations of the scalar loop before
475 the real work starts. The mask elements for these dummy iterations
476 must be 0, to ensure that the extra iterations do not have an effect.
478 It is known that:
480 NITERS * RGC->max_nscalars_per_iter * RGC->factor
482 does not overflow. However, MIGHT_WRAP_P says whether an induction
483 variable that starts at 0 and has step:
485 VF * RGC->max_nscalars_per_iter * RGC->factor
487 might overflow before hitting a value above:
489 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
491 This means that we cannot guarantee that such an induction variable
492 would ever hit a value that produces a set of all-false masks or zero
493 lengths for RGC.
495 Note: the cost of the code generated by this function is modeled
496 by vect_estimate_min_profitable_iters, so changes here may need
497 corresponding changes there. */
499 static tree
500 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
501 gimple_seq *preheader_seq,
502 gimple_seq *header_seq,
503 gimple_stmt_iterator loop_cond_gsi,
504 rgroup_controls *rgc, tree niters,
505 tree niters_skip, bool might_wrap_p,
506 tree *iv_step, tree *compare_step)
508 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
509 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
510 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
512 tree ctrl_type = rgc->type;
513 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
514 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
515 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
516 tree length_limit = NULL_TREE;
517 /* For lengths, we need length_limit to keep each length in range. */
518 if (!use_masks_p)
519 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
521 /* Calculate the maximum number of item values that the rgroup
522 handles in total, the number that it handles for each iteration
523 of the vector loop, and the number that it should skip during the
524 first iteration of the vector loop. */
525 tree nitems_total = niters;
526 tree nitems_step = build_int_cst (iv_type, vf);
527 tree nitems_skip = niters_skip;
528 if (nitems_per_iter != 1)
530 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
531 these multiplications don't overflow. */
532 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
533 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
534 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
535 nitems_total, compare_factor);
536 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
537 nitems_step, iv_factor);
538 if (nitems_skip)
539 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
540 nitems_skip, compare_factor);
543 /* Create an induction variable that counts the number of items
544 processed. */
545 tree index_before_incr, index_after_incr;
546 gimple_stmt_iterator incr_gsi;
547 bool insert_after;
548 edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
549 vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
550 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
552 /* Create an IV that counts down from niters_total and whose step
553 is the (variable) amount processed in the current iteration:
555 _10 = (unsigned long) count_12(D);
557 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
558 _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
560 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
562 ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
564 if (ivtmp_9 > POLY_INT_CST [4, 4])
565 goto <bb 4>; [83.33%]
566 else
567 goto <bb 5>; [16.67%]
569 nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
570 tree step = rgc->controls.length () == 1 ? rgc->controls[0]
571 : make_ssa_name (iv_type);
572 /* Create decrement IV. */
573 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
575 create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
576 insert_after, &index_before_incr, &index_after_incr);
577 tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
578 index_before_incr, nitems_step);
579 gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
581 else
583 create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
584 &incr_gsi, insert_after, &index_before_incr,
585 &index_after_incr);
586 gimple_seq_add_stmt (header_seq,
587 gimple_build_assign (step, MIN_EXPR,
588 index_before_incr,
589 nitems_step));
591 *iv_step = step;
592 *compare_step = nitems_step;
593 return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
594 : index_before_incr;
597 /* Create increment IV. */
598 create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
599 loop, &incr_gsi, insert_after, &index_before_incr,
600 &index_after_incr);
602 tree zero_index = build_int_cst (compare_type, 0);
603 tree test_index, test_limit, first_limit;
604 gimple_stmt_iterator *test_gsi;
605 if (might_wrap_p)
607 /* In principle the loop should stop iterating once the incremented
608 IV reaches a value greater than or equal to:
610 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
612 However, there's no guarantee that this addition doesn't overflow
613 the comparison type, or that the IV hits a value above it before
614 wrapping around. We therefore adjust the limit down by one
615 IV step:
617 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
618 -[infinite-prec] NITEMS_STEP
620 and compare the IV against this limit _before_ incrementing it.
621 Since the comparison type is unsigned, we actually want the
622 subtraction to saturate at zero:
624 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
625 -[sat] NITEMS_STEP
627 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
629 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
631 where the rightmost subtraction can be done directly in
632 COMPARE_TYPE. */
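	 /* Numeric illustration of the reassociation (values assumed): with
	    NITEMS_STEP = 8 and NITEMS_SKIP = 2 the code computes
	    ADJUST = 8 - 2 = 6 and TEST_LIMIT = MAX (NITEMS_TOTAL, 6) - 6,
	    which equals NITEMS_TOTAL + 2 - 8 whenever NITEMS_TOTAL >= 6,
	    without ever forming the possibly-overflowing sum
	    NITEMS_TOTAL + NITEMS_SKIP.  */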
633 test_index = index_before_incr;
634 tree adjust = gimple_convert (preheader_seq, compare_type,
635 nitems_step);
636 if (nitems_skip)
637 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
638 adjust, nitems_skip);
639 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
640 nitems_total, adjust);
641 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
642 test_limit, adjust);
643 test_gsi = &incr_gsi;
645 /* Get a safe limit for the first iteration. */
646 if (nitems_skip)
648 /* The first vector iteration can handle at most NITEMS_STEP
649 items. NITEMS_STEP <= CONST_LIMIT, and adding
650 NITEMS_SKIP to that cannot overflow. */
651 tree const_limit = build_int_cst (compare_type,
652 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
653 * nitems_per_iter);
654 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
655 nitems_total, const_limit);
656 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
657 first_limit, nitems_skip);
659 else
660 /* For the first iteration it doesn't matter whether the IV hits
661 a value above NITEMS_TOTAL. That only matters for the latch
662 condition. */
663 first_limit = nitems_total;
665 else
667 /* Test the incremented IV, which will always hit a value above
668 the bound before wrapping. */
669 test_index = index_after_incr;
670 test_limit = nitems_total;
671 if (nitems_skip)
672 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
673 test_limit, nitems_skip);
674 test_gsi = &loop_cond_gsi;
676 first_limit = test_limit;
679 /* Convert the IV value to the comparison type (either a no-op or
680 a demotion). */
681 gimple_seq test_seq = NULL;
682 test_index = gimple_convert (&test_seq, compare_type, test_index);
683 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
685 /* Provide a definition of each control in the group. */
686 tree next_ctrl = NULL_TREE;
687 tree ctrl;
688 unsigned int i;
689 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
691 /* Previous controls will cover BIAS items. This control covers the
692 next batch. */
693 poly_uint64 bias = nitems_per_ctrl * i;
694 tree bias_tree = build_int_cst (compare_type, bias);
696 /* See whether the first iteration of the vector loop is known
697 to have a full control. */
698 poly_uint64 const_limit;
699 bool first_iteration_full
700 = (poly_int_tree_p (first_limit, &const_limit)
701 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
703 /* Rather than have a new IV that starts at BIAS and goes up to
704 TEST_LIMIT, prefer to use the same 0-based IV for each control
705 and adjust the bound down by BIAS. */
706 tree this_test_limit = test_limit;
707 if (i != 0)
709 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
710 compare_type, this_test_limit,
711 bias_tree);
712 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
713 compare_type, this_test_limit,
714 bias_tree);
717 /* Create the initial control. First include all items that
718 are within the loop limit. */
719 tree init_ctrl = NULL_TREE;
720 if (!first_iteration_full)
722 tree start, end;
723 if (first_limit == test_limit)
725 /* Use a natural test between zero (the initial IV value)
726 and the loop limit. The "else" block would be valid too,
727 but this choice can avoid the need to load BIAS_TREE into
728 a register. */
729 start = zero_index;
730 end = this_test_limit;
732 else
734 /* FIRST_LIMIT is the maximum number of items handled by the
735 first iteration of the vector loop. Test the portion
736 associated with this control. */
737 start = bias_tree;
738 end = first_limit;
741 if (use_masks_p)
742 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
743 start, end, "max_mask");
744 else
746 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
747 gimple_seq seq = vect_gen_len (init_ctrl, start,
748 end, length_limit);
749 gimple_seq_add_seq (preheader_seq, seq);
753 /* Now AND out the bits that are within the number of skipped
754 items. */
755 poly_uint64 const_skip;
756 if (nitems_skip
757 && !(poly_int_tree_p (nitems_skip, &const_skip)
758 && known_le (const_skip, bias)))
760 gcc_assert (use_masks_p);
761 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
762 bias_tree, nitems_skip);
763 if (init_ctrl)
764 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
765 init_ctrl, unskipped_mask);
766 else
767 init_ctrl = unskipped_mask;
770 if (!init_ctrl)
772 /* First iteration is full. */
773 if (use_masks_p)
774 init_ctrl = build_minus_one_cst (ctrl_type);
775 else
776 init_ctrl = length_limit;
779 /* Get the control value for the next iteration of the loop. */
780 if (use_masks_p)
782 gimple_seq stmts = NULL;
783 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
784 this_test_limit, "next_mask");
785 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
787 else
789 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
790 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
791 length_limit);
792 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
795 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
798 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
799 if (partial_load_bias != 0)
801 tree adjusted_len = rgc->bias_adjusted_ctrl;
802 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
803 rgc->controls[0],
804 build_int_cst
805 (TREE_TYPE (rgc->controls[0]),
806 partial_load_bias));
807 gimple_seq_add_stmt (header_seq, minus);
810 return next_ctrl;
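/* Illustrative summary of the code generated above for a single
   fully-masked rgroup (shapes assumed; on targets with while_ult
   patterns vect_gen_while expands to .WHILE_ULT):

     preheader:
       max_mask_1 = .WHILE_ULT (start, limit, { 0, ... });
     header:
       mask_2 = PHI <max_mask_1 (preheader), next_mask_3 (latch)>
       ...
       next_mask_3 = .WHILE_ULT (iv, limit, { 0, ... });

   The caller then compares the returned control against zero to decide
   whether to branch back.  */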
813 /* Set up the iteration condition and rgroup controls for LOOP, given
814 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
815 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
816 the number of iterations of the original scalar loop that should be
817 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
818 for vect_set_loop_condition.
820 Insert the branch-back condition before LOOP_COND_GSI and return the
821 final gcond. */
823 static gcond *
824 vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
825 loop_vec_info loop_vinfo, tree niters,
826 tree final_iv, bool niters_maybe_zero,
827 gimple_stmt_iterator loop_cond_gsi)
829 gimple_seq preheader_seq = NULL;
830 gimple_seq header_seq = NULL;
832 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
833 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
834 unsigned int compare_precision = TYPE_PRECISION (compare_type);
835 tree orig_niters = niters;
837 /* Type of the initial value of NITERS. */
838 tree ni_actual_type = TREE_TYPE (niters);
839 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
840 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
841 if (niters_skip)
842 niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
844 /* Convert NITERS to the same size as the compare. */
845 if (compare_precision > ni_actual_precision
846 && niters_maybe_zero)
848 /* We know that there is always at least one iteration, so if the
849 count is zero then it must have wrapped. Cope with this by
850 subtracting 1 before the conversion and adding 1 to the result. */
851 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
852 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
853 niters, build_minus_one_cst (ni_actual_type));
854 niters = gimple_convert (&preheader_seq, compare_type, niters);
855 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
856 niters, build_one_cst (compare_type));
858 else
859 niters = gimple_convert (&preheader_seq, compare_type, niters);
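  /* Illustrative case for the conversion above (widths assumed): if NITERS
     is an 8-bit value of 0 that stands for 256 iterations and COMPARE_TYPE
     is 32 bits wide, the sequence computes (0 + 255) = 255 in 8 bits,
     widens it to 255 and adds 1, recovering 256.  */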
861 /* Iterate over all the rgroups and fill in their controls. We could use
862 the first control from any rgroup for the loop condition; here we
863 arbitrarily pick the last. */
864 tree test_ctrl = NULL_TREE;
865 tree iv_step = NULL_TREE;
866 tree compare_step = NULL_TREE;
867 rgroup_controls *rgc;
868 rgroup_controls *iv_rgc = nullptr;
869 unsigned int i;
870 auto_vec<rgroup_controls> *controls = use_masks_p
871 ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
872 : &LOOP_VINFO_LENS (loop_vinfo);
873 FOR_EACH_VEC_ELT (*controls, i, rgc)
874 if (!rgc->controls.is_empty ())
876 /* First try using permutes. This adds a single vector
877 instruction to the loop for each mask, but needs no extra
878 loop invariants or IVs. */
879 unsigned int nmasks = i + 1;
880 if (use_masks_p && (nmasks & 1) == 0)
882 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
883 if (!half_rgc->controls.is_empty ()
884 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
885 continue;
888 if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
889 || !iv_rgc
890 || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
891 != rgc->max_nscalars_per_iter * rgc->factor))
893 /* See whether a zero-based IV would ever generate all-false masks
894 or zero lengths before wrapping around. */
895 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
897 /* Set up all controls for this group. */
898 test_ctrl
899 = vect_set_loop_controls_directly (loop, loop_vinfo,
900 &preheader_seq, &header_seq,
901 loop_cond_gsi, rgc, niters,
902 niters_skip, might_wrap_p,
903 &iv_step, &compare_step);
905 iv_rgc = rgc;
908 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
909 && rgc->controls.length () > 1)
911 /* vect_set_loop_controls_directly creates an IV whose step
912 is equal to the expected sum of RGC->controls. Use that
913 information to populate RGC->controls. */
914 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
915 gcc_assert (iv_step);
916 vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
920 /* Emit all accumulated statements. */
921 add_preheader_seq (loop, preheader_seq);
922 add_header_seq (loop, header_seq);
924 /* Get a boolean result that tells us whether to iterate. */
925 gcond *cond_stmt;
926 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
927 && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
929 gcc_assert (compare_step);
930 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
931 cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
932 NULL_TREE);
934 else
936 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
937 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
938 cond_stmt
939 = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
941 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
943 /* The loop iterates (NITERS - 1) / VF + 1 times.
944 Subtract one from this to get the latch count. */
945 tree step = build_int_cst (compare_type,
946 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
947 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
948 build_minus_one_cst (compare_type));
949 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
950 niters_minus_one, step);
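  /* E.g. (values assumed) NITERS = 10 and VF = 4 give
     nb_iterations = (10 - 1) / 4 = 2 latch executions, i.e. the three
     vector iterations predicted by (NITERS - 1) / VF + 1.  */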
952 if (final_iv)
954 gassign *assign;
955 /* If vectorizing an inverted early break loop we have to restart the
956 scalar loop at niters - vf. This matches what we do in
957 vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
958 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
960 tree ftype = TREE_TYPE (orig_niters);
961 tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
962 assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
964 else
965 assign = gimple_build_assign (final_iv, orig_niters);
966 gsi_insert_on_edge_immediate (exit_edge, assign);
969 return cond_stmt;
972 /* Set up the iteration condition and rgroup controls for LOOP in AVX512
973 style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
974 vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
975 the number of iterations of the original scalar loop that should be
976 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
977 for vect_set_loop_condition.
979 Insert the branch-back condition before LOOP_COND_GSI and return the
980 final gcond. */
982 static gcond *
983 vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
984 edge exit_edge,
985 loop_vec_info loop_vinfo, tree niters,
986 tree final_iv,
987 bool niters_maybe_zero,
988 gimple_stmt_iterator loop_cond_gsi)
990 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
991 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
992 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
993 tree orig_niters = niters;
994 gimple_seq preheader_seq = NULL;
996 /* Create an IV that counts down from niters and whose step
997 is the number of iterations processed in the current iteration.
998 Produce the controls with compares like the following.
1000 # iv_2 = PHI <niters, iv_3>
1001 rem_4 = MIN <iv_2, VF>;
1002 remv_6 = { rem_4, rem_4, rem_4, ... }
1003 mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv6;
1004 iv_3 = iv_2 - VF;
1005 if (iv_2 > VF)
1006 continue;
1008 Where the constant is built with elements at most VF - 1 and
1009 repetitions according to max_nscalars_per_iter which is guaranteed
1010 to be the same within a group. */
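  /* Assumed concrete instance: for a mask rgroup with
     max_nscalars_per_iter == 2 over an 8-lane vector the constant built
     below is { 0, 0, 1, 1, 2, 2, 3, 3 }; with two scalar iterations
     remaining, the compare against { 2, 2, ... } activates exactly the
     four lanes belonging to those two iterations.  */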
1012 /* Convert NITERS to the determined IV type. */
1013 if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1014 && niters_maybe_zero)
1016 /* We know that there is always at least one iteration, so if the
1017 count is zero then it must have wrapped. Cope with this by
1018 subtracting 1 before the conversion and adding 1 to the result. */
1019 gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1020 niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1021 niters, build_minus_one_cst (TREE_TYPE (niters)));
1022 niters = gimple_convert (&preheader_seq, iv_type, niters);
1023 niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1024 niters, build_one_cst (iv_type));
1026 else
1027 niters = gimple_convert (&preheader_seq, iv_type, niters);
1029 /* Bias the initial value of the IV in case we need to skip iterations
1030 at the beginning. */
1031 tree niters_adj = niters;
1032 if (niters_skip)
1034 tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1035 niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1036 iv_type, niters, skip);
1039 /* The iteration step is the vectorization factor. */
1040 tree iv_step = build_int_cst (iv_type, vf);
1042 /* Create the decrement IV. */
1043 tree index_before_incr, index_after_incr;
1044 gimple_stmt_iterator incr_gsi;
1045 bool insert_after;
1046 vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1047 create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1048 &incr_gsi, insert_after, &index_before_incr,
1049 &index_after_incr);
1051 /* Iterate over all the rgroups and fill in their controls. */
1052 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1054 if (rgc.controls.is_empty ())
1055 continue;
1057 tree ctrl_type = rgc.type;
1058 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1060 tree vectype = rgc.compare_type;
1062 /* index_after_incr is the IV specifying the remaining iterations in
1063 the next iteration. */
1064 tree rem = index_after_incr;
1065 /* When the data type for the compare to produce the mask is
1066 smaller than the IV type we need to saturate. Saturate to
1067 the smallest possible value (IV_TYPE) so we only have to
1068 saturate once (CSE will catch redundant ones we add). */
1069 if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1070 rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1071 UNKNOWN_LOCATION,
1072 MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1073 rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1074 UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1076 /* Build a data vector composed of the remaining iterations. */
1077 rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1078 UNKNOWN_LOCATION, vectype, rem);
1080 /* Provide a definition of each vector in the control group. */
1081 tree next_ctrl = NULL_TREE;
1082 tree first_rem = NULL_TREE;
1083 tree ctrl;
1084 unsigned int i;
1085 FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1087 /* Previous controls will cover BIAS items. This control covers the
1088 next batch. */
1089 poly_uint64 bias = nitems_per_ctrl * i;
1091 /* Build the constant to compare the remaining iters against,
1092 this is something like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1093 split into pieces. */
1094 unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1095 tree_vector_builder builder (vectype, n, 1);
1096 for (unsigned i = 0; i < n; ++i)
1098 unsigned HOST_WIDE_INT val
1099 = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1100 gcc_assert (val < vf.to_constant ());
1101 builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1103 tree cmp_series = builder.build ();
1105 /* Create the initial control. First include all items that
1106 are within the loop limit. */
1107 tree init_ctrl = NULL_TREE;
1108 poly_uint64 const_limit;
1109 /* See whether the first iteration of the vector loop is known
1110 to have a full control. */
1111 if (poly_int_tree_p (niters, &const_limit)
1112 && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1113 init_ctrl = build_minus_one_cst (ctrl_type);
1114 else
1116 /* The remaining work items initially are niters. Saturate,
1117 splat and compare. */
1118 if (!first_rem)
1120 first_rem = niters;
1121 if (TYPE_PRECISION (TREE_TYPE (vectype))
1122 < TYPE_PRECISION (iv_type))
1123 first_rem = gimple_build (&preheader_seq,
1124 MIN_EXPR, TREE_TYPE (first_rem),
1125 first_rem, iv_step);
1126 first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1127 first_rem);
1128 first_rem = gimple_build_vector_from_val (&preheader_seq,
1129 vectype, first_rem);
1131 init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1132 cmp_series, first_rem);
1135 /* Now AND out the bits that are within the number of skipped
1136 items. */
1137 poly_uint64 const_skip;
1138 if (niters_skip
1139 && !(poly_int_tree_p (niters_skip, &const_skip)
1140 && known_le (const_skip, bias)))
1142 /* For integer mode masks it's cheaper to shift out the bits
1143 since that avoids loading a constant. */
1144 gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1145 init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1146 lang_hooks.types.type_for_mode
1147 (TYPE_MODE (ctrl_type), 1),
1148 init_ctrl);
1149 /* ??? But when the shift amount isn't constant this requires
1150 a round-trip to GPRs. We could apply the bias to either
1151 side of the compare instead. */
1152 tree shift = gimple_build (&preheader_seq, MULT_EXPR,
1153 TREE_TYPE (niters_skip), niters_skip,
1154 build_int_cst (TREE_TYPE (niters_skip),
1155 rgc.max_nscalars_per_iter));
1156 init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1157 TREE_TYPE (init_ctrl),
1158 init_ctrl, shift);
1159 init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1160 ctrl_type, init_ctrl);
1163 /* Get the control value for the next iteration of the loop. */
1164 next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1165 UNKNOWN_LOCATION,
1166 LT_EXPR, ctrl_type, cmp_series, rem);
1168 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1172 /* Emit all accumulated statements. */
1173 add_preheader_seq (loop, preheader_seq);
1175 /* Adjust the exit test using the decrementing IV. */
1176 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1177 /* When we peel for alignment with niter_skip != 0 this can
1178 cause niter + niter_skip to wrap and since we are comparing the
1179 value before the decrement here we get a false early exit.
1180 We can't compare the value after the decrement either because that
1181 decrement could wrap as well, since we're not doing a saturating
1182 decrement. To avoid this situation we force a larger
1183 iv_type. */
1184 gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1185 NULL_TREE, NULL_TREE);
1186 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1188 /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1189 Subtract one from this to get the latch count. */
1190 tree niters_minus_one
1191 = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1192 build_minus_one_cst (TREE_TYPE (orig_niters)));
1193 tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1194 if (niters_skip)
1195 niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1196 fold_convert (iv_type, niters_skip));
1197 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1198 niters_adj2, iv_step);
1200 if (final_iv)
1202 gassign *assign;
1203 /* If vectorizing an inverted early break loop we have to restart the
1204 scalar loop at niters - vf. This matches what we do in
1205 vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
1206 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1208 tree ftype = TREE_TYPE (orig_niters);
1209 tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1210 assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1212 else
1213 assign = gimple_build_assign (final_iv, orig_niters);
1214 gsi_insert_on_edge_immediate (exit_edge, assign);
1217 return cond_stmt;
1221 /* Like vect_set_loop_condition, but handle the case in which the vector
1222 loop handles exactly VF scalars per iteration. */
1224 static gcond *
1225 vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1226 class loop *loop, tree niters, tree step,
1227 tree final_iv, bool niters_maybe_zero,
1228 gimple_stmt_iterator loop_cond_gsi)
1230 tree indx_before_incr, indx_after_incr;
1231 gcond *cond_stmt;
1232 gcond *orig_cond;
1233 edge pe = loop_preheader_edge (loop);
1234 gimple_stmt_iterator incr_gsi;
1235 bool insert_after;
1236 enum tree_code code;
1237 tree niters_type = TREE_TYPE (niters);
1239 orig_cond = get_loop_exit_condition (exit_edge);
1240 gcc_assert (orig_cond);
1241 loop_cond_gsi = gsi_for_stmt (orig_cond);
1243 tree init, limit;
1244 if (!niters_maybe_zero && integer_onep (step))
1246 /* In this case we can use a simple 0-based IV:
1249 x = 0;
1253 x += 1;
1255 while (x < NITERS); */
1256 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1257 init = build_zero_cst (niters_type);
1258 limit = niters;
1260 else
1262 /* The following works for all values of NITERS except 0:
1265 x = 0;
1269 x += STEP;
1271 while (x <= NITERS - STEP);
1273 so that the loop continues to iterate if x + STEP - 1 < NITERS
1274 but stops if x + STEP - 1 >= NITERS.
1276 However, if NITERS is zero, x never hits a value above NITERS - STEP
1277 before wrapping around. There are two obvious ways of dealing with
1278 this:
1280 - start at STEP - 1 and compare x before incrementing it
1281 - start at -1 and compare x after incrementing it
1283 The latter is simpler and is what we use. The loop in this case
1284 looks like:
1287 x = -1;
1291 x += STEP;
1293 while (x < NITERS - STEP);
1295 In both cases the loop limit is NITERS - STEP. */
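     /* Worked instance of case B (values assumed): NITERS = 8, STEP = 4.
	The limit is NITERS - STEP = 4, the incremented IV takes the values
	4 and 8, so the loop latches once and executes
	(8 - 4) / 4 + 1 = 2 times.  */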
1296 gimple_seq seq = NULL;
1297 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1298 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1299 if (seq)
1301 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1302 gcc_assert (!new_bb);
1304 if (niters_maybe_zero)
1306 /* Case C. */
1307 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1308 init = build_all_ones_cst (niters_type);
1310 else
1312 /* Case B. */
1313 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1314 init = build_zero_cst (niters_type);
1318 vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1319 create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1320 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1321 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1322 true, NULL_TREE, true,
1323 GSI_SAME_STMT);
1324 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1325 true, GSI_SAME_STMT);
1327 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1328 NULL_TREE);
1330 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1332 /* Record the number of latch iterations. */
1333 if (limit == niters)
1334 /* Case A: the loop iterates NITERS times. Subtract one to get the
1335 latch count. */
1336 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1337 build_int_cst (niters_type, 1));
1338 else
1339 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1340 Subtract one from this to get the latch count. */
1341 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1342 limit, step);
1344 if (final_iv)
1346 gassign *assign;
1347 gcc_assert (single_pred_p (exit_edge->dest));
1348 tree phi_dest
1349 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1350 /* Make sure to maintain LC SSA form here and elide the subtraction
1351 if the value is zero. */
1352 gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1353 add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1354 if (!integer_zerop (init))
1356 assign = gimple_build_assign (final_iv, MINUS_EXPR,
1357 phi_dest, init);
1358 gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1359 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1363 return cond_stmt;
1366 /* If we're using fully-masked loops, make LOOP iterate:
1368 N == (NITERS - 1) / STEP + 1
1370 times. When NITERS is zero, this is equivalent to making the loop
1371 execute (1 << M) / STEP times, where M is the precision of NITERS.
1372 NITERS_MAYBE_ZERO is true if this last case might occur.
1374 If we're not using fully-masked loops, make LOOP iterate:
1376 N == (NITERS - STEP) / STEP + 1
1378 times, where NITERS is known to be outside the range [1, STEP - 1].
1379 This is equivalent to making the loop execute NITERS / STEP times
1380 when NITERS is nonzero and (1 << M) / STEP times otherwise.
1381 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1383 If FINAL_IV is nonnull, it is an SSA name that should be set to
1384 N * STEP on exit from the loop.
1386 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
1388 void
1389 vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1390 tree niters, tree step, tree final_iv,
1391 bool niters_maybe_zero)
1393 gcond *cond_stmt;
1394 gcond *orig_cond = get_loop_exit_condition (loop_e);
1395 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1397 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1399 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1400 cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1401 loop_vinfo,
1402 niters, final_iv,
1403 niters_maybe_zero,
1404 loop_cond_gsi);
1405 else
1406 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1407 loop_vinfo,
1408 niters, final_iv,
1409 niters_maybe_zero,
1410 loop_cond_gsi);
1412 else
1413 cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1414 niters,
1415 step, final_iv,
1416 niters_maybe_zero,
1417 loop_cond_gsi);
1419 /* Remove old loop exit test. */
1420 stmt_vec_info orig_cond_info;
1421 if (loop_vinfo
1422 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1423 loop_vinfo->remove_stmt (orig_cond_info);
1424 else
1425 gsi_remove (&loop_cond_gsi, true);
1427 if (dump_enabled_p ())
1428 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1429 (gimple *) cond_stmt);
1432 /* Get the virtual operand live on E. The precondition on this is valid
1433 immediate dominators and an actual virtual definition dominating E. */
1434 /* ??? Costly band-aid. For the use in question we can populate a
1435 live-on-exit/end-of-BB virtual operand when copying stmts. */
1437 static tree
1438 get_live_virtual_operand_on_edge (edge e)
1440 basic_block bb = e->src;
1443 for (auto gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1445 gimple *stmt = gsi_stmt (gsi);
1446 if (gimple_vdef (stmt))
1447 return gimple_vdef (stmt);
1448 if (gimple_vuse (stmt))
1449 return gimple_vuse (stmt);
1451 if (gphi *vphi = get_virtual_phi (bb))
1452 return gimple_phi_result (vphi);
1453 bb = get_immediate_dominator (CDI_DOMINATORS, bb);
1455 while (1);
1458 /* Given LOOP this function generates a new copy of it and puts it
1459 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1460 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1461 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1462 entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1463 continuation. This is correct for cases where one loop continues from the
1464 other like in the vectorizer, but not true for uses in e.g. loop distribution
1465 where the contents of the loop body are split but the iteration space of both
1466 copies remains the same.
1468 If UPDATED_DOMS is not NULL it is updated with the list of basic blocks whose
1469 dominators were updated during the peeling. When doing early break vectorization
1470 LOOP_VINFO needs to be provided and is used to keep track of any newly created
1471 memory references that need to be updated should we decide to vectorize. */
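/* Conceptual sketch (not part of the original comment): with E being the
   loop exit the result is

     preheader -> LOOP -> new_preheader -> LOOP_COPY -> exit_dest

   i.e. an epilogue copy, whereas passing the preheader edge instead gives

     preheader -> LOOP_COPY -> LOOP -> exit_dest

   i.e. a prologue copy.  */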
1473 class loop *
1474 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1475 class loop *scalar_loop,
1476 edge scalar_exit, edge e, edge *new_e,
1477 bool flow_loops,
1478 vec<basic_block> *updated_doms)
1480 class loop *new_loop;
1481 basic_block *new_bbs, *bbs, *pbbs;
1482 bool at_exit;
1483 bool was_imm_dom;
1484 basic_block exit_dest;
1485 edge exit, new_exit;
1486 bool duplicate_outer_loop = false;
1488 exit = loop_exit;
1489 at_exit = (e == exit);
1490 if (!at_exit && e != loop_preheader_edge (loop))
1491 return NULL;
1493 if (scalar_loop == NULL)
1495 scalar_loop = loop;
1496 scalar_exit = loop_exit;
1498 else if (scalar_loop == loop)
1499 scalar_exit = loop_exit;
1500 else
1502 /* Loop has been versioned; match the exits up using the aux index. */
1503 for (edge exit : get_loop_exit_edges (scalar_loop))
1504 if (exit->aux == loop_exit->aux)
1506 scalar_exit = exit;
1507 break;
1510 gcc_assert (scalar_exit);
1513 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1514 pbbs = bbs + 1;
1515 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1516 /* Allow duplication of outer loops. */
1517 if (scalar_loop->inner)
1518 duplicate_outer_loop = true;
1520 /* Generate new loop structure. */
1521 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1522 duplicate_subloops (scalar_loop, new_loop);
1524 exit_dest = exit->dest;
1525 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1526 exit_dest) == exit->src ?
1527 true : false);
1529 /* Also copy the pre-header, this avoids jumping through hoops to
1530 duplicate the loop entry PHI arguments. Create an empty
1531 pre-header unconditionally for this. */
1532 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1533 edge entry_e = single_pred_edge (preheader);
1534 bbs[0] = preheader;
1535 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1537 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1538 &scalar_exit, 1, &new_exit, NULL,
1539 at_exit ? loop->latch : e->src, true);
1540 exit = loop_exit;
1541 basic_block new_preheader = new_bbs[0];
1543 gcc_assert (new_exit);
1545 /* Record the new loop exit information. new_loop doesn't have SCEV data and
1546 so we must initialize the exit information. */
1547 if (new_e)
1548 *new_e = new_exit;
1550 /* Before installing PHI arguments make sure that the edges
1551 into them match that of the scalar loop we analyzed. This
1552 makes sure the SLP tree matches up between the main vectorized
1553 loop and the epilogue vectorized copies. */
1554 if (single_succ_edge (preheader)->dest_idx
1555 != single_succ_edge (new_bbs[0])->dest_idx)
1557 basic_block swap_bb = new_bbs[1];
1558 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1559 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1560 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1561 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1563 if (duplicate_outer_loop)
1565 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1566 if (loop_preheader_edge (scalar_loop)->dest_idx
1567 != loop_preheader_edge (new_inner_loop)->dest_idx)
1569 basic_block swap_bb = new_inner_loop->header;
1570 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1571 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1572 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1573 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1577 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1579 /* Skip the new preheader since it's deleted if the loop copy is added at entry. */
1580 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1581 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1583 /* Rename the exit uses. */
1584 for (edge exit : get_loop_exit_edges (new_loop))
1585 for (auto gsi = gsi_start_phis (exit->dest);
1586 !gsi_end_p (gsi); gsi_next (&gsi))
1588 tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1589 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1590 if (MAY_HAVE_DEBUG_BIND_STMTS)
1591 adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1594 auto loop_exits = get_loop_exit_edges (loop);
1595 bool multiple_exits_p = loop_exits.length () > 1;
1596 auto_vec<basic_block> doms;
1598 if (at_exit) /* Add the loop copy at exit. */
1600 if (scalar_loop != loop && new_exit->dest != exit_dest)
1602 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1603 flush_pending_stmts (new_exit);
1606 bool need_virtual_phi = get_virtual_phi (loop->header);
1608 /* For the main loop exit preserve the LC PHI nodes. For vectorization
1609 we need them to continue or finalize reductions. Since we do not
1610 copy the loop exit blocks we have to materialize PHIs at the
1611 new destination before redirecting edges. */
1612 for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1613 !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1615 tree res = gimple_phi_result (*gsi_from);
1616 create_phi_node (copy_ssa_name (res), new_preheader);
1618 edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1619 gcc_assert (e == loop_exit);
1620 flush_pending_stmts (loop_exit);
1621 set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1623 /* If we ended up choosing an exit leading to a path not using memory
1624 we can end up without a virtual LC PHI. Create it when it is
1625 needed because of the epilog loop continuation. */
1626 if (need_virtual_phi && !get_virtual_phi (loop_exit->dest))
1628 tree header_def = gimple_phi_result (get_virtual_phi (loop->header));
1629 gphi *vphi = create_phi_node (copy_ssa_name (header_def),
1630 new_preheader);
1631 add_phi_arg (vphi, get_live_virtual_operand_on_edge (loop_exit),
1632 loop_exit, UNKNOWN_LOCATION);
1635 bool multiple_exits_p = loop_exits.length () > 1;
1636 basic_block main_loop_exit_block = new_preheader;
1637 basic_block alt_loop_exit_block = NULL;
1638 /* Create the CFG for multiple exits.
1639 | loop_exit | alt1 | altN
1640 v v ... v
1641 main_loop_exit_block: alt_loop_exit_block:
1644 new_preheader:
1645 where in the new preheader we need merge PHIs for
1646 the continuation values into the epilogue header.
1647 Do not bother with exit PHIs for the early exits, except for
1648 their live virtual operand. We'll fix things up below. */
1649 if (multiple_exits_p)
1651 edge loop_e = single_succ_edge (new_preheader);
1652 new_preheader = split_edge (loop_e);
1654 gphi *vphi = NULL;
1655 alt_loop_exit_block = new_preheader;
1656 for (auto exit : loop_exits)
1657 if (exit != loop_exit)
1659 tree vphi_def = NULL_TREE;
1660 if (gphi *evphi = get_virtual_phi (exit->dest))
1661 vphi_def = gimple_phi_arg_def_from_edge (evphi, exit);
1662 edge res = redirect_edge_and_branch (exit, alt_loop_exit_block);
1663 gcc_assert (res == exit);
1664 redirect_edge_var_map_clear (exit);
1665 if (alt_loop_exit_block == new_preheader)
1666 alt_loop_exit_block = split_edge (exit);
1667 if (!need_virtual_phi)
1668 continue;
1669 /* When the edge has no virtual LC PHI get at the live
1670 virtual operand by other means. */
1671 if (!vphi_def)
1672 vphi_def = get_live_virtual_operand_on_edge (exit);
1673 if (!vphi)
1674 vphi = create_phi_node (copy_ssa_name (vphi_def),
1675 alt_loop_exit_block);
1676 else
1677 /* Edge redirection might re-allocate the PHI node
1678 so we have to rediscover it. */
1679 vphi = get_virtual_phi (alt_loop_exit_block);
1680 add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1683 set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1684 loop->header);
1687 /* Adjust the epilog loop PHI entry values to continue iteration.
1688 This adds remaining necessary LC PHI nodes to the main exit
1689 and creates merge PHIs when we have multiple exits with
1690 their appropriate continuation. */
1691 if (flow_loops)
1693 edge loop_entry = single_succ_edge (new_preheader);
1694 bool peeled_iters = single_pred (loop->latch) != loop_exit->src;
1696 /* Record the new SSA names in the cache so that we can skip
1697 materializing them again when we fill in the rest of the LC SSA
1698 variables. */
1699 hash_map <tree, tree> new_phi_args;
1700 for (auto psi = gsi_start_phis (main_loop_exit_block);
1701 !gsi_end_p (psi); gsi_next (&psi))
1703 gphi *phi = *psi;
1704 tree new_arg = gimple_phi_arg_def_from_edge (phi, loop_exit);
1705 if (TREE_CODE (new_arg) != SSA_NAME)
1706 continue;
1708 /* If the loop doesn't have a virtual def then only possibly keep
1709 the epilog LC PHI for it and avoid creating new defs. */
1710 if (virtual_operand_p (new_arg) && !need_virtual_phi)
1712 auto gsi = gsi_for_stmt (phi);
1713 remove_phi_node (&gsi, true);
1714 continue;
1717 /* If we decided not to remove the PHI node we should also not
1718 rematerialize it later on. */
1719 new_phi_args.put (new_arg, gimple_phi_result (phi));
1722 /* Create the merge PHI nodes in new_preheader and populate the
1723 arguments for the exits. */
1724 if (multiple_exits_p)
1726 for (auto gsi_from = gsi_start_phis (loop->header),
1727 gsi_to = gsi_start_phis (new_loop->header);
1728 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1729 gsi_next (&gsi_from), gsi_next (&gsi_to))
1731 gimple *from_phi = gsi_stmt (gsi_from);
1732 gimple *to_phi = gsi_stmt (gsi_to);
1734 /* When the vector loop is peeled then we need to use the
1735 value at start of the loop, otherwise the main loop exit
1736 should use the final iter value. */
1737 tree new_arg;
1738 if (peeled_iters)
1739 new_arg = gimple_phi_result (from_phi);
1740 else
1741 new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1742 loop_latch_edge (loop));
1744 /* Check if we've already created a new phi node during edge
1745 redirection and re-use it if so. Otherwise create a
1746 LC PHI node to feed the merge PHI. */
1747 tree *res;
1748 if (virtual_operand_p (new_arg))
1750 /* Use the existing virtual LC SSA from exit block. */
1751 gphi *vphi = get_virtual_phi (main_loop_exit_block);
1752 new_arg = gimple_phi_result (vphi);
1754 else if ((res = new_phi_args.get (new_arg)))
1755 new_arg = *res;
1756 else
1758 /* Create the LC PHI node for the exit. */
1759 tree new_def = copy_ssa_name (new_arg);
1760 gphi *lc_phi
1761 = create_phi_node (new_def, main_loop_exit_block);
1762 SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1763 new_arg = new_def;
1766 /* Create the PHI node in the merge block merging the
1767 main and early exit values. */
1768 tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1769 gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1770 edge main_e = single_succ_edge (main_loop_exit_block);
1771 SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1773 /* And adjust the epilog entry value. */
1774 adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1777 /* After creating the merge PHIs handle the early exits; those
1778 should use the values at the start of the loop. */
1779 for (auto gsi_from = gsi_start_phis (loop->header),
1780 gsi_to = gsi_start_phis (new_preheader);
1781 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1782 gsi_next (&gsi_from), gsi_next (&gsi_to))
1784 gimple *from_phi = gsi_stmt (gsi_from);
1785 gimple *to_phi = gsi_stmt (gsi_to);
1787 /* Now update the virtual PHI nodes with the right value. */
1788 tree alt_arg = gimple_phi_result (from_phi);
1789 if (virtual_operand_p (alt_arg))
1791 gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1792 alt_arg = gimple_phi_result (vphi);
1794 /* For other live args we didn't create LC PHI nodes.
1795 Do so here. */
1796 else
1798 tree alt_def = copy_ssa_name (alt_arg);
1799 gphi *lc_phi
1800 = create_phi_node (alt_def, alt_loop_exit_block);
1801 for (unsigned i = 0; i < gimple_phi_num_args (lc_phi);
1802 ++i)
1803 SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1804 alt_arg = alt_def;
1806 edge alt_e = single_succ_edge (alt_loop_exit_block);
1807 SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1810 /* For the single exit case only create the missing LC PHI nodes
1811 for the continuation of the loop IVs that are not also already
1812 reductions and thus had LC PHI nodes on the exit already. */
1813 else
1815 for (auto gsi_from = gsi_start_phis (loop->header),
1816 gsi_to = gsi_start_phis (new_loop->header);
1817 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1818 gsi_next (&gsi_from), gsi_next (&gsi_to))
1820 gimple *from_phi = gsi_stmt (gsi_from);
1821 gimple *to_phi = gsi_stmt (gsi_to);
1822 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1823 loop_latch_edge (loop));
1825 /* Check if we've already created a new phi node during edge
1826 redirection. If we have, only propagate the value
1827 downwards. */
1828 if (tree *res = new_phi_args.get (new_arg))
1830 adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1831 continue;
1834 tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1835 gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1836 SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg);
1837 adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1842 if (was_imm_dom || duplicate_outer_loop)
1843 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1845 /* And remove the unnecessary forwarder again. Keep the other
1846 one so we have a proper pre-header for the loop at the exit edge. */
1847 redirect_edge_pred (single_succ_edge (preheader),
1848 single_pred (preheader));
1849 delete_basic_block (preheader);
1850 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1851 loop_preheader_edge (scalar_loop)->src);
1853 /* Finally after wiring the new epilogue we need to update its main exit
1854 to the original function exit we recorded. Other exits are already
1855 correct. */
1856 if (multiple_exits_p)
1858 class loop *update_loop = new_loop;
1859 doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1860 for (unsigned i = 0; i < doms.length (); ++i)
1861 if (flow_bb_inside_loop_p (loop, doms[i]))
1862 doms.unordered_remove (i);
1864 for (edge e : get_loop_exit_edges (update_loop))
1866 edge ex;
1867 edge_iterator ei;
1868 FOR_EACH_EDGE (ex, ei, e->dest->succs)
1870 /* Find the first non-fallthrough block as fall-throughs can't
1871 dominate other blocks. */
1872 if (single_succ_p (ex->dest))
1874 doms.safe_push (ex->dest);
1875 ex = single_succ_edge (ex->dest);
1877 doms.safe_push (ex->dest);
1879 doms.safe_push (e->dest);
1882 iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1883 if (updated_doms)
1884 updated_doms->safe_splice (doms);
1887 else /* Add the copy at entry. */
1889 /* Copy the current loop LC PHI nodes between the original loop exit
1890 block and the new loop header. This allows us to later split the
1891 preheader block and still find the right LC nodes. */
1892 if (flow_loops)
1893 for (auto gsi_from = gsi_start_phis (new_loop->header),
1894 gsi_to = gsi_start_phis (loop->header);
1895 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1896 gsi_next (&gsi_from), gsi_next (&gsi_to))
1898 gimple *from_phi = gsi_stmt (gsi_from);
1899 gimple *to_phi = gsi_stmt (gsi_to);
1900 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1901 loop_latch_edge (new_loop));
1902 adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1903 new_arg);
1906 if (scalar_loop != loop)
1908 /* Remove the unnecessary forwarder of scalar_loop again. */
1909 redirect_edge_pred (single_succ_edge (preheader),
1910 single_pred (preheader));
1911 delete_basic_block (preheader);
1912 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1913 loop_preheader_edge (scalar_loop)->src);
1914 preheader = split_edge (loop_preheader_edge (loop));
1915 entry_e = single_pred_edge (preheader);
1918 redirect_edge_and_branch_force (entry_e, new_preheader);
1919 flush_pending_stmts (entry_e);
1920 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1922 redirect_edge_and_branch_force (new_exit, preheader);
1923 flush_pending_stmts (new_exit);
1924 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1926 /* And remove the unnecessary forwarder again. Keep the other
1927 one so we have a proper pre-header for the loop at the exit edge. */
1928 redirect_edge_pred (single_succ_edge (new_preheader),
1929 single_pred (new_preheader));
1930 delete_basic_block (new_preheader);
1931 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1932 loop_preheader_edge (new_loop)->src);
1934 /* Update dominators for multiple exits. */
1935 if (multiple_exits_p)
1937 for (edge alt_e : loop_exits)
1939 if (alt_e == loop_exit)
1940 continue;
1941 basic_block old_dom
1942 = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
1943 if (flow_bb_inside_loop_p (loop, old_dom))
1945 auto_vec<basic_block, 8> queue;
1946 for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
1947 son; son = next_dom_son (CDI_DOMINATORS, son))
1948 if (!flow_bb_inside_loop_p (loop, son))
1949 queue.safe_push (son);
1950 for (auto son : queue)
1951 set_immediate_dominator (CDI_DOMINATORS,
1952 son, get_bb_copy (old_dom));
1958 free (new_bbs);
1959 free (bbs);
1961 checking_verify_dominators (CDI_DOMINATORS);
1963 return new_loop;
1967 /* Given the condition expression COND, put it as the last statement of
1968 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1969 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1970 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1971 new edge as irreducible if IRREDUCIBLE_P is true. */
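/* Roughly (a sketch of the resulting CFG, not a verbatim dump), assuming the
single successor edge of GUARD_BB is the fall-through into the loop:

GUARD_BB:
if (COND) goto GUARD_TO; <- new skip edge, probability PROBABILITY
else fall through on the old successor edge <- probability PROBABILITY.invert ()

with GUARD_TO additionally made immediately dominated by DOM_BB. */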
1973 static edge
1974 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1975 basic_block guard_to, basic_block dom_bb,
1976 profile_probability probability, bool irreducible_p)
1978 gimple_stmt_iterator gsi;
1979 edge new_e, enter_e;
1980 gcond *cond_stmt;
1981 gimple_seq gimplify_stmt_list = NULL;
1983 enter_e = EDGE_SUCC (guard_bb, 0);
1984 enter_e->flags &= ~EDGE_FALLTHRU;
1985 enter_e->flags |= EDGE_FALSE_VALUE;
1986 gsi = gsi_last_bb (guard_bb);
1988 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1989 is_gimple_condexpr_for_cond, NULL_TREE);
1990 if (gimplify_stmt_list)
1991 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1993 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1994 gsi = gsi_last_bb (guard_bb);
1995 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1997 /* Add new edge to connect guard block to the merge/loop-exit block. */
1998 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
2000 new_e->probability = probability;
2001 if (irreducible_p)
2002 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
2004 enter_e->probability = probability.invert ();
2005 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
2007 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
2008 if (enter_e->dest->loop_father->header == enter_e->dest)
2009 split_edge (enter_e);
2011 return new_e;
2015 /* This function verifies that the following restrictions apply to LOOP:
2016 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
2017 for an innermost loop, and of 5 basic blocks for an outer loop.
2018 (2) it is single entry, single exit
2019 (3) its exit condition is the last stmt in the header
2020 (4) E is the entry/exit edge of LOOP.
2023 bool
2024 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
2025 const_edge e)
2027 edge entry_e = loop_preheader_edge (loop);
2028 gcond *orig_cond = get_loop_exit_condition (exit_e);
2029 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
2031 /* All loops have an outer scope; the only case loop->outer is NULL is for
2032 the function itself. */
2033 if (!loop_outer (loop)
2034 || !empty_block_p (loop->latch)
2035 || !exit_e
2036 /* Verify that new loop exit condition can be trivially modified. */
2037 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
2038 || (e != exit_e && e != entry_e))
2039 return false;
2041 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
2042 get_loop_body_with_size (loop, bbs, loop->num_nodes);
2043 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
2044 free (bbs);
2045 return ret;
2048 /* Function find_loop_location.
2050 Extract the location of the loop in the source code.
2051 If the loop is not well formed for vectorization, an estimated
2052 location is calculated.
2053 Return the loop location if found and an unknown location otherwise. */
2055 dump_user_location_t
2056 find_loop_location (class loop *loop)
2058 gimple *stmt = NULL;
2059 basic_block bb;
2060 gimple_stmt_iterator si;
2062 if (!loop)
2063 return dump_user_location_t ();
2065 /* For the root of the loop tree return the function location. */
2066 if (!loop_outer (loop))
2067 return dump_user_location_t::from_function_decl (cfun->decl);
2069 if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
2071 /* We only care about the loop location, so use any exit with location
2072 information. */
2073 for (edge e : get_loop_exit_edges (loop))
2075 stmt = get_loop_exit_condition (e);
2077 if (stmt
2078 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2079 return stmt;
2083 /* If we got here the loop is probably not "well formed",
2084 try to estimate the loop location. */
2086 if (!loop->header)
2087 return dump_user_location_t ();
2089 bb = loop->header;
2091 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2093 stmt = gsi_stmt (si);
2094 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2095 return stmt;
2098 return dump_user_location_t ();
2101 /* Return true if the phi described by STMT_INFO defines an IV of the
2102 loop to be vectorized. */
2104 static bool
2105 iv_phi_p (stmt_vec_info stmt_info)
2107 gphi *phi = as_a <gphi *> (stmt_info->stmt);
2108 if (virtual_operand_p (PHI_RESULT (phi)))
2109 return false;
2111 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2112 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2113 return false;
2115 return true;
2118 /* Return true if vectorizer can peel for nonlinear iv. */
2119 static bool
2120 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2121 stmt_vec_info stmt_info)
2123 enum vect_induction_op_type induction_type
2124 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2125 tree niters_skip;
2126 /* Init_expr will be updated by vect_update_ivs_after_vectorizer.
2127 If niters or vf is unknown:
2128 For shift, when the shift amount >= precision, the behavior is undefined.
2129 For mult, we don't know how to generate
2130 init_expr * pow (step, niters) for variable niters.
2131 For neg, unknown niters is fine, since the niters of the vectorized
2132 main loop will always be a multiple of 2.
2133 See also PR113163, PR114196 and PR114485. */
2134 if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2135 || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2136 || (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2137 && induction_type != vect_step_op_neg))
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "Peeling for epilogue is not supported"
2142 " for this nonlinear induction"
2143 " when iteration count is unknown or"
2144 " when using partial vectorization.\n");
2145 return false;
2148 /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
2149 if (induction_type == vect_step_op_mul)
2151 tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2152 tree type = TREE_TYPE (step_expr);
2154 if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2155 && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2159 "Avoid compile time hog on"
2160 " vect_peel_nonlinear_iv_init"
2161 " for nonlinear induction vec_step_op_mul"
2162 " when iteration count is too big.\n");
2163 return false;
2167 /* Also doesn't support peeling for neg when niter is variable.
2168 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
2169 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2170 if ((niters_skip != NULL_TREE
2171 && (TREE_CODE (niters_skip) != INTEGER_CST
2172 || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2173 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2174 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "Peeling for alignement is not supported"
2179 " for nonlinear induction when niters_skip"
2180 " is not constant.\n");
2181 return false;
2184 return true;
2187 /* Function vect_can_advance_ivs_p
2189 In case the number of iterations that LOOP iterates is unknown at compile
2190 time, an epilog loop will be generated, and the loop induction variables
2191 (IVs) will be "advanced" to the value they are supposed to take just before
2192 the epilog loop. Here we check that the access function of the loop IVs
2193 and the expression that represents the loop bound are simple enough.
2194 These restrictions will be relaxed in the future. */
2196 bool
2197 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2199 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2200 basic_block bb = loop->header;
2201 gphi_iterator gsi;
2203 /* Analyze phi functions of the loop header. */
2205 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2207 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2209 tree evolution_part;
2210 enum vect_induction_op_type induction_type;
2212 gphi *phi = gsi.phi ();
2213 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2214 if (dump_enabled_p ())
2215 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2216 phi_info->stmt);
2218 /* Skip virtual phi's. The data dependences that are associated with
2219 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2221 Skip reduction phis. */
2222 if (!iv_phi_p (phi_info))
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_NOTE, vect_location,
2226 "reduc or virtual phi. skip.\n");
2227 continue;
2230 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2231 if (induction_type != vect_step_op_add)
2233 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2234 return false;
2236 continue;
2239 /* Analyze the evolution function. */
2241 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2242 if (evolution_part == NULL_TREE)
2244 if (dump_enabled_p ())
2245 dump_printf (MSG_MISSED_OPTIMIZATION,
2246 "No access function or evolution.\n");
2247 return false;
2250 /* FORNOW: We do not transform initial conditions of IVs
2251 which evolution functions are not invariants in the loop. */
2253 if (!expr_invariant_in_loop_p (loop, evolution_part))
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "evolution not invariant in loop.\n");
2258 return false;
2261 /* FORNOW: We do not transform initial conditions of IVs
2262 which evolution functions are a polynomial of degree >= 2. */
2264 if (tree_is_chrec (evolution_part))
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2268 "evolution is chrec.\n");
2269 return false;
2273 return true;
2277 /* Function vect_update_ivs_after_vectorizer.
2279 "Advance" the induction variables of LOOP to the value they should take
2280 after the execution of LOOP. This is currently necessary because the
2281 vectorizer does not handle induction variables that are used after the
2282 loop. Such a situation occurs when the last iterations of LOOP are
2283 peeled, because:
2284 1. We introduced new uses after LOOP for IVs that were not originally used
2285 after LOOP: the IVs of LOOP are now used by an epilog loop.
2286 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2287 times, whereas the loop IVs should be bumped N times.
2289 Input:
2290 - LOOP - a loop that is going to be vectorized. The last few iterations
2291 of LOOP were peeled.
2292 - NITERS - the number of iterations that LOOP executes (before it is
2293 vectorized). i.e, the number of times the ivs should be bumped.
2294 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2295 coming out from LOOP on which there are uses of the LOOP ivs
2296 (this is the path from LOOP->exit to epilog_loop->preheader).
2298 The new definitions of the ivs are placed in LOOP->exit.
2299 The phi args associated with the edge UPDATE_E in the bb
2300 UPDATE_E->dest are updated accordingly.
2302 Assumption 1: Like the rest of the vectorizer, this function assumes
2303 a single loop exit that has a single predecessor.
2305 Assumption 2: The phi nodes in the LOOP header and in update_bb are
2306 organized in the same order.
2308 Assumption 3: The access function of the ivs is simple enough (see
2309 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2311 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2312 coming out of LOOP on which the ivs of LOOP are used (this is the path
2313 that leads to the epilog loop; other paths skip the epilog loop). This
2314 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2315 needs to have its phis updated.
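/* A purely illustrative example (the numbers are made up): for an IV with
init_expr 0 and step_expr 4 that should be bumped NITERS = 13 times, the
code below emits ni = 0 + 13 * 4 = 52 in the exit block and rewrites the
PHI argument on UPDATE_E so the epilog loop resumes from 52. */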
2318 static void
2319 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2320 tree niters, edge update_e)
2322 gphi_iterator gsi, gsi1;
2323 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2324 basic_block update_bb = update_e->dest;
2325 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2326 gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2328 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2329 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2330 gsi_next (&gsi), gsi_next (&gsi1))
2332 tree init_expr;
2333 tree step_expr, off;
2334 tree type;
2335 tree var, ni, ni_name;
2337 gphi *phi = gsi.phi ();
2338 gphi *phi1 = gsi1.phi ();
2339 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2340 if (dump_enabled_p ())
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "vect_update_ivs_after_vectorizer: phi: %G",
2343 (gimple *) phi);
2345 /* Skip reduction and virtual phis. */
2346 if (!iv_phi_p (phi_info))
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "reduc or virtual phi. skip.\n");
2351 continue;
2354 type = TREE_TYPE (gimple_phi_result (phi));
2355 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2356 step_expr = unshare_expr (step_expr);
2358 /* FORNOW: We do not support IVs whose evolution function is a polynomial
2359 of degree >= 2 or exponential. */
2360 gcc_assert (!tree_is_chrec (step_expr));
2362 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2363 gimple_seq stmts = NULL;
2364 enum vect_induction_op_type induction_type
2365 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2367 if (induction_type == vect_step_op_add)
2369 tree stype = TREE_TYPE (step_expr);
2370 off = fold_build2 (MULT_EXPR, stype,
2371 fold_convert (stype, niters), step_expr);
2373 if (POINTER_TYPE_P (type))
2374 ni = fold_build_pointer_plus (init_expr, off);
2375 else
2376 ni = fold_convert (type,
2377 fold_build2 (PLUS_EXPR, stype,
2378 fold_convert (stype, init_expr),
2379 off));
2381 /* Don't bother call vect_peel_nonlinear_iv_init. */
2382 else if (induction_type == vect_step_op_neg)
2383 ni = init_expr;
2384 else
2385 ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2386 niters, step_expr,
2387 induction_type);
2389 var = create_tmp_var (type, "tmp");
2391 gimple_seq new_stmts = NULL;
2392 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2394 /* Exit_bb shouldn't be empty. */
2395 if (!gsi_end_p (last_gsi))
2397 gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2398 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2400 else
2402 gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2403 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2406 /* Fix phi expressions in the successor bb. */
2407 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2411 /* Return a gimple value containing the misalignment (measured in vector
2412 elements) for the loop described by LOOP_VINFO, i.e. how many elements
2413 it is away from a perfectly aligned address. Add any new statements
2414 to SEQ. */
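/* For instance (hypothetical numbers): with a 16-byte target alignment,
4-byte elements and a start address whose low bits are 0xc, we get
misalign_in_bytes = addr & 15 = 12 and misalign_in_elems = 12 >> 2 = 3. */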
2416 static tree
2417 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2419 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2420 stmt_vec_info stmt_info = dr_info->stmt;
2421 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2423 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2424 unsigned HOST_WIDE_INT target_align_c;
2425 tree target_align_minus_1;
2427 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2428 size_zero_node) < 0;
2429 tree offset = (negative
2430 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2431 * TREE_INT_CST_LOW
2432 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2433 : size_zero_node);
2434 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2435 stmt_info, seq,
2436 offset);
2437 tree type = unsigned_type_for (TREE_TYPE (start_addr));
2438 if (target_align.is_constant (&target_align_c))
2439 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2440 else
2442 tree vla = build_int_cst (type, target_align);
2443 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2444 fold_build2 (MINUS_EXPR, type,
2445 build_int_cst (type, 0), vla));
2446 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2447 build_int_cst (type, 1));
2450 HOST_WIDE_INT elem_size
2451 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2452 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2454 /* Create: misalign_in_bytes = addr & (target_align - 1). */
2455 tree int_start_addr = fold_convert (type, start_addr);
2456 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2457 target_align_minus_1);
2459 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2460 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2461 elem_size_log);
2463 return misalign_in_elems;
2466 /* Function vect_gen_prolog_loop_niters
2468 Generate the number of iterations which should be peeled as prolog for the
2469 loop represented by LOOP_VINFO. It is calculated as the misalignment of
2470 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2471 As a result, after the execution of this loop, the data reference DR will
2472 refer to an aligned location. The following computation is generated:
2474 If the misalignment of DR is known at compile time:
2475 addr_mis = int mis = DR_MISALIGNMENT (dr);
2476 Else, compute address misalignment in bytes:
2477 addr_mis = addr & (target_align - 1)
2479 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2481 (elem_size = element type size; an element is the scalar element whose type
2482 is the inner type of the vectype)
2484 The computations will be emitted at the end of BB. We also compute and
2485 store upper bound (included) of the result in BOUND.
2487 When the step of the data-ref in the loop is not 1 (as in interleaved data
2488 and SLP), the number of iterations of the prolog must be divided by the step
2489 (which is equal to the size of interleaved group).
2491 The above formulas assume that VF == number of elements in the vector. This
2492 may not hold when there are multiple-types in the loop.
2493 In this case, for some data-references in the loop the VF does not represent
2494 the number of elements that fit in the vector. Therefore, instead of VF we
2495 use TYPE_VECTOR_SUBPARTS. */
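/* A worked example with made-up numbers: for VF = 4, elem_size = 4, step = 1
and a runtime misalignment of 8 bytes, addr_mis / elem_size = 2 and
prolog_niters = ((4 - 2) & 3) / 1 = 2, i.e. two scalar iterations are
peeled before the access becomes aligned. */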
2497 static tree
2498 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2499 basic_block bb, int *bound)
2501 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2502 tree var;
2503 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2504 gimple_seq stmts = NULL, new_stmts = NULL;
2505 tree iters, iters_name;
2506 stmt_vec_info stmt_info = dr_info->stmt;
2507 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2508 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2510 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2512 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "known peeling = %d.\n", npeel);
2518 iters = build_int_cst (niters_type, npeel);
2519 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2521 else
2523 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2524 tree type = TREE_TYPE (misalign_in_elems);
2525 HOST_WIDE_INT elem_size
2526 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2527 /* We only do prolog peeling if the target alignment is known at compile
2528 time. */
2529 poly_uint64 align_in_elems =
2530 exact_div (target_align, elem_size);
2531 tree align_in_elems_minus_1 =
2532 build_int_cst (type, align_in_elems - 1);
2533 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2535 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2536 & (align_in_elems - 1)). */
2537 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2538 size_zero_node) < 0;
2539 if (negative)
2540 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2541 align_in_elems_tree);
2542 else
2543 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2544 misalign_in_elems);
2545 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2546 iters = fold_convert (niters_type, iters);
2547 unsigned HOST_WIDE_INT align_in_elems_c;
2548 if (align_in_elems.is_constant (&align_in_elems_c))
2549 *bound = align_in_elems_c - 1;
2550 else
2551 *bound = -1;
2554 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_NOTE, vect_location,
2556 "niters for prolog loop: %T\n", iters);
2558 var = create_tmp_var (niters_type, "prolog_loop_niters");
2559 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2561 if (new_stmts)
2562 gimple_seq_add_seq (&stmts, new_stmts);
2563 if (stmts)
2565 gcc_assert (single_succ_p (bb));
2566 gimple_stmt_iterator gsi = gsi_last_bb (bb);
2567 if (gsi_end_p (gsi))
2568 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2569 else
2570 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2572 return iters_name;
2576 /* Function vect_update_init_of_dr
2578 If CODE is PLUS, the vector loop starts NITERS iterations after the
2579 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2580 iterations before the scalar one (using masking to skip inactive
2581 elements). This function updates the information recorded in DR to
2582 account for the difference. Specifically, it updates the OFFSET
2583 field of DR_INFO. */
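/* E.g. (hypothetical values): peeling NITERS = 3 iterations off a DR with a
DR_STEP of 4 bytes and CODE PLUS_EXPR bumps the recorded offset by
3 * 4 = 12 bytes, so the vector loop's accesses start after the peeled
data. */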
2585 static void
2586 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2588 struct data_reference *dr = dr_info->dr;
2589 tree offset = dr_info->offset;
2590 if (!offset)
2591 offset = build_zero_cst (sizetype);
2593 niters = fold_build2 (MULT_EXPR, sizetype,
2594 fold_convert (sizetype, niters),
2595 fold_convert (sizetype, DR_STEP (dr)));
2596 offset = fold_build2 (code, sizetype,
2597 fold_convert (sizetype, offset), niters);
2598 dr_info->offset = offset;
2602 /* Function vect_update_inits_of_drs
2604 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
2605 CODE and NITERS are as for vect_update_init_of_dr. */
2607 void
2608 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2609 tree_code code)
2611 unsigned int i;
2612 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2613 struct data_reference *dr;
2615 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2617 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
2618 here, but since we might use these niters to update the epilogue's niters
2619 and data references we can't insert them here as this definition might not
2620 always dominate its uses. */
2621 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2622 niters = fold_convert (sizetype, niters);
2624 FOR_EACH_VEC_ELT (datarefs, i, dr)
2626 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2627 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2628 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2629 vect_update_init_of_dr (dr_info, niters, code);
2633 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2634 by masking. This involves calculating the number of iterations to
2635 be peeled and then aligning all memory references appropriately. */
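/* As an illustration (values are made up): with VF = 8 and a known peeling
for alignment of 3 iterations, LOOP_VINFO_MASK_SKIP_NITERS is set to
8 - 3 = 5 and all data-reference offsets are moved back by 5 iterations
via MINUS_EXPR; the first 5 lanes of the first vector iteration are then
inactive. */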
2637 void
2638 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2640 tree misalign_in_elems;
2641 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2643 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2645 /* From the information recorded in LOOP_VINFO get the number of iterations
2646 that need to be skipped via masking. */
2647 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2649 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2650 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2651 misalign_in_elems = build_int_cst (type, misalign);
2653 else
2655 gimple_seq seq1 = NULL, seq2 = NULL;
2656 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2657 misalign_in_elems = fold_convert (type, misalign_in_elems);
2658 misalign_in_elems = force_gimple_operand (misalign_in_elems,
2659 &seq2, true, NULL_TREE);
2660 gimple_seq_add_seq (&seq1, seq2);
2661 if (seq1)
2663 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2664 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2665 gcc_assert (!new_bb);
2669 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "misalignment for fully-masked loop: %T\n",
2672 misalign_in_elems);
2674 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2676 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2679 /* This function builds ni_name = number of iterations. Statements
2680 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2681 it to TRUE if new ssa_var is generated. */
2683 tree
2684 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2686 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2687 if (TREE_CODE (ni) == INTEGER_CST)
2688 return ni;
2689 else
2691 tree ni_name, var;
2692 gimple_seq stmts = NULL;
2693 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2695 var = create_tmp_var (TREE_TYPE (ni), "niters");
2696 ni_name = force_gimple_operand (ni, &stmts, false, var);
2697 if (stmts)
2699 gsi_insert_seq_on_edge_immediate (pe, stmts);
2700 if (new_var_p != NULL)
2701 *new_var_p = true;
2704 return ni_name;
2708 /* Calculate the number of iterations above which the vectorized loop will
2709 be preferred over the scalar loop. NITERS_PROLOG is the number of iterations
2710 of prolog loop. If it's integer const, the integer number is also passed
2711 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2712 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2713 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2714 threshold below which the scalar (rather than vectorized) loop will be
2715 executed. This function stores the upper bound (inclusive) of the result
2716 in BOUND_SCALAR. */
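/* A hypothetical example: with INT_NITERS_PROLOG = 3, BOUND_EPILOG = 7,
TH = 16 and CHECK_PROFITABILITY true, the scalar loop handles at most
MAX (3 + 7, 16 - 1) = 15 iterations, which is stored in *BOUND_SCALAR and
returned as a constant. */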
2718 static tree
2719 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2720 int bound_prolog, poly_int64 bound_epilog, int th,
2721 poly_uint64 *bound_scalar,
2722 bool check_profitability)
2724 tree type = TREE_TYPE (niters_prolog);
2725 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2726 build_int_cst (type, bound_epilog));
2728 *bound_scalar = bound_prolog + bound_epilog;
2729 if (check_profitability)
2731 /* TH indicates the minimum niters of vectorized loop, while we
2732 compute the maximum niters of scalar loop. */
2733 th--;
2734 /* Peeling for constant times. */
2735 if (int_niters_prolog >= 0)
2737 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2738 return build_int_cst (type, *bound_scalar);
2740 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2741 and BOUND_EPILOG are inclusive upper bounds. */
2742 if (known_ge (th, bound_prolog + bound_epilog))
2744 *bound_scalar = th;
2745 return build_int_cst (type, th);
2747 /* Need to do runtime comparison. */
2748 else if (maybe_gt (th, bound_epilog))
2750 *bound_scalar = upper_bound (*bound_scalar, th);
2751 return fold_build2 (MAX_EXPR, type,
2752 build_int_cst (type, th), niters);
2755 return niters;
2758 /* NITERS is the number of times that the original scalar loop executes
2759 after peeling. Work out the maximum number of iterations N that can
2760 be handled by the vectorized form of the loop and then either:
2762 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2764 niters_vector = N
2766 b) set *STEP_VECTOR_PTR to one and generate:
2768 niters_vector = N / vf
2770 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2771 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2772 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
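/* To illustrate with made-up numbers: for NITERS = 20 and a constant VF of 8
without partial vectors, case (b) applies and we generate
niters_vector = 20 >> 3 = 2 with *STEP_VECTOR_PTR = 1; the remaining
iterations are left for the epilog. With a variable VF (or when using
partial vectors), case (a) keeps niters_vector = NITERS and sets
*STEP_VECTOR_PTR to VF. */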
2774 void
2775 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2776 tree *niters_vector_ptr, tree *step_vector_ptr,
2777 bool niters_no_overflow)
2779 tree ni_minus_gap, var;
2780 tree niters_vector, step_vector, type = TREE_TYPE (niters);
2781 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2782 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2783 tree log_vf = NULL_TREE;
2785 /* If epilogue loop is required because of data accesses with gaps, we
2786 subtract one iteration from the total number of iterations here for
2787 correct calculation of RATIO. */
2788 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2790 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2791 build_one_cst (type));
2792 if (!is_gimple_val (ni_minus_gap))
2794 var = create_tmp_var (type, "ni_gap");
2795 gimple *stmts = NULL;
2796 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2797 true, var);
2798 gsi_insert_seq_on_edge_immediate (pe, stmts);
2801 else
2802 ni_minus_gap = niters;
2804 /* To silence some unexpected warnings, simply initialize to 0. */
2805 unsigned HOST_WIDE_INT const_vf = 0;
2806 if (vf.is_constant (&const_vf)
2807 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2809 /* Create: niters >> log2(vf) */
2810 /* If it's known that niters == number of latch executions + 1 doesn't
2811 overflow, we can generate niters >> log2(vf); otherwise we generate
2812 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2813 will be at least one. */
2814 log_vf = build_int_cst (type, exact_log2 (const_vf));
2815 if (niters_no_overflow)
2816 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2817 else
2818 niters_vector
2819 = fold_build2 (PLUS_EXPR, type,
2820 fold_build2 (RSHIFT_EXPR, type,
2821 fold_build2 (MINUS_EXPR, type,
2822 ni_minus_gap,
2823 build_int_cst (type, vf)),
2824 log_vf),
2825 build_int_cst (type, 1));
2826 step_vector = build_one_cst (type);
2828 else
2830 niters_vector = ni_minus_gap;
2831 step_vector = build_int_cst (type, vf);
2834 if (!is_gimple_val (niters_vector))
2836 var = create_tmp_var (type, "bnd");
2837 gimple_seq stmts = NULL;
2838 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2839 gsi_insert_seq_on_edge_immediate (pe, stmts);
2840 /* The peeling algorithm guarantees that the vector loop bound is at least
2841 ONE; we set range information to make the niters analyzer's life easier.
2842 Note the latch iteration count can be TYPE_MAX_VALUE so we have to
2843 represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf. */
2844 if (stmts != NULL && log_vf)
2846 if (niters_no_overflow)
2848 value_range vr (type,
2849 wi::one (TYPE_PRECISION (type)),
2850 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2851 TYPE_SIGN (type)),
2852 exact_log2 (const_vf),
2853 TYPE_SIGN (type)));
2854 set_range_info (niters_vector, vr);
2856 /* For VF == 1 the vector IV might also overflow so we cannot
2857 assert a minimum value of 1. */
2858 else if (const_vf > 1)
2860 value_range vr (type,
2861 wi::one (TYPE_PRECISION (type)),
2862 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2863 TYPE_SIGN (type))
2864 - (const_vf - 1),
2865 exact_log2 (const_vf), TYPE_SIGN (type))
2866 + 1);
2867 set_range_info (niters_vector, vr);
2871 *niters_vector_ptr = niters_vector;
2872 *step_vector_ptr = step_vector;
2874 return;
2877 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2878 loop specified by LOOP_VINFO after vectorization, compute the number
2879 of iterations before vectorization (niters_vector * vf) and store it
2880 to NITERS_VECTOR_MULT_VF_PTR. */
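/* For example (illustrative only): with NITERS_VECTOR = 2 and a constant VF
of 8 this computes 2 << 3 = 16; if an early-break vector iteration was
peeled (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED) one full VF is subtracted,
giving 8. */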
2882 static void
2883 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2884 tree niters_vector,
2885 tree *niters_vector_mult_vf_ptr)
2887 /* We should be using a step_vector of VF if VF is variable. */
2888 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2889 tree type = TREE_TYPE (niters_vector);
2890 tree log_vf = build_int_cst (type, exact_log2 (vf));
2891 tree tree_vf = build_int_cst (type, vf);
2892 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2894 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2895 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2896 niters_vector, log_vf);
2898 /* If we've peeled a vector iteration then subtract one full vector
2899 iteration. */
2900 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2901 niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
2902 niters_vector_mult_vf, tree_vf);
2904 if (!is_gimple_val (niters_vector_mult_vf))
2906 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2907 gimple_seq stmts = NULL;
2908 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2909 &stmts, true, var);
2910 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2911 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2913 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2916 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2917 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2918 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2919 appear like below:
2921 guard_bb:
2922 if (cond)
2923 goto merge_bb;
2924 else
2925 goto skip_loop;
2927 skip_loop:
2928 header_a:
2929 i_1 = PHI<i_0, i_2>;
2931 i_2 = i_1 + 1;
2932 if (cond_a)
2933 goto latch_a;
2934 else
2935 goto exit_a;
2936 latch_a:
2937 goto header_a;
2939 exit_a:
2940 i_5 = PHI<i_2>;
2942 merge_bb:
2943 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2945 update_loop:
2946 header_b:
2947 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2949 i_4 = i_3 + 1;
2950 if (cond_b)
2951 goto latch_b;
2952 else
2953 goto exit_bb;
2954 latch_b:
2955 goto header_b;
2957 exit_bb:
2959 This function creates PHI nodes at merge_bb and replaces the use of i_5
2960 in the update_loop's PHI node with the result of the new PHI node. */
2962 static void
2963 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2964 class loop *update_loop,
2965 edge guard_edge, edge merge_edge)
2967 location_t merge_loc, guard_loc;
2968 edge orig_e = loop_preheader_edge (skip_loop);
2969 edge update_e = loop_preheader_edge (update_loop);
2970 gphi_iterator gsi_orig, gsi_update;
2972 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2973 gsi_update = gsi_start_phis (update_loop->header));
2974 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2975 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2977 gphi *orig_phi = gsi_orig.phi ();
2978 gphi *update_phi = gsi_update.phi ();
2980 /* Generate new phi node at merge bb of the guard. */
2981 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2982 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2984 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2985 args in NEW_PHI for these edges. */
2986 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2987 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2988 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2989 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2990 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2991 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2993 /* Update phi in UPDATE_PHI. */
2994 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2998 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2999 Return a value that equals:
3001 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
3002 - SKIP_VALUE when the main loop is skipped. */
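/* In GIMPLE terms the result is just a merge PHI in the destination of
loop_vinfo->main_loop_edge, roughly:

result = PHI <MAIN_LOOP_VALUE (main_loop_edge), SKIP_VALUE (skip_main_loop_edge)> */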
3004 tree
3005 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
3006 tree skip_value)
3008 gcc_assert (loop_vinfo->main_loop_edge);
3010 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
3011 basic_block bb = loop_vinfo->main_loop_edge->dest;
3012 gphi *new_phi = create_phi_node (phi_result, bb);
3013 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
3014 UNKNOWN_LOCATION);
3015 add_phi_arg (new_phi, skip_value,
3016 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
3017 return phi_result;
3020 /* Function vect_do_peeling.
3022 Input:
3023 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
3025 preheader:
3026 LOOP:
3027 header_bb:
3028 loop_body
3029 if (exit_loop_cond) goto exit_bb
3030 else goto header_bb
3031 exit_bb:
3033 - NITERS: The number of iterations of the loop.
3034 - NITERSM1: The number of iterations of the loop's latch.
3035 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3036 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3037 CHECK_PROFITABILITY is true.
3038 Output:
3039 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3040 iterate after vectorization; see vect_set_loop_condition for details.
3041 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3042 should be set to the number of scalar iterations handled by the
3043 vector loop. The SSA name is only used on exit from the loop.
3045 This function peels prolog and epilog from the loop, adds guards skipping
3046 PROLOG and EPILOG for various conditions. As a result, the changed CFG
3047 would look like:
3049 guard_bb_1:
3050 if (prefer_scalar_loop) goto merge_bb_1
3051 else goto guard_bb_2
3053 guard_bb_2:
3054 if (skip_prolog) goto merge_bb_2
3055 else goto prolog_preheader
3057 prolog_preheader:
3058 PROLOG:
3059 prolog_header_bb:
3060 prolog_body
3061 if (exit_prolog_cond) goto prolog_exit_bb
3062 else goto prolog_header_bb
3063 prolog_exit_bb:
3065 merge_bb_2:
3067 vector_preheader:
3068 VECTOR LOOP:
3069 vector_header_bb:
3070 vector_body
3071 if (exit_vector_cond) goto vector_exit_bb
3072 else goto vector_header_bb
3073 vector_exit_bb:
3075 guard_bb_3:
3076 if (skip_epilog) goto merge_bb_3
3077 else goto epilog_preheader
3079 merge_bb_1:
3081 epilog_preheader:
3082 EPILOG:
3083 epilog_header_bb:
3084 epilog_body
3085 if (exit_epilog_cond) goto merge_bb_3
3086 else goto epilog_header_bb
3088 merge_bb_3:
3090 Note this function peels prolog and epilog only if it's necessary,
3091 as well as guards.
3092 This function returns the epilogue loop if a decision was made to vectorize
3093 it, otherwise NULL.
3095 The analysis resulting in this epilogue loop's loop_vec_info was performed
3096 in the same vect_analyze_loop call as the main loop's. At that time
3097 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3098 vectorization factors than the main loop. This list is stored in the main
3099 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
3100 vectorize the epilogue loop for a lower vectorization factor, the
3101 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
3102 updated and linked to the epilogue loop. This is later used to vectorize
3103 the epilogue. The reason the loop_vec_info needs updating is that it was
3104 constructed based on the original main loop, and the epilogue loop is a
3105 copy of this loop, so all links pointing to statements in the original loop
3106 need updating. Furthermore, these loop_vec_infos share the
3107 data_reference's records, which will also need to be updated.
3109 TODO: Guard for prefer_scalar_loop should be emitted along with
3110 versioning conditions if loop versioning is needed. */
3113 class loop *
3114 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3115 tree *niters_vector, tree *step_vector,
3116 tree *niters_vector_mult_vf_var, int th,
3117 bool check_profitability, bool niters_no_overflow,
3118 tree *advance)
3120 edge e, guard_e;
3121 tree type = TREE_TYPE (niters), guard_cond;
3122 basic_block guard_bb, guard_to;
3123 profile_probability prob_prolog, prob_vector, prob_epilog;
3124 int estimated_vf;
3125 int prolog_peeling = 0;
3126 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
3127 /* We currently do not support prolog peeling if the target alignment is not
3128 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
3129 target alignment being constant. */
3130 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3131 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
3132 return NULL;
3134 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3135 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3137 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3138 poly_uint64 bound_epilog = 0;
3139 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3140 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3141 bound_epilog += vf - 1;
3142 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3143 bound_epilog += 1;
3145 /* For early breaks the scalar loop needs to execute at most VF times
3146 to find the element that caused the break. */
3147 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3148 bound_epilog = vf;
3150 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3151 poly_uint64 bound_scalar = bound_epilog;
3153 if (!prolog_peeling && !epilog_peeling)
3154 return NULL;
3156 /* Before doing any peeling make sure to reset debug binds outside of
3157 the loop referring to defs not in LC SSA. */
3158 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3159 for (unsigned i = 0; i < loop->num_nodes; ++i)
3161 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3162 imm_use_iterator ui;
3163 gimple *use_stmt;
3164 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3165 gsi_next (&gsi))
3167 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3168 if (gimple_debug_bind_p (use_stmt)
3169 && loop != gimple_bb (use_stmt)->loop_father
3170 && !flow_loop_nested_p (loop,
3171 gimple_bb (use_stmt)->loop_father))
3173 gimple_debug_bind_reset_value (use_stmt);
3174 update_stmt (use_stmt);
3177 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3178 gsi_next (&gsi))
3180 ssa_op_iter op_iter;
3181 def_operand_p def_p;
3182 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3183 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3184 if (gimple_debug_bind_p (use_stmt)
3185 && loop != gimple_bb (use_stmt)->loop_father
3186 && !flow_loop_nested_p (loop,
3187 gimple_bb (use_stmt)->loop_father))
3189 gimple_debug_bind_reset_value (use_stmt);
3190 update_stmt (use_stmt);
3195 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3196 estimated_vf = vect_vf_for_cost (loop_vinfo);
3197 if (estimated_vf == 2)
3198 estimated_vf = 3;
3199 prob_prolog = prob_epilog = profile_probability::guessed_always ()
3200 .apply_scale (estimated_vf - 1, estimated_vf);
3202 class loop *prolog, *epilog = NULL;
3203 class loop *first_loop = loop;
3204 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3206 /* SSA form needs to be up-to-date since we are going to manually
3207 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3208 SSA update state after that, so we have to make sure not to lose any
3209 pending update needs. */
3210 gcc_assert (!need_ssa_update_p (cfun));
3212 /* If we're vectorizing an epilogue loop, we have ensured that the
3213 virtual operand is in SSA form throughout the vectorized main loop.
3214 Normally it is possible to trace the updated
3215 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3216 back to scalar-stmt vuses, meaning that the effect of the SSA update
3217 remains local to the main loop. However, there are rare cases in
3218 which the vectorized loop should have vdefs even when the original scalar
3219 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3220 introduces clobbers of the temporary vector array, which in turn
3221 needs new vdefs. If the scalar loop doesn't write to memory, these
3222 new vdefs will be the only ones in the vector loop.
3223 We currently defer updating virtual SSA form and creating
3224 a virtual PHI for this case so we do not have to make sure the
3225 newly introduced virtual def is in LCSSA form. */
3227 if (MAY_HAVE_DEBUG_BIND_STMTS)
3229 gcc_assert (!adjust_vec.exists ());
3230 adjust_vec.create (32);
3232 initialize_original_copy_tables ();
3234 /* Record the anchor bb at which the guard should be placed if the scalar
3235 loop might be preferred. */
3236 basic_block anchor = loop_preheader_edge (loop)->src;
3238 /* Generate the number of iterations for the prolog loop. We do this here
3239 so that we can also get the upper bound on the number of iterations. */
3240 tree niters_prolog;
3241 int bound_prolog = 0;
3242 if (prolog_peeling)
3244 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3245 &bound_prolog);
3246 /* If alignment peeling is known, we will always execute the prolog. */
3247 if (TREE_CODE (niters_prolog) == INTEGER_CST)
3248 prob_prolog = profile_probability::always ();
3250 else
3251 niters_prolog = build_int_cst (type, 0);
3253 loop_vec_info epilogue_vinfo = NULL;
3254 if (vect_epilogues)
3256 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
3257 loop_vinfo->epilogue_vinfos.ordered_remove (0);
3260 tree niters_vector_mult_vf = NULL_TREE;
3261 /* Saving NITERs before the loop, as this may be changed by prologue. */
3262 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3263 edge update_e = NULL, skip_e = NULL;
3264 unsigned int lowest_vf = constant_lower_bound (vf);
3265 /* Prolog loop may be skipped. */
3266 bool skip_prolog = (prolog_peeling != 0);
3267 /* Skip straight to the epilog, bypassing the vectorized loop, when there
3268 are not enough iterations to enter the vectorized loop. If true we
3269 perform runtime checks on NITERS to decide whether to skip the current
3270 vectorized loop. If we know the number of scalar iterations, we only
3271 add such a check when that number may be smaller than the number of
3272 iterations required to enter this loop; for this we use the upper
3273 bounds on the prolog and epilog peeling. When we don't know the number
3274 of iterations and don't require versioning it is because we have
3275 asserted that there are enough scalar iterations to enter the main
3276 loop, so this skip is not necessary. When we are versioning then we
3277 only add such a skip if we have chosen to vectorize the epilogue. */
3279 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3280 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3281 bound_prolog + bound_epilog)
3282 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3283 || vect_epilogues));
3285 /* Epilog loop must be executed if the number of iterations for epilog
3286 loop is known at compile time, otherwise we need to add a check at
3287 the end of vector loop and skip to the end of epilog loop. */
3288 bool skip_epilog = (prolog_peeling < 0
3289 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3290 || !vf.is_constant ());
3291 /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3292 loop must be executed. */
3293 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3294 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3295 skip_epilog = false;
3297 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3298 auto_vec<profile_count> original_counts;
3299 basic_block *original_bbs = NULL;
3301 if (skip_vector)
3303 split_edge (loop_preheader_edge (loop));
3305 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3307 original_bbs = get_loop_body (loop);
3308 for (unsigned int i = 0; i < loop->num_nodes; i++)
3309 original_counts.safe_push (original_bbs[i]->count);
3312 /* Due to the order in which we peel prolog and epilog, we first
3313 propagate probability to the whole loop. The purpose is to
3314 avoid adjusting probabilities of both prolog and vector loops
3315 separately. Note in this case, the probability of epilog loop
3316 needs to be scaled back later. */
3317 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3318 if (prob_vector.initialized_p ())
3320 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3321 scale_loop_profile (loop, prob_vector, -1);
3325 if (vect_epilogues)
3327 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3328 use the original scalar loop as remaining epilogue if necessary. */
3329 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3330 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3331 LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3332 = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3335 if (prolog_peeling)
3337 e = loop_preheader_edge (loop);
3338 edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3339 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3340 && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo));
3342 /* Peel prolog and put it on preheader edge of loop. */
3343 edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3344 edge prolog_e = NULL;
3345 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3346 scalar_loop, scalar_e,
3347 e, &prolog_e);
3348 gcc_assert (prolog);
3349 prolog->force_vectorize = false;
3351 first_loop = prolog;
3352 reset_original_copy_tables ();
3354 /* Update the number of iterations for prolog loop. */
3355 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3356 vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3357 step_prolog, NULL_TREE, false);
3359 /* Skip the prolog loop. */
3360 if (skip_prolog)
3362 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3363 niters_prolog, build_int_cst (type, 0));
3364 guard_bb = loop_preheader_edge (prolog)->src;
3365 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3366 guard_to = split_edge (loop_preheader_edge (loop));
3367 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3368 guard_to, guard_bb,
3369 prob_prolog.invert (),
3370 irred_flag);
3371 for (edge alt_e : get_loop_exit_edges (prolog))
3373 if (alt_e == prolog_e)
3374 continue;
3375 basic_block old_dom
3376 = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
3377 if (flow_bb_inside_loop_p (prolog, old_dom))
3379 auto_vec<basic_block, 8> queue;
3380 for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
3381 son; son = next_dom_son (CDI_DOMINATORS, son))
3382 if (!flow_bb_inside_loop_p (prolog, son))
3383 queue.safe_push (son);
3384 for (auto son : queue)
3385 set_immediate_dominator (CDI_DOMINATORS, son, guard_bb);
3389 e = EDGE_PRED (guard_to, 0);
3390 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3391 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3393 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3394 scale_loop_profile (prolog, prob_prolog, bound_prolog - 1);
3397 /* Update init address of DRs. */
3398 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3399 /* Update niters for vector loop. */
3400 LOOP_VINFO_NITERS (loop_vinfo)
3401 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3402 LOOP_VINFO_NITERSM1 (loop_vinfo)
3403 = fold_build2 (MINUS_EXPR, type,
3404 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
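/* Editor's note (illustration only): with an original NITERS of 100 and a
   prolog of 3 iterations, NITERS becomes 97 and NITERSM1 becomes 96 here.  */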
3405 bool new_var_p = false;
3406 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3407 /* It's guaranteed that vector loop bound before vectorization is at
3408 least VF, so set range information for newly generated var. */
3409 if (new_var_p)
3411 value_range vr (type,
3412 wi::to_wide (build_int_cst (type, lowest_vf)),
3413 wi::to_wide (TYPE_MAX_VALUE (type)));
3414 set_range_info (niters, vr);
3417 /* Prolog iterates at most bound_prolog times, latch iterates at
3418 most bound_prolog - 1 times. */
3419 record_niter_bound (prolog, bound_prolog - 1, false, true);
3420 delete_update_ssa ();
3421 adjust_vec_debug_stmts ();
3422 scev_reset ();
3424 basic_block bb_before_epilog = NULL;
3426 if (epilog_peeling)
3428 e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3429 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3431 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3432 said epilog then we should use a copy of the main loop as a starting
3433 point. This loop may have already had some preliminary transformations
3434 to allow for more optimal vectorization, for example if-conversion.
3435 If we are not vectorizing the epilog then we should use the scalar loop
3436 as the transformations mentioned above make little or no sense when not
3437 vectorizing. */
3438 edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3439 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3440 edge epilog_e = vect_epilogues ? e : scalar_e;
3441 edge new_epilog_e = NULL;
3442 auto_vec<basic_block> doms;
3443 epilog
3444 = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3445 &new_epilog_e, true, &doms);
3447 LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3448 gcc_assert (epilog);
3449 gcc_assert (new_epilog_e);
3450 epilog->force_vectorize = false;
3451 bb_before_epilog = loop_preheader_edge (epilog)->src;
3453 /* The scalar version of the loop may be preferred. In this case, add a guard
3454 and skip to epilog. Note this only happens when the number of
3455 iterations of loop is unknown at compile time, otherwise this
3456 won't be vectorized. */
3457 if (skip_vector)
3459 /* Additional epilogue iteration is peeled if gap exists. */
3460 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3461 bound_prolog, bound_epilog,
3462 th, &bound_scalar,
3463 check_profitability);
3464 /* Build guard against NITERSM1 since NITERS may overflow. */
3465 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
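/* Editor's note: NITERS is NITERSM1 + 1, so a loop that runs through the
   whole range of a 32-bit type has NITERS wrap to 0 while NITERSM1 is still
   0xffffffff; guarding on NITERSM1 therefore avoids the overflow mentioned
   above.  */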
3466 guard_bb = anchor;
3467 guard_to = split_edge (loop_preheader_edge (epilog));
3468 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3469 guard_to, guard_bb,
3470 prob_vector.invert (),
3471 irred_flag);
3472 skip_e = guard_e;
3473 e = EDGE_PRED (guard_to, 0);
3474 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3475 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3477 /* Simply propagate profile info from guard_bb to guard_to which is
3478 a merge point of control flow. */
3479 profile_count old_count = guard_to->count;
3480 guard_to->count = guard_bb->count;
3482 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3483 if (vect_epilogues || scalar_loop == NULL)
3485 gcc_assert (epilog->num_nodes == loop->num_nodes);
3486 basic_block *bbs = get_loop_body (epilog);
3487 for (unsigned int i = 0; i < epilog->num_nodes; i++)
3489 gcc_assert (get_bb_original (bbs[i]) == original_bbs[i]);
3490 bbs[i]->count = original_counts[i];
3492 free (bbs);
3493 free (original_bbs);
3495 else if (old_count.nonzero_p ())
3496 scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3498 /* Only need to handle basic block before epilog loop if it's not
3499 the guard_bb, which is the case when skip_vector is true. */
3500 if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3501 bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3502 bb_before_epilog = loop_preheader_edge (epilog)->src;
3505 /* If the loop is peeled a non-zero constant number of times, niters now
3506 refers to orig_niters - prolog_peeling; it won't overflow even if
3507 orig_niters overflows. */
3508 niters_no_overflow |= (prolog_peeling > 0);
3509 vect_gen_vector_loop_niters (loop_vinfo, niters,
3510 niters_vector, step_vector,
3511 niters_no_overflow);
3512 if (!integer_onep (*step_vector))
3514 /* On exit from the loop we will have an easy way of calculating
3515 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3516 until then. */
3517 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3518 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3519 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3521 else
3522 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3523 &niters_vector_mult_vf);
3524 /* Update IVs of original loop as if they were advanced by
3525 niters_vector_mult_vf steps. */
3526 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3527 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3528 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3529 update_e = single_succ_edge (LOOP_VINFO_IV_EXIT (loop_vinfo)->dest);
3531 /* If we have a peeled vector iteration then all exits are the same, and
3532 so the main exit needs to be treated the same as the alternative exits:
3533 we leave their updates to vectorizable_live_operations. */
3535 if (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3536 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3537 update_e);
3539 /* If we have a peeled vector iteration we will never skip the epilog loop
3540 and we can simplify the cfg a lot by not doing the edge split. */
3541 if (skip_epilog || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3543 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3544 niters, niters_vector_mult_vf);
3546 guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3547 edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3548 guard_to = epilog_e->dest;
3549 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3550 skip_vector ? anchor : guard_bb,
3551 prob_epilog.invert (),
3552 irred_flag);
3553 doms.safe_push (guard_to);
3554 if (vect_epilogues)
3555 epilogue_vinfo->skip_this_loop_edge = guard_e;
3556 edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3557 gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3558 for (gphi_iterator gsi = gsi_start_phis (guard_to);
3559 !gsi_end_p (gsi); gsi_next (&gsi))
3561 /* We expect all of the PHIs we have on epilog_e to also be
3562 present on the main loop exit. But sometimes a stray virtual
3563 definition can appear at epilog_e, which we can then take to
3564 be the same on all exits; we removed the LC SSA PHI on the
3565 main exit beforehand, so we don't need to create a loop PHI
3566 for it. */
3567 if (virtual_operand_p (gimple_phi_result (*gsi))
3568 && (gsi_end_p (gsi2)
3569 || !virtual_operand_p (gimple_phi_result (*gsi2))))
3570 add_phi_arg (*gsi,
3571 gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3572 guard_e, UNKNOWN_LOCATION);
3573 else
3575 add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3576 UNKNOWN_LOCATION);
3577 gsi_next (&gsi2);
3581 /* Only need to handle basic block before epilog loop if it's not
3582 the guard_bb, which is the case when skip_vector is true. */
3583 if (guard_bb != bb_before_epilog)
3585 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3587 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3589 scale_loop_profile (epilog, prob_epilog, -1);
3592 /* Recalculate the dominators after adding the guard edge. */
3593 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3594 iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3596 /* When we do not have a loop-around edge to the epilog we know
3597 the vector loop covered at least VF scalar iterations unless
3598 we have early breaks.
3599 Update any known upper bound with this knowledge. */
3600 if (! skip_vector
3601 && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3603 if (epilog->any_upper_bound)
3604 epilog->nb_iterations_upper_bound -= lowest_vf;
3605 if (epilog->any_likely_upper_bound)
3606 epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3607 if (epilog->any_estimate)
3608 epilog->nb_iterations_estimate -= lowest_vf;
3611 unsigned HOST_WIDE_INT bound;
3612 if (bound_scalar.is_constant (&bound))
3614 gcc_assert (bound != 0);
3615 /* Adjust the upper bound by the extra peeled vector iteration if we
3616 are an epilogue of a peeled vect loop and not VLA. For VLA the
3617 loop bounds are unknown. */
3618 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3619 && vf.is_constant ())
3620 bound += vf.to_constant ();
3621 /* -1 to convert loop iterations to latch iterations. */
3622 record_niter_bound (epilog, bound - 1, false, true);
3623 scale_loop_profile (epilog, profile_probability::always (),
3624 bound - 1);
3627 delete_update_ssa ();
3628 adjust_vec_debug_stmts ();
3629 scev_reset ();
3632 if (vect_epilogues)
3634 epilog->aux = epilogue_vinfo;
3635 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3636 LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3637 = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3639 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3641 /* We now must calculate the number of NITERS performed by the previous
3642 loop and EPILOGUE_NITERS to be performed by the epilogue. */
3643 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3644 niters_prolog, niters_vector_mult_vf);
3646 /* If skip_vector we may skip the previous loop, we insert a phi-node to
3647 determine whether we are coming from the previous vectorized loop
3648 using the update_e edge or the skip_vector basic block using the
3649 skip_e edge. */
3650 if (skip_vector)
3652 gcc_assert (update_e != NULL && skip_e != NULL);
3653 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3654 update_e->dest);
3655 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3656 gimple *stmt = gimple_build_assign (new_ssa, niters);
3657 gimple_stmt_iterator gsi;
3658 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3659 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3661 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3662 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3664 else
3666 gsi = gsi_last_bb (update_e->src);
3667 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3670 niters = new_ssa;
3671 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3672 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3673 UNKNOWN_LOCATION);
3674 niters = PHI_RESULT (new_phi);
3675 epilogue_vinfo->main_loop_edge = update_e;
3676 epilogue_vinfo->skip_main_loop_edge = skip_e;
3679 /* Set ADVANCE to the number of iterations performed by the previous
3680 loop and its prologue. */
3681 *advance = niters;
3683 /* Subtract the number of iterations performed by the vectorized loop
3684 from the number of total iterations. */
3685 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3686 before_loop_niters,
3687 niters);
3689 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3690 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3691 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3692 epilogue_niters,
3693 build_one_cst (TREE_TYPE (epilogue_niters)));
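/* Editor's note (illustration only, continuing the 100-iteration example):
   with a prolog of 3 and a main-loop VF of 8 the vector loop executes 12
   iterations covering 96 scalar ones, so NITERS above is 3 + 96 = 99 and the
   epilogue is left with 100 - 99 = 1 iteration (its NITERSM1 is 0).  */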
3695 /* Decide what to do if the number of epilogue iterations is not
3696 a multiple of the epilogue loop's vectorization factor.
3697 We should have rejected the loop during the analysis phase
3698 if this fails. */
3699 bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo);
3700 gcc_assert (res);
3703 adjust_vec.release ();
3704 free_original_copy_tables ();
3706 return vect_epilogues ? epilog : NULL;
3709 /* Function vect_create_cond_for_niters_checks.
3711 Create a conditional expression that represents the run-time checks for
3712 loop's niter. The loop is guaranteed to terminate if the run-time
3713 checks hold.
3715 Input:
3716 COND_EXPR - input conditional expression. New conditions will be chained
3717 with logical AND operation. A NULL *COND_EXPR is treated
3718 as "true".
3719 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
3720 assumptions to be checked.
3722 Output:
3723 COND_EXPR - conditional expression.
3725 The returned COND_EXPR is the conditional expression to be used in the
3726 if statement that controls which version of the loop gets executed at
3727 runtime. */
3729 static void
3730 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3732 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3734 if (*cond_expr)
3735 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3736 *cond_expr, part_cond_expr);
3737 else
3738 *cond_expr = part_cond_expr;
3741 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3742 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3744 static void
3745 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3747 if (*cond_expr)
3748 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3749 *cond_expr, part_cond_expr);
3750 else
3751 *cond_expr = part_cond_expr;
3754 /* Function vect_create_cond_for_align_checks.
3756 Create a conditional expression that represents the alignment checks for
3757 all of data references (array element references) whose alignment must be
3758 checked at runtime.
3760 Input:
3761 COND_EXPR - input conditional expression. New conditions will be chained
3762 with logical AND operation.
3763 LOOP_VINFO - two fields of the loop information are used.
3764 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3765 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3767 Output:
3768 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3769 expression.
3770 The returned value is the conditional expression to be used in the if
3771 statement that controls which version of the loop gets executed at runtime.
3773 The algorithm makes two assumptions:
3774 1) The number of bytes "n" in a vector is a power of 2.
3775 2) An address "a" is aligned if a%n is zero and that this
3776 test can be done as a&(n-1) == 0. For example, for 16
3777 byte vectors the test is a&0xf == 0. */
3779 static void
3780 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3781 tree *cond_expr,
3782 gimple_seq *cond_expr_stmt_list)
3784 const vec<stmt_vec_info> &may_misalign_stmts
3785 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3786 stmt_vec_info stmt_info;
3787 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3788 tree mask_cst;
3789 unsigned int i;
3790 tree int_ptrsize_type;
3791 char tmp_name[20];
3792 tree or_tmp_name = NULL_TREE;
3793 tree and_tmp_name;
3794 gimple *and_stmt;
3795 tree ptrsize_zero;
3796 tree part_cond_expr;
3798 /* Check that mask is one less than a power of 2, i.e., mask is
3799 all zeros followed by all ones. */
3800 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
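/* Editor's note: e.g. MASK 0xf passes the assert (0xf & 0x10 == 0), while a
   non-contiguous mask such as 0xd fails it (0xd & 0xe == 0xc).  */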
3802 int_ptrsize_type = signed_type_for (ptr_type_node);
3804 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3805 of the first vector of the i'th data reference. */
3807 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3809 gimple_seq new_stmt_list = NULL;
3810 tree addr_base;
3811 tree addr_tmp_name;
3812 tree new_or_tmp_name;
3813 gimple *addr_stmt, *or_stmt;
3814 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3815 bool negative = tree_int_cst_compare
3816 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3817 tree offset = negative
3818 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3819 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3820 : size_zero_node;
3822 /* create: addr_tmp = (int)(address_of_first_vector) */
3823 addr_base =
3824 vect_create_addr_base_for_vector_ref (loop_vinfo,
3825 stmt_info, &new_stmt_list,
3826 offset);
3827 if (new_stmt_list != NULL)
3828 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3830 sprintf (tmp_name, "addr2int%d", i);
3831 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3832 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3833 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3835 /* The addresses are ORed together. */
3837 if (or_tmp_name != NULL_TREE)
3839 /* create: or_tmp = or_tmp | addr_tmp */
3840 sprintf (tmp_name, "orptrs%d", i);
3841 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3842 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3843 or_tmp_name, addr_tmp_name);
3844 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3845 or_tmp_name = new_or_tmp_name;
3847 else
3848 or_tmp_name = addr_tmp_name;
3850 } /* end for i */
3852 mask_cst = build_int_cst (int_ptrsize_type, mask);
3854 /* create: and_tmp = or_tmp & mask */
3855 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3857 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3858 or_tmp_name, mask_cst);
3859 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3861 /* Make and_tmp the left operand of the conditional test against zero.
3862 If and_tmp has a nonzero bit then some address is unaligned. */
3863 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3864 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3865 and_tmp_name, ptrsize_zero);
3866 chain_cond_expr (cond_expr, part_cond_expr);
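/* Editor's note: a minimal, self-contained sketch (not part of this file) of
   the scalar test the function above builds, assuming the vector size N is a
   power of two so that MASK == N - 1.  ORing the addresses first lets a
   single AND test all of them at once: the result is zero iff every address
   has its low bits clear.  The helper name all_addrs_aligned_p is purely
   illustrative.  */

static bool
all_addrs_aligned_p (const unsigned long *addrs, unsigned int n,
		     unsigned long mask)
{
  unsigned long ored = 0;
  for (unsigned int i = 0; i < n; i++)
    ored |= addrs[i];
  /* Mirrors the (or_tmp & mask) == 0 comparison built above; e.g. for
     16-byte vectors MASK is 0xf and any misaligned address leaves a
     nonzero low bit in ORED.  */
  return (ored & mask) == 0;
}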
3869 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3870 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3871 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3872 and this new condition are true. Treat a null *COND_EXPR as "true". */
3874 static void
3875 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3877 const vec<vec_object_pair> &pairs
3878 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3879 unsigned int i;
3880 vec_object_pair *pair;
3881 FOR_EACH_VEC_ELT (pairs, i, pair)
3883 tree addr1 = build_fold_addr_expr (pair->first);
3884 tree addr2 = build_fold_addr_expr (pair->second);
3885 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3886 addr1, addr2);
3887 chain_cond_expr (cond_expr, part_cond_expr);
3891 /* Create an expression that is true when all lower-bound conditions for
3892 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3894 static void
3895 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3897 const vec<vec_lower_bound> &lower_bounds
3898 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3899 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3901 tree expr = lower_bounds[i].expr;
3902 tree type = unsigned_type_for (TREE_TYPE (expr));
3903 expr = fold_convert (type, expr);
3904 poly_uint64 bound = lower_bounds[i].min_value;
3905 if (!lower_bounds[i].unsigned_p)
3907 expr = fold_build2 (PLUS_EXPR, type, expr,
3908 build_int_cstu (type, bound - 1));
3909 bound += bound - 1;
3911 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3912 build_int_cstu (type, bound));
3913 chain_cond_expr (cond_expr, part_cond_expr);
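/* Editor's note: a self-contained sketch (not part of this file) of the
   signed-value handling above, on plain integers.  Adding BOUND - 1 before
   an unsigned comparison against 2 * BOUND - 1 accepts exactly the values
   whose magnitude is at least BOUND (for bounds much smaller than the type's
   range): for BOUND == 4, -4 biases to ~0UL and passes, while -3 .. 3 bias
   to values below 7 and fail.  The helper name lower_bound_ok_p is purely
   illustrative.  */

static bool
lower_bound_ok_p (long expr, unsigned long bound)
{
  /* Mirrors the transformation above: expr = (unsigned) expr + (bound - 1);
     bound += bound - 1; then test expr >= bound.  */
  unsigned long biased = (unsigned long) expr + (bound - 1);
  return biased >= 2 * bound - 1;
}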
3917 /* Function vect_create_cond_for_alias_checks.
3919 Create a conditional expression that represents the run-time checks for
3920 overlapping of address ranges represented by a list of data references
3921 relations passed as input.
3923 Input:
3924 COND_EXPR - input conditional expression. New conditions will be chained
3925 with logical AND operation. If it is NULL, then the function
3926 is used to return the number of alias checks.
3927 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3928 to be checked.
3930 Output:
3931 COND_EXPR - conditional expression.
3933 The returned COND_EXPR is the conditional expression to be used in the if
3934 statement that controls which version of the loop gets executed at runtime.
3937 void
3938 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3940 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3941 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3943 if (comp_alias_ddrs.is_empty ())
3944 return;
3946 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3947 &comp_alias_ddrs, cond_expr);
3948 if (dump_enabled_p ())
3949 dump_printf_loc (MSG_NOTE, vect_location,
3950 "created %u versioning for alias checks.\n",
3951 comp_alias_ddrs.length ());
3955 /* Function vect_loop_versioning.
3957 If the loop has data references that may or may not be aligned and/or
3958 has data reference relations whose independence was not proven then
3959 two versions of the loop need to be generated, one which is vectorized
3960 and one which isn't. A test is then generated to control which of the
3961 loops is executed. The test checks for the alignment of all of the
3962 data references that may or may not be aligned. An additional
3963 sequence of runtime tests is generated for each pair of DDRs whose
3964 independence was not proven. The vectorized version of the loop is
3965 executed only if both alias and alignment tests are passed.
3967 The test generated to check which version of loop is executed
3968 is modified to also check for profitability as indicated by the
3969 cost model threshold TH.
3971 The versioning precondition(s) are placed in *COND_EXPR and
3972 *COND_EXPR_STMT_LIST. */
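/* Editor's note: a sketch (not part of this file) of the control flow this
   function produces, where COND stands for the chained runtime checks
   described above:

       if (COND)
	 vectorized version of the loop;
       else
	 scalar version of the loop;

   with the cost-model check possibly split off into its own block, as done
   further below.  */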
3974 class loop *
3975 vect_loop_versioning (loop_vec_info loop_vinfo,
3976 gimple *loop_vectorized_call)
3978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3979 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3980 basic_block condition_bb;
3981 gphi_iterator gsi;
3982 gimple_stmt_iterator cond_exp_gsi;
3983 basic_block merge_bb;
3984 basic_block new_exit_bb;
3985 edge new_exit_e, e;
3986 gphi *orig_phi, *new_phi;
3987 tree cond_expr = NULL_TREE;
3988 gimple_seq cond_expr_stmt_list = NULL;
3989 tree arg;
3990 profile_probability prob = profile_probability::likely ();
3991 gimple_seq gimplify_stmt_list = NULL;
3992 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3993 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3994 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3995 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3996 poly_uint64 versioning_threshold
3997 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3998 tree version_simd_if_cond
3999 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
4000 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
4002 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
4003 && !ordered_p (th, versioning_threshold))
4004 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4005 build_int_cst (TREE_TYPE (scalar_loop_iters),
4006 th - 1));
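/* Editor's note: SCALAR_LOOP_ITERS is NITERSM1, so with TH == 16 (a
   hypothetical threshold) the test built above is NITERSM1 >= 15, which
   holds exactly when the scalar iteration count is at least 16.  */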
4007 if (maybe_ne (versioning_threshold, 0U))
4009 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4010 build_int_cst (TREE_TYPE (scalar_loop_iters),
4011 versioning_threshold - 1));
4012 if (cond_expr)
4013 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
4014 expr, cond_expr);
4015 else
4016 cond_expr = expr;
4019 tree cost_name = NULL_TREE;
4020 profile_probability prob2 = profile_probability::always ();
4021 if (cond_expr
4022 && EXPR_P (cond_expr)
4023 && (version_niter
4024 || version_align
4025 || version_alias
4026 || version_simd_if_cond))
4028 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4029 &cond_expr_stmt_list,
4030 is_gimple_val, NULL_TREE);
4031 /* Split PROB into two so that the overall probability of passing
4032 both the cost-model and versioning checks is the original PROB. */
4033 prob2 = prob = prob.sqrt ();
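/* Editor's note: e.g. a combined probability of 0.81 splits into 0.9 for
   each of the two checks, since 0.9 * 0.9 == 0.81.  */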
4036 if (version_niter)
4037 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
4039 if (cond_expr)
4041 gimple_seq tem = NULL;
4042 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4043 &tem, is_gimple_condexpr_for_cond,
4044 NULL_TREE);
4045 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
4048 if (version_align)
4049 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
4050 &cond_expr_stmt_list);
4052 if (version_alias)
4054 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
4055 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
4056 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
4059 if (version_simd_if_cond)
4061 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
4062 if (flag_checking)
4063 if (basic_block bb
4064 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
4065 gcc_assert (bb != loop->header
4066 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
4067 && (scalar_loop == NULL
4068 || (bb != scalar_loop->header
4069 && dominated_by_p (CDI_DOMINATORS,
4070 scalar_loop->header, bb))));
4071 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
4072 tree c = fold_build2 (NE_EXPR, boolean_type_node,
4073 version_simd_if_cond, zero);
4074 if (cond_expr)
4075 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
4076 c, cond_expr);
4077 else
4078 cond_expr = c;
4079 if (dump_enabled_p ())
4080 dump_printf_loc (MSG_NOTE, vect_location,
4081 "created versioning for simd if condition check.\n");
4084 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4085 &gimplify_stmt_list,
4086 is_gimple_condexpr_for_cond, NULL_TREE);
4087 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
4089 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
4090 invariant in. */
4091 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
4092 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
4093 !gsi_end_p (gsi); gsi_next (&gsi))
4095 gimple *stmt = gsi_stmt (gsi);
4096 update_stmt (stmt);
4097 ssa_op_iter iter;
4098 use_operand_p use_p;
4099 basic_block def_bb;
4100 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
4101 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
4102 && flow_bb_inside_loop_p (outermost, def_bb))
4103 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
4106 /* Search for the outermost loop we can version. Avoid versioning of
4107 non-perfect nests but allow if-conversion versioned loops inside. */
4108 class loop *loop_to_version = loop;
4109 if (flow_loop_nested_p (outermost, loop))
4111 if (dump_enabled_p ())
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "trying to apply versioning to outer loop %d\n",
4114 outermost->num);
4115 if (outermost->num == 0)
4116 outermost = superloop_at_depth (loop, 1);
4117 /* And avoid applying versioning on non-perfect nests. */
4118 while (loop_to_version != outermost
4119 && (e = single_exit (loop_outer (loop_to_version)))
4120 && !(e->flags & EDGE_COMPLEX)
4121 && (!loop_outer (loop_to_version)->inner->next
4122 || vect_loop_vectorized_call (loop_to_version))
4123 && (!loop_outer (loop_to_version)->inner->next
4124 || !loop_outer (loop_to_version)->inner->next->next))
4125 loop_to_version = loop_outer (loop_to_version);
4128 /* Apply versioning. If there is already a scalar version created by
4129 if-conversion re-use that. Note we cannot re-use the copy of
4130 an if-converted outer-loop when vectorizing the inner loop only. */
4131 gcond *cond;
4132 if ((!loop_to_version->inner || loop == loop_to_version)
4133 && loop_vectorized_call)
4135 gcc_assert (scalar_loop);
4136 condition_bb = gimple_bb (loop_vectorized_call);
4137 cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
4138 gimple_cond_set_condition_from_tree (cond, cond_expr);
4139 update_stmt (cond);
4141 if (cond_expr_stmt_list)
4143 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
4144 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4145 GSI_SAME_STMT);
4148 /* if-conversion uses profile_probability::always () for both paths,
4149 reset the paths probabilities appropriately. */
4150 edge te, fe;
4151 extract_true_false_edges_from_block (condition_bb, &te, &fe);
4152 te->probability = prob;
4153 fe->probability = prob.invert ();
4154 /* We can scale loop counts immediately but have to postpone
4155 scaling the scalar loop because we re-use it during peeling.
4157 Ifcvt duplicates the loop preheader and loop body and produces a
4158 basic block after the loop exit. We need to scale all of that. */
4159 basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4160 preheader->count = preheader->count.apply_probability (prob * prob2);
4161 scale_loop_frequencies (loop_to_version, prob * prob2);
4162 /* When the loop has multiple exits then we can only version itself.
4163 This is denoted by loop_to_version == loop. In this case we can
4164 do the versioning by selecting the exit edge the vectorizer is
4165 currently using. */
4166 edge exit_edge;
4167 if (loop_to_version == loop)
4168 exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4169 else
4170 exit_edge = single_exit (loop_to_version);
4171 exit_edge->dest->count = preheader->count;
4172 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4174 nloop = scalar_loop;
4175 if (dump_enabled_p ())
4176 dump_printf_loc (MSG_NOTE, vect_location,
4177 "reusing %sloop version created by if conversion\n",
4178 loop_to_version != loop ? "outer " : "");
4180 else
4182 if (loop_to_version != loop
4183 && dump_enabled_p ())
4184 dump_printf_loc (MSG_NOTE, vect_location,
4185 "applying loop versioning to outer loop %d\n",
4186 loop_to_version->num);
4188 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4190 initialize_original_copy_tables ();
4191 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4192 prob * prob2, (prob * prob2).invert (),
4193 prob * prob2, (prob * prob2).invert (),
4194 true);
4195 /* We will later insert second conditional so overall outcome of
4196 both is prob * prob2. */
4197 edge true_e, false_e;
4198 extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4199 true_e->probability = prob;
4200 false_e->probability = prob.invert ();
4201 gcc_assert (nloop);
4202 nloop = get_loop_copy (loop);
4204 /* For cycle vectorization with SLP we rely on the PHI arguments
4205 appearing in the same order as the SLP node operands which for the
4206 loop PHI nodes means the preheader edge dest index needs to remain
4207 the same for the analyzed loop which also becomes the vectorized one.
4208 Make it so in case the state after versioning differs by redirecting
4209 the first edge into the header to the same destination which moves
4210 it last. */
4211 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4213 edge e = EDGE_PRED (loop->header, 0);
4214 ssa_redirect_edge (e, e->dest);
4215 flush_pending_stmts (e);
4217 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4219 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
4220 reap those otherwise; they also refer to the original
4221 loops. */
4222 class loop *l = loop;
4223 while (gimple *call = vect_loop_vectorized_call (l))
4225 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4226 fold_loop_internal_call (call, boolean_false_node);
4227 l = loop_outer (l);
4229 free_original_copy_tables ();
4231 if (cond_expr_stmt_list)
4233 cond_exp_gsi = gsi_last_bb (condition_bb);
4234 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4235 GSI_SAME_STMT);
4238 /* Loop versioning violates an assumption we try to maintain during
4239 vectorization - that the loop exit block has a single predecessor.
4240 After versioning, the exit block of both loop versions is the same
4241 basic block (i.e. it has two predecessors). Just in order to simplify
4242 following transformations in the vectorizer, we fix this situation
4243 here by adding a new (empty) block on the exit-edge of the loop,
4244 with the proper loop-exit phis to maintain loop-closed-form.
4245 If loop versioning wasn't done from loop, but scalar_loop instead,
4246 merge_bb will already have just a single predecessor. */
4248 /* When the loop has multiple exits then we can only version itself.
4249 This is denoted by loop_to_version == loop. In this case we can
4250 do the versioning by selecting the exit edge the vectorizer is
4251 currently using. */
4252 edge exit_edge;
4253 if (loop_to_version == loop)
4254 exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4255 else
4256 exit_edge = single_exit (loop_to_version);
4258 gcc_assert (exit_edge);
4259 merge_bb = exit_edge->dest;
4260 if (EDGE_COUNT (merge_bb->preds) >= 2)
4262 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4263 new_exit_bb = split_edge (exit_edge);
4264 new_exit_e = exit_edge;
4265 e = EDGE_SUCC (new_exit_bb, 0);
4267 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4268 gsi_next (&gsi))
4270 tree new_res;
4271 orig_phi = gsi.phi ();
4272 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4273 new_phi = create_phi_node (new_res, new_exit_bb);
4274 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4275 add_phi_arg (new_phi, arg, new_exit_e,
4276 gimple_phi_arg_location_from_edge (orig_phi, e));
4277 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4281 update_ssa (TODO_update_ssa_no_phi);
4284 /* Split the cost model check off to a separate BB. Costing assumes
4285 this is the only thing we perform when we enter the scalar loop
4286 from a failed cost decision. */
4287 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4289 gimple *def = SSA_NAME_DEF_STMT (cost_name);
4290 gcc_assert (gimple_bb (def) == condition_bb);
4291 /* All uses of the cost check are 'true' after the check we
4292 are going to insert. */
4293 replace_uses_by (cost_name, boolean_true_node);
4294 /* And we're going to build the new single use of it. */
4295 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4296 NULL_TREE, NULL_TREE);
4297 edge e = split_block (gimple_bb (def), def);
4298 gimple_stmt_iterator gsi = gsi_for_stmt (def);
4299 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4300 edge true_e, false_e;
4301 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4302 e->flags &= ~EDGE_FALLTHRU;
4303 e->flags |= EDGE_TRUE_VALUE;
4304 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4305 e->probability = prob2;
4306 e2->probability = prob2.invert ();
4307 e->dest->count = e->count ();
4308 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4309 auto_vec<basic_block, 3> adj;
4310 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4311 son;
4312 son = next_dom_son (CDI_DOMINATORS, son))
4313 if (EDGE_COUNT (son->preds) > 1)
4314 adj.safe_push (son);
4315 for (auto son : adj)
4316 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4317 //debug_bb (condition_bb);
4318 //debug_bb (e->src);
4321 if (version_niter)
4323 /* The versioned loop could be infinite, we need to clear existing
4324 niter information which is copied from the original loop. */
4325 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4326 vect_free_loop_info_assumptions (nloop);
4329 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4330 && dump_enabled_p ())
4332 if (version_alias)
4333 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4334 vect_location,
4335 "loop versioned for vectorization because of "
4336 "possible aliasing\n");
4337 if (version_align)
4338 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4339 vect_location,
4340 "loop versioned for vectorization to enhance "
4341 "alignment\n");
4345 return nloop;