/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "tree-pass.h"
#include "fold-const.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"
/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/
/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */
  SET_USE (op_p, new_name);
}
/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  struct loop *loop = bb->loop_father;
  struct loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
        rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
        {
          if (!rename_from_outer_loop)
            continue;
          if (e->src != outer_loop->header)
            {
              if (outer_loop->inner->next)
                {
                  /* If outer_loop has 2 inner loops, allow there to
                     be an extra basic block which decides which of the
                     two loops to use using LOOP_VECTORIZED.  */
                  if (!single_pred_p (e->src)
                      || single_pred (e->src) != outer_loop->header)
                    continue;
                }
            }
        }
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}
typedef struct
{
  tree from, to;
  basic_block bb;
} adjust_info;

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;
/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
        continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
           || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
          && !(bbuse == bbdef
               || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
        {
          if (new_def)
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, new_def);
          else
            {
              gimple_debug_bind_reset_value (stmt);
              update_stmt (stmt);
            }
        }
    }
}
/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}
/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
        adjust_vec.safe_push (ai);
      else
        adjust_debug_stmts_now (&ai);
    }
}
/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
                        gimple_bb (update_phi));
}
/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
                    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}
/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}
/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}
/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
                        unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}
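
/* As an illustration of the encoding above: for a vector type with
   nelts == 8 and OFFSET == 0, the three stepped pattern elements pushed
   into SEL are { 0, 8, 1, 9, 2, 10 }, which the builder extends to the
   full interleave-low selector { 0, 8, 1, 9, 2, 10, 3, 11 }.  With
   OFFSET == 1 the base becomes 4 and the selector interleaves the high
   halves of the two input vectors instead.  */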
/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
                               rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
         we need.  We can then VIEW_CONVERT any excess bits away.  */
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
                            ? VEC_UNPACK_HI_EXPR
                            : VEC_UNPACK_LO_EXPR);
          gassign *stmt;
          if (dest_masktype == unpack_masktype)
            stmt = gimple_build_assign (dest, code, src);
          else
            {
              tree temp = make_ssa_name (unpack_masktype);
              stmt = gimple_build_assign (temp, code, src);
              gimple_seq_add_stmt (seq, stmt);
              stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
                                          build1 (VIEW_CONVERT_EXPR,
                                                  dest_masktype, temp));
            }
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }

  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
         we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
        masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
                                              src, src, masks[i & 1]);
          gimple_seq_add_stmt (seq, stmt);
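
          /* For instance, if SRC_RGM has masks { M0, M1 } and DEST_RGM
             needs four masks of the same type, the statements emitted
             here are
               D0 = VEC_PERM <M0, M0, interleave-lo>
               D1 = VEC_PERM <M0, M0, interleave-hi>
               D2 = VEC_PERM <M1, M1, interleave-lo>
               D3 = VEC_PERM <M1, M1, interleave-hi>
             so each source mask bit feeds two adjacent destination bits
             without needing any extra loop invariants or IVs.  */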
        }
      return true;
    }

  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.  Each iteration
   of the vectorized loop handles VF iterations of the scalar loop.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */
static tree
vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
                              gimple_seq *preheader_seq,
                              gimple_stmt_iterator loop_cond_gsi,
                              rgroup_masks *rgm, tree vf,
                              tree niters, tree niters_skip,
                              bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = vf;
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
         multiplications don't overflow.  */
      tree factor = build_int_cst (compare_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                     nscalars_total, factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                    nscalars_step, factor);
      if (nscalars_skip)
        nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                      nscalars_skip, factor);
    }
  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree zero_index = build_int_cst (compare_type, 0);
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
             insert_after, &index_before_incr, &index_after_incr);

  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
         IV reaches a value greater than or equal to:

           NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

         However, there's no guarantee that this addition doesn't overflow
         the comparison type, or that the IV hits a value above it before
         wrapping around.  We therefore adjust the limit down by one
         IV step:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[infinite-prec] NSCALARS_STEP

         and compare the IV against this limit _before_ incrementing it.
         Since the comparison type is unsigned, we actually want the
         subtraction to saturate at zero:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[sat] NSCALARS_STEP

         And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

           NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

         where the rightmost subtraction can be done directly in
         COMPARE_TYPE.  */
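
      /* As a concrete illustration of the formula above: with
         NSCALARS_TOTAL = 60, NSCALARS_SKIP = 2 and NSCALARS_STEP = 8 the
         limit computed below is MAX (60, 6) - 6 = 54, while with
         NSCALARS_TOTAL = 3 it is MAX (3, 6) - 6 = 0, i.e. the MAX_EXPR
         makes the subtraction saturate at zero instead of wrapping.  */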
      test_index = index_before_incr;
      tree adjust = nscalars_step;
      if (nscalars_skip)
        adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                               adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
                                 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                                 test_limit, adjust);
      test_gsi = &incr_gsi;
      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
        {
          /* The first vector iteration can handle at most NSCALARS_STEP
             scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
             NSCALARS_SKIP to that cannot overflow.  */
          tree const_limit = build_int_cst (compare_type,
                                            LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                            * nscalars_per_iter);
          first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
                                      nscalars_total, const_limit);
          first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                      first_limit, nscalars_skip);
        }
      else
        /* For the first iteration it doesn't matter whether the IV hits
           a value above NSCALARS_TOTAL.  That only matters for the latch
           condition.  */
        first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
         the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
        test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }
  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
         next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gcall *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
         to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
        = (poly_int_tree_p (first_limit, &const_limit)
           && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
         TEST_LIMIT, prefer to use the same 0-based IV for each mask
         and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
        {
          this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
          this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
        }

      /* Create the initial mask.  First include all scalars that
         are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
        {
          tree start, end;
          if (first_limit == test_limit)
            {
              /* Use a natural test between zero (the initial IV value)
                 and the loop limit.  The "else" block would be valid too,
                 but this choice can avoid the need to load BIAS_TREE into
                 a register.  */
              start = zero_index;
              end = this_test_limit;
            }
          else
            {
              /* FIRST_LIMIT is the maximum number of scalars handled by the
                 first iteration of the vector loop.  Test the portion
                 associated with this mask.  */
              start = bias_tree;
              end = first_limit;
            }

          init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
          tmp_stmt = vect_gen_while (init_mask, start, end);
          gimple_seq_add_stmt (preheader_seq, tmp_stmt);
        }

      /* Now AND out the bits that are within the number of skipped
         scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
          && !(poly_int_tree_p (nscalars_skip, &const_skip)
               && known_le (const_skip, bias)))
        {
          tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
                                                    bias_tree, nscalars_skip);
          if (init_mask)
            init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
                                      init_mask, unskipped_mask);
          else
            init_mask = unskipped_mask;
        }

      if (!init_mask)
        /* First iteration is full.  */
        init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
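
      /* For instance, if the rgroup has two masks of 8 elements each and
         TEST_LIMIT is 27, the second mask (i == 1) covers scalars 8..15
         of each vector iteration, so its limit is adjusted above to
         MAX (27, 8) - 8 = 19; both masks are then driven by the same
         0-based IV via WHILE_ULT.  */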
    }
  return next_mask;
}

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
                                tree niters, tree final_iv,
                                bool niters_maybe_zero,
                                gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
         count is zero then it must have wrapped.  Cope with this by
         subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
                             niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
                             niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);
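
  /* For example: if NITERS is a 32-bit value that has wrapped to 0
     (representing 1 << 32 iterations) and COMPARE_TYPE is 64 bits wide,
     the sequence above computes (0 - 1) = 0xffffffff in the narrow type,
     widens it, and adds 1 to give 0x100000000, the true count.  */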
  /* Convert skip_niters to the right type.  */
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Now calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit;
  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
  if (known_max_iters)
    {
      if (niters_skip)
        {
          /* Add the maximum number of skipped iterations to the
             maximum iteration count.  */
          if (TREE_CODE (niters_skip) == INTEGER_CST)
            iv_limit += wi::to_widest (niters_skip);
          else
            iv_limit += max_vf - 1;
        }
      /* IV_LIMIT is the maximum number of latch iterations, which is also
         the maximum in-range IV value.  Round this value down to the previous
         vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
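
      /* E.g. with a maximum of 1001 latch iterations, VF = 4 and
         MAX_VF = 4 this gives (1001 & -4) + 4 = 1004, so the IV can be
         taken one full vector iteration past the last in-range value
         without wrapping.  */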
    }

  /* Get the vectorization factor in tree form.  */
  tree vf = build_int_cst (compare_type,
                           LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if ((nmasks & 1) == 0)
          {
            rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
            if (!half_rgm->masks.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
              continue;
          }

        /* See whether zero-based IV would ever generate all-false masks
           before wrapping around.  */
        bool might_wrap_p
          = (!known_max_iters
             || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
                                    UNSIGNED)
                 > compare_precision));

        /* Set up all masks for this group.  */
        test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
                                                  &preheader_seq,
                                                  loop_cond_gsi, rgm, vf,
                                                  niters, niters_skip,
                                                  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);
  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
                                        NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
                             LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
                                       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
                                     niters_minus_one, step);
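
  /* E.g. with NITERS = 10 and VF = 4 the vector loop runs
     (10 - 1) / 4 + 1 = 3 times, so nb_iterations (the latch count)
     computed above is (10 - 1) / 4 = 2.  */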
  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}
/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
                                  tree step, tree final_iv,
                                  bool niters_maybe_zero,
                                  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

         A:
           x = 0;
           do
             {
               ...
               x += 1;
             }
           while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

         B:
           x = 0;
           do
             {
               ...
               x += STEP;
             }
           while (x <= NITERS - STEP);

         so that the loop continues to iterate if x + STEP - 1 < NITERS
         but stops if x + STEP - 1 >= NITERS.

         However, if NITERS is zero, x never hits a value above NITERS - STEP
         before wrapping around.  There are two obvious ways of dealing with
         this:

         - start at STEP - 1 and compare x before incrementing it
         - start at -1 and compare x after incrementing it

         The latter is simpler and is what we use.  The loop in this case
         looks like:

         C:
           x = -1;
           do
             {
               ...
               x += STEP;
             }
           while (x < NITERS - STEP);

         In both cases the loop limit is NITERS - STEP.  */
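
      /* For instance: with NITERS = 12 and STEP = 4 the limit is 8;
         case B tests x = 4, 8, 12 after each increment and case C
         (starting at -1) tests x = 3, 7, 11, so both execute
         12 / 4 = 3 iterations, and case C keeps working even when
         NITERS has wrapped to zero.  */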
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
        {
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
      if (niters_maybe_zero)
        {
          /* Case C.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
          init = build_all_ones_cst (niters_type);
        }
      else
        {
          /* Case B.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
          init = build_zero_cst (niters_type);
        }
    }
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
                                    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
                                 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
                                       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
                                       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
                                             indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}
/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
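
/* For example: with NITERS = 10 and STEP = 4 a fully-masked loop executes
   (10 - 1) / 4 + 1 = 3 iterations, the last one with a partial mask,
   while the unmasked form executes (10 - 4) / 4 + 1 = 2 full iterations
   and leaves the remaining scalar iterations to be handled elsewhere
   (e.g. by an epilogue loop).  */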
void
vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
                         tree niters, tree step, tree final_iv,
                         bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
                                                final_iv, niters_maybe_zero,
                                                loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
                                                  final_iv, niters_maybe_zero,
                                                  loop_cond_gsi);

  /* Remove old loop exit test.  */
  stmt_vec_info orig_cond_info;
  if (loop_vinfo
      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
    loop_vinfo->remove_stmt (orig_cond_info);
  else
    gsi_remove (&loop_cond_gsi, true);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
                     cond_stmt);
}
/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For all PHI arguments in FROM->dest and TO->dest from those
   edges ensure that TO->dest PHI arguments have current_def
   to that in from_dest edges.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
        {
          gsi_next (&gsi_from);
          continue;
        }
      if (virtual_operand_p (to_arg))
        {
          gsi_next (&gsi_to);
          continue;
        }
      if (TREE_CODE (from_arg) != SSA_NAME)
        gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else
        {
          if (get_current_def (to_arg) == NULL_TREE)
            set_current_def (to_arg, get_current_def (from_arg));
        }
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi && to_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
                     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi,
                                                             from)));
}
/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */
struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
                                        struct loop *scalar_loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
                                          exit_dest) == loop->header ?
                 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
            &exit, 1, &new_exit, NULL,
            at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
         SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
         but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
         the LOOP SSA_NAMEs (on the exit edge and edge from latch to
         header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
                                                exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
                                                           0),
                                                EDGE_SUCC (loop->latch, 0));
    }
  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
        {
          gphi_iterator gsi;
          new_exit = redirect_edge_and_branch (new_exit, exit_dest);

          for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
               gsi_next (&gsi))
            {
              gphi *phi = gsi.phi ();
              tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
              location_t orig_locus
                = gimple_phi_arg_location_from_edge (phi, e);

              add_phi_arg (phi, orig_arg, new_exit, orig_locus);
            }
        }
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
        set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
                          single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
                               loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
        {
          /* Remove the non-necessary forwarder of scalar_loop again.  */
          redirect_edge_pred (single_succ_edge (preheader),
                              single_pred (preheader));
          delete_basic_block (preheader);
          set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
                                   loop_preheader_edge (scalar_loop)->src);
          preheader = split_edge (loop_preheader_edge (loop));
          entry_e = single_pred_edge (preheader);
        }

      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (new_preheader),
                          single_pred (new_preheader));
      delete_basic_block (new_preheader);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
                               loop_preheader_edge (new_loop)->src);
    }
  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);

  if (scalar_loop != loop)
    {
      /* Update new_loop->header PHIs, so that on the preheader
         edge they are the ones from loop rather than scalar_loop.  */
      gphi_iterator gsi_orig, gsi_new;
      edge orig_e = loop_preheader_edge (loop);
      edge new_e = loop_preheader_edge (new_loop);

      for (gsi_orig = gsi_start_phis (loop->header),
           gsi_new = gsi_start_phis (new_loop->header);
           !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
           gsi_next (&gsi_orig), gsi_next (&gsi_new))
        {
          gphi *orig_phi = gsi_orig.phi ();
          gphi *new_phi = gsi_new.phi ();
          tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
          location_t orig_locus
            = gimple_phi_arg_location_from_edge (orig_phi, orig_e);

          add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
        }
    }

  free (new_bbs);
  free (bbs);

  checking_verify_dominators (CDI_DOMINATORS);

  return new_loop;
}
/* Given the condition expression COND, put it as the last statement of
   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
   new edge as irreducible if IRREDUCIBLE_P is true.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
                       basic_block guard_to, basic_block dom_bb,
                       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
                                 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}
/* This function verifies that the following restrictions apply to LOOP:
   (1) it consists of exactly 2 basic blocks - header, and an empty latch
       for innermost loop and 5 basic blocks for outer-loop.
   (2) it is single entry, single exit
   (3) its exit condition is the last stmt in the header
   (4) E is the entry/exit edge of LOOP.  */

bool
slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
  unsigned int num_bb = loop->inner ? 5 : 2;

  /* All loops have an outer scope; the only case loop->outer is NULL is for
     the function itself.  */
  if (!loop_outer (loop)
      || loop->num_nodes != num_bb
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}
/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
   in the exit bb and rename all the uses after the loop.  This simplifies
   the *guard[12] routines, which assume loop closed SSA form for all PHIs
   (but normally loop closed SSA form doesn't require virtual PHIs to be
   in the same form).  Doing this early simplifies the checking what
   uses should be renamed.  */

static void
create_lcssa_for_virtual_phi (struct loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
        gphi *phi = gsi.phi ();
        for (gsi = gsi_start_phis (exit_e->dest);
             !gsi_end_p (gsi); gsi_next (&gsi))
          if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
            break;
        if (gsi_end_p (gsi))
          {
            tree new_vop = copy_ssa_name (PHI_RESULT (phi));
            gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
            tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
            imm_use_iterator imm_iter;
            gimple *stmt;
            use_operand_p use_p;

            SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
              = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
            add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
            gimple_phi_set_result (new_phi, new_vop);
            FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
              if (stmt != new_phi
                  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
                FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
                  SET_USE (use_p, new_vop);
          }
        break;
      }
}
/* Function vect_get_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if succeed and NULL if not.  */

dump_user_location_t
find_loop_location (struct loop *loop)
{
  gimple *stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return dump_user_location_t ();

  stmt = get_loop_exit_condition (loop);

  if (stmt
      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
    return stmt;

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location */

  if (!loop->header)
    return dump_user_location_t ();

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
        return stmt;
    }

  return dump_user_location_t ();
}
/* Return true if the phi described by STMT_INFO defines an IV of the
   loop to be vectorized.  */

static bool
iv_phi_p (stmt_vec_info stmt_info)
{
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (virtual_operand_p (PHI_RESULT (phi)))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    return false;

  return true;
}
/* Function vect_can_advance_ivs_p

   In case the number of iterations that LOOP iterates is unknown at compile
   time, an epilog loop will be generated, and the loop induction variables
   (IVs) will be "advanced" to the value they are supposed to take just before
   the epilog loop.  Here we check that the access function of the loop IVs
   and the expression that represents the loop bound are simple enough.
   These restrictions will be relaxed in the future.  */

bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      tree evolution_part;

      gphi *phi = gsi.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
                         phi);

      /* Skip virtual phi's. The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

         Skip reduction phis.  */
      if (!iv_phi_p (phi_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "reduc or virtual phi. skip.\n");
          continue;
        }

      /* Analyze the evolution function.  */

      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      if (evolution_part == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf (MSG_MISSED_OPTIMIZATION,
                         "No access function or evolution.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         which evolution functions are not invariants in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution not invariant in loop.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         which evolution functions are a polynomial of degree >= 2.  */

      if (tree_is_chrec (evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution is chrec.\n");
          return false;
        }
    }

  return true;
}
/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized. The last few iterations
     of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized). i.e, the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
     coming out from LOOP on which there are uses of the LOOP ivs
     (this is the path from LOOP->exit to epilog_loop->preheader).

     The new definitions of the ivs are placed in LOOP->exit.
     The phi args associated with the edge UPDATE_E in the bb
     UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.  */
static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
                                  tree niters, edge update_e)
{
  gphi_iterator gsi, gsi1;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block update_bb = update_e->dest;
  basic_block exit_bb = single_exit (loop)->dest;

  /* Make sure there exists a single-predecessor exit bb:  */
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_succ_edge (exit_bb) == update_e);

  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
       gsi_next (&gsi), gsi_next (&gsi1))
    {
      tree init_expr;
      tree step_expr, off;
      tree type;
      tree var, ni, ni_name;
      gimple_stmt_iterator last_gsi;

      gphi *phi = gsi.phi ();
      gphi *phi1 = gsi1.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vect_update_ivs_after_vectorizer: phi: %G", phi);

      /* Skip reduction and virtual phis.  */
      if (!iv_phi_p (phi_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "reduc or virtual phi. skip.\n");
          continue;
        }

      type = TREE_TYPE (gimple_phi_result (phi));
      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      step_expr = unshare_expr (step_expr);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
         of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (step_expr));

      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));

      off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                         fold_convert (TREE_TYPE (step_expr), niters),
                         step_expr);
      if (POINTER_TYPE_P (type))
        ni = fold_build_pointer_plus (init_expr, off);
      else
        ni = fold_build2 (PLUS_EXPR, type,
                          init_expr, fold_convert (type, off));

      var = create_tmp_var (type, "tmp");

      last_gsi = gsi_last_bb (exit_bb);
      gimple_seq new_stmts = NULL;
      ni_name = force_gimple_operand (ni, &new_stmts, false, var);

      /* Exit_bb shouldn't be empty.  */
      if (!gsi_end_p (last_gsi))
        gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
      else
        gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);

      /* Fix phi expressions in the successor bb.  */
      adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
    }
}
/* Return a gimple value containing the misalignment (measured in vector
   elements) for the loop described by LOOP_VINFO, i.e. how many elements
   it is away from a perfectly aligned address.  Add any new statements
   to SEQ.  */

static tree
get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
  gcc_assert (target_align != 0);

  bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
                                        size_zero_node) < 0;
  tree offset = (negative
                 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
                 : size_zero_node);
  tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
                                                          offset);
  tree type = unsigned_type_for (TREE_TYPE (start_addr));
  tree target_align_minus_1 = build_int_cst (type, target_align - 1);
  HOST_WIDE_INT elem_size
    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));

  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
  tree int_start_addr = fold_convert (type, start_addr);
  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
                                        target_align_minus_1);

  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
                                        elem_size_log);
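
  /* For instance: if START_ADDR is 0x100c, TARGET_ALIGN is 16 and the
     elements are 4 bytes wide, misalign_in_bytes is 0x100c & 15 = 12 and
     the returned misalign_in_elems is 12 >> 2 = 3.  */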
  return misalign_in_elems;
}
/* Function vect_gen_prolog_loop_niters

   Generate the number of iterations which should be peeled as prolog for the
   loop represented by LOOP_VINFO.  It is calculated as the misalignment of
   DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
   As a result, after the execution of this loop, the data reference DR will
   refer to an aligned location.  The following computation is generated:

   If the misalignment of DR is known at compile time:
     addr_mis = int mis = DR_MISALIGNMENT (dr);
   Else, compute address misalignment in bytes:
     addr_mis = addr & (target_align - 1)

   prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step

   (elem_size = element type size; an element is the scalar element whose type
   is the inner type of the vectype)

   The computations will be emitted at the end of BB.  We also compute and
   store upper bound (included) of the result in BOUND.

   When the step of the data-ref in the loop is not 1 (as in interleaved data
   and SLP), the number of iterations of the prolog must be divided by the step
   (which is equal to the size of interleaved group).

   The above formulas assume that VF == number of elements in the vector. This
   may not hold when there are multiple-types in the loop.
   In this case, for some data-references in the loop the VF does not represent
   the number of elements that fit in the vector.  Therefore, instead of VF we
   use TYPE_VECTOR_SUBPARTS.  */
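
/* For example: with VF = 4, 4-byte elements, step 1 and addr_mis = 8 bytes,
   addr_mis / elem_size = 2, so prolog_niters = ((4 - 2) & (4 - 1)) / 1 = 2,
   i.e. two scalar iterations are peeled before DR becomes aligned.  */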
static tree
vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
                             basic_block bb, int *bound)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  tree var;
  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
  gimple_seq stmts = NULL, new_stmts = NULL;
  tree iters, iters_name;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);

  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "known peeling = %d.\n", npeel);

      iters = build_int_cst (niters_type, npeel);
      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    }
  else
    {
      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
      tree type = TREE_TYPE (misalign_in_elems);
      HOST_WIDE_INT elem_size
        = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      HOST_WIDE_INT align_in_elems = target_align / elem_size;
      tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
      tree align_in_elems_tree = build_int_cst (type, align_in_elems);

      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
                                 & (align_in_elems - 1)).  */
      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
                                            size_zero_node) < 0;
      if (negative)
        iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
                             align_in_elems_tree);
      else
        iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
                             misalign_in_elems);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
      iters = fold_convert (niters_type, iters);
      *bound = align_in_elems - 1;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "niters for prolog loop: %T\n", iters);

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  iters_name = force_gimple_operand (iters, &new_stmts, false, var);

  if (new_stmts)
    gimple_seq_add_seq (&stmts, new_stmts);
  if (stmts)
    {
      gcc_assert (single_succ_p (bb));
      gimple_stmt_iterator gsi = gsi_last_bb (bb);
      if (gsi_end_p (gsi))
        gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
      else
        gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
    }
  return iters_name;
}
/* Function vect_update_init_of_dr

   If CODE is PLUS, the vector loop starts NITERS iterations after the
   scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
   iterations before the scalar one (using masking to skip inactive
   elements).  This function updates the information recorded in DR to
   account for the difference.  Specifically, it updates the OFFSET
   field of DR.  */

static void
vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, sizetype,
                        fold_convert (sizetype, niters),
                        fold_convert (sizetype, DR_STEP (dr)));
  offset = fold_build2 (code, sizetype,
                        fold_convert (sizetype, offset), niters);
  DR_OFFSET (dr) = offset;
}
/* Function vect_update_inits_of_drs

   Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
   CODE and NITERS are as for vect_update_inits_of_dr.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
                          tree_code code)
{
  unsigned int i;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_update_inits_of_dr");

  /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
  if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
    {
      gimple_seq seq;
      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
      tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");

      niters = fold_convert (sizetype, niters);
      niters = force_gimple_operand (niters, &seq, false, var);
      if (seq)
        {
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
    }

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
        vect_update_init_of_dr (dr, niters, code);
    }
}
/* For the information recorded in LOOP_VINFO prepare the loop for peeling
   by masking.  This involves calculating the number of iterations to
   be peeled and then aligning all memory references appropriately.  */

void
vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
{
  tree misalign_in_elems;
  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));

  /* From the information recorded in LOOP_VINFO get the number of iterations
     that need to be skipped via masking.  */
  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                             - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
      misalign_in_elems = build_int_cst (type, misalign);
    }
  else
    {
      gimple_seq seq1 = NULL, seq2 = NULL;
      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
      misalign_in_elems = fold_convert (type, misalign_in_elems);
      misalign_in_elems = force_gimple_operand (misalign_in_elems,
                                                &seq2, true, NULL_TREE);
      gimple_seq_add_seq (&seq1, seq2);
      if (seq1)
        {
          edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
          gcc_assert (!new_bb);
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "misalignment for fully-masked loop: %T\n",
                     misalign_in_elems);

  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;

  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
}
/* This function builds ni_name = number of iterations.  Statements
   are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
   it to TRUE if new ssa_var is generated.  */

tree
vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
{
  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
  if (TREE_CODE (ni) == INTEGER_CST)
    return ni;
  else
    {
      tree ni_name, var;
      gimple_seq stmts = NULL;
      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));

      var = create_tmp_var (TREE_TYPE (ni), "niters");
      ni_name = force_gimple_operand (ni, &stmts, false, var);
      if (stmts)
        {
          gsi_insert_seq_on_edge_immediate (pe, stmts);
          if (new_var_p != NULL)
            *new_var_p = true;
        }

      return ni_name;
    }
}
/* Calculate the number of iterations above which the vectorized loop will be
   preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
   of the prolog loop.  If it's an integer const, the integer number is also
   passed in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive)
   of the number of iterations of the prolog loop.  BOUND_EPILOG is the
   corresponding value for the epilog loop.  If CHECK_PROFITABILITY is true,
   TH is the threshold below which the scalar (rather than vectorized) loop
   will be executed.  This function stores the upper bound (inclusive) of the
   result in BOUND_SCALAR.  */

static tree
vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
			     int bound_prolog, poly_int64 bound_epilog, int th,
			     poly_uint64 *bound_scalar,
			     bool check_profitability)
{
  tree type = TREE_TYPE (niters_prolog);
  tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
			     build_int_cst (type, bound_epilog));

  *bound_scalar = bound_prolog + bound_epilog;
  if (check_profitability)
    {
      /* TH indicates the minimum niters of vectorized loop, while we
	 compute the maximum niters of scalar loop.  */
      th--;
      /* Peeling for constant times.  */
      if (int_niters_prolog >= 0)
	{
	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
	  return build_int_cst (type, *bound_scalar);
	}
      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
	 and BOUND_EPILOG are inclusive upper bounds.  */
      if (known_ge (th, bound_prolog + bound_epilog))
	{
	  *bound_scalar = th;
	  return build_int_cst (type, th);
	}
      /* Need to do runtime comparison.  */
      else if (maybe_gt (th, bound_epilog))
	{
	  *bound_scalar = upper_bound (*bound_scalar, th);
	  return fold_build2 (MAX_EXPR, type,
			      build_int_cst (type, th), niters);
	}
    }
  return niters;
}
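
/* Illustrative example (not part of the original sources): with constant
   prolog peeling the bound computed above is the larger of
   INT_NITERS_PROLOG + BOUND_EPILOG and the adjusted threshold, e.g. for a
   prolog of 3 iterations, BOUND_EPILOG = 7 and TH = 12 the scalar loop is
   bounded by max (3 + 7, 12 - 1) = 11 iterations; without profitability
   checking it is simply 3 + 7 = 10.  */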
/* NITERS is the number of times that the original scalar loop executes
   after peeling.  Work out the maximum number of iterations N that can
   be handled by the vectorized form of the loop and then either:

   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:

	niters_vector = N

   b) set *STEP_VECTOR_PTR to one and generate:

	niters_vector = N / vf

   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */

void
vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
			     tree *niters_vector_ptr, tree *step_vector_ptr,
			     bool niters_no_overflow)
{
  tree ni_minus_gap, var;
  tree niters_vector, step_vector, type = TREE_TYPE (niters);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  tree log_vf = NULL_TREE;

  /* If epilogue loop is required because of data accesses with gaps, we
     subtract one iteration from the total number of iterations here for
     correct calculation of RATIO.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    {
      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
				  build_one_cst (type));
      if (!is_gimple_val (ni_minus_gap))
	{
	  var = create_tmp_var (type, "ni_gap");
	  gimple *stmts = NULL;
	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
					       true, var);
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	}
    }
  else
    ni_minus_gap = niters;

  unsigned HOST_WIDE_INT const_vf;
  if (vf.is_constant (&const_vf)
      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Create: niters >> log2(vf).  */
      /* If it's known that niters == number of latch executions + 1 doesn't
	 overflow, we can generate niters >> log2(vf); otherwise we generate
	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
	 will be at least one.  */
      log_vf = build_int_cst (type, exact_log2 (const_vf));
      if (niters_no_overflow)
	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
      else
	niters_vector
	  = fold_build2 (PLUS_EXPR, type,
			 fold_build2 (RSHIFT_EXPR, type,
				      fold_build2 (MINUS_EXPR, type,
						   ni_minus_gap,
						   build_int_cst (type, vf)),
				      log_vf),
			 build_int_cst (type, 1));
      step_vector = build_one_cst (type);
    }
  else
    {
      niters_vector = ni_minus_gap;
      step_vector = build_int_cst (type, vf);
    }

  if (!is_gimple_val (niters_vector))
    {
      var = create_tmp_var (type, "bnd");
      gimple_seq stmts = NULL;
      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
      /* Peeling algorithm guarantees that vector loop bound is at least ONE,
	 we set range information to make niters analyzer's life easier.  */
      if (stmts != NULL && log_vf)
	set_range_info (niters_vector, VR_RANGE,
			wi::to_wide (build_int_cst (type, 1)),
			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
						  TYPE_MAX_VALUE (type),
						  log_vf)));
    }
  *niters_vector_ptr = niters_vector;
  *step_vector_ptr = step_vector;
}
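
/* Illustrative example (not part of the original sources): assuming a
   32-bit NITERS type and VF = 4 (log2(vf) = 2):
     - with NITERS = 10 both forms agree: 10 >> 2 = 2 and
       ((10 - 4) >> 2) + 1 = 2 full vector iterations;
     - if NITERS may wrap, NITERS = 0 encodes 2^32 iterations and only the
       second form is safe: ((0u - 4) >> 2) + 1 = 0x3fffffff + 1 = 2^30
       vector iterations, whereas 0 >> 2 would give 0.  */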
/* Given NITERS_VECTOR which is the number of iterations for the vectorized
   loop specified by LOOP_VINFO after vectorization, compute the number
   of iterations before vectorization (niters_vector * vf) and store it
   to NITERS_VECTOR_MULT_VF_PTR.  */

static void
vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
				     tree niters_vector,
				     tree *niters_vector_mult_vf_ptr)
{
  /* We should be using a step_vector of VF if VF is variable.  */
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree type = TREE_TYPE (niters_vector);
  tree log_vf = build_int_cst (type, exact_log2 (vf));
  basic_block exit_bb = single_exit (loop)->dest;

  gcc_assert (niters_vector_mult_vf_ptr != NULL);
  tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
					    niters_vector, log_vf);
  if (!is_gimple_val (niters_vector_mult_vf))
    {
      tree var = create_tmp_var (type, "niters_vector_mult_vf");
      gimple_seq stmts = NULL;
      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
						    &stmts, true, var);
      gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }
  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
}
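
/* Illustrative example (not part of the original sources): the left shift
   above is simply NITERS_VECTOR * VF, e.g. 5 vector iterations at VF = 4
   correspond to 5 << 2 = 20 scalar iterations handled by the vector
   loop.  */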
/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
   from SECOND/FIRST and puts it at the original loop's preheader/exit
   edge; the two loops are arranged as below:

     first_loop:
	 i_1 = PHI<i_0, i_2>;
	 ...

     between_bb:
	 ;; i_x = PHI<i_2>;		;; LCSSA phi node to be created for FIRST,

     second_loop:
	 i_3 = PHI<i_0, i_4>;		;; Use of i_0 to be replaced with i_x,
					;; or with i_2 if no LCSSA phi is created
					;; under condition of CREATE_LCSSA_FOR_IV_PHIS.
	 ...

   This function creates loop closed SSA for the first loop; it updates the
   second loop's PHI nodes by replacing the argument on the incoming edge with
   the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
   is false, loop closed ssa phis will only be created for non-iv phis for
   the first loop.

   This function assumes the exit bb of the first loop is the preheader bb of
   the second loop, i.e., between_bb in the example code.  With PHIs updated,
   the second loop will execute the remaining iterations of the first.  */

static void
slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
				   struct loop *first, struct loop *second,
				   bool create_lcssa_for_iv_phis)
{
  gphi_iterator gsi_update, gsi_orig;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  edge first_latch_e = EDGE_SUCC (first->latch, 0);
  edge second_preheader_e = loop_preheader_edge (second);
  basic_block between_bb = single_exit (first)->dest;

  gcc_assert (between_bb == second_preheader_e->src);
  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
  /* Either the first loop or the second is the loop to be vectorized.  */
  gcc_assert (loop == first || loop == second);

  for (gsi_orig = gsi_start_phis (first->header),
       gsi_update = gsi_start_phis (second->header);
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
      /* Generate lcssa PHI node for the first loop.  */
      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
	{
	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
	  arg = new_res;
	}

      /* Update PHI node in the second loop by replacing arg on the loop's
	 incoming edge.  */
      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
    }
}
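
/* Illustrative example (not part of the original sources): for an IV with
   latch definition i_2 in FIRST, the code above inserts
     i_x = PHI <i_2>
   in between_bb (fed from FIRST's exit edge) and rewrites SECOND's header
   PHI so that its preheader argument becomes i_x instead of the original
   initial value i_0.  */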
/* Function slpeel_add_loop_guard adds guard skipping from the beginning
   of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
   are the two pred edges of the merge point before UPDATE_LOOP.  The two
   loops are arranged as below:

     skip_loop:
	 i_1 = PHI<i_0, i_2>;
	 ...

     merge_bb:
	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.

     update_loop:
	 i_3 = PHI<i_5, i_4>;		;; Use of i_5 to be replaced with i_x.
	 ...

   This function creates PHI nodes at merge_bb and replaces the use of i_5
   in the update_loop's PHI node with the result of the new PHI node.  */

static void
slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
				    struct loop *update_loop,
				    edge guard_edge, edge merge_edge)
{
  source_location merge_loc, guard_loc;
  edge orig_e = loop_preheader_edge (skip_loop);
  edge update_e = loop_preheader_edge (update_loop);
  gphi_iterator gsi_orig, gsi_update;

  for ((gsi_orig = gsi_start_phis (skip_loop->header),
	gsi_update = gsi_start_phis (update_loop->header));
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      /* Generate new phi node at merge bb of the guard.  */
      tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
      gphi *new_phi = create_phi_node (new_res, guard_edge->dest);

      /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
	 args in NEW_PHI for these edges.  */
      tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
      tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
      merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
      guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
      add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
      add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);

      /* Update phi in UPDATE_PHI.  */
      adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
    }
}
/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP.
   This function searches for the corresponding lcssa phi node in the exit
   bb of LOOP.  If it is found, return the phi result; otherwise return
   NULL_TREE.  */

static tree
find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
		gphi *lcssa_phi)
{
  gphi_iterator gsi;
  edge e = single_exit (loop);

  gcc_assert (single_pred_p (e->dest));
  for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
			   PHI_ARG_DEF (lcssa_phi, 0), 0))
	return PHI_RESULT (phi);
    }
  return NULL_TREE;
}
/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG:

     loop:
	 i_1 = PHI<i_0, i_2>;
	 ...

     epilog_loop:
	 i_3 = PHI<i_2, i_4>;
	 ...

     merge_bb:
	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.

     exit_bb:
	 i_x = PHI<i_4>;	;; Use of i_4 to be replaced with i_y in merge_bb.

   For each name used outside EPILOG (i.e. for each name that has a lcssa
   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
   args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
   the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
   by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
   in exit_bb will also be updated.  */

static void
slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
				    edge guard_edge, edge merge_edge)
{
  gphi_iterator gsi;
  basic_block merge_bb = guard_edge->dest;

  gcc_assert (single_succ_p (merge_bb));
  edge e = single_succ_edge (merge_bb);
  basic_block exit_bb = e->dest;
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);

  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *update_phi = gsi.phi ();
      tree old_arg = PHI_ARG_DEF (update_phi, 0);
      /* This loop-closed-phi actually doesn't represent a use out of the
	 loop - the phi arg is a constant.  */
      if (TREE_CODE (old_arg) != SSA_NAME)
	continue;

      tree merge_arg = get_current_def (old_arg);
      if (!merge_arg)
	merge_arg = old_arg;

      tree guard_arg = find_guard_arg (loop, epilog, update_phi);
      /* If the var is live after loop but not a reduction, we simply
	 use the old arg.  */
      if (!guard_arg)
	guard_arg = old_arg;

      /* Create new phi node in MERGE_BB:  */
      tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
      gphi *merge_phi = create_phi_node (new_res, merge_bb);

      /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
	 the two PHI args in merge_phi for these edges.  */
      add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
      add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);

      /* Update the original phi in exit_bb.  */
      adjust_phi_and_debug_stmts (update_phi, e, new_res);
    }
}
/* EPILOG loop is duplicated from the original loop for vectorizing;
   the arg of its loop closed ssa PHI needs to be updated.  */

static void
slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
{
  gphi_iterator gsi;
  basic_block exit_bb = single_exit (epilog)->dest;

  gcc_assert (single_pred_p (exit_bb));
  edge e = EDGE_PRED (exit_bb, 0);
  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
}
/* Function vect_do_peeling.

   Input:

   - LOOP_VINFO: Represent a loop to be vectorized, which looks like:

       loop:
	 ...
	 if (exit_loop_cond) goto exit_bb
	 else		     goto loop
       exit_bb:

   - NITERS: The number of iterations of the loop.
   - NITERSM1: The number of iterations of the loop's latch.
   - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
   - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
     CHECK_PROFITABILITY is true.

   Output:

   - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
     iterate after vectorization; see vect_set_loop_condition for details.
   - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
     should be set to the number of scalar iterations handled by the
     vector loop.  The SSA name is only used on exit from the loop.

   This function peels prolog and epilog from the loop, adds guards skipping
   PROLOG and EPILOG for various conditions.  As a result, the changed CFG
   looks like:

       guard_bb_1:
	 if (prefer_scalar_loop) goto merge_bb_1
	 else			 goto guard_bb_2

       guard_bb_2:
	 if (skip_prolog) goto merge_bb_2
	 else		  goto prolog_preheader

       prolog_loop:
	 ...
	 if (exit_prolog_cond) goto prolog_exit_bb
	 else		       goto prolog_header_bb

       vector_loop:
	 ...
	 if (exit_vector_cond) goto vector_exit_bb
	 else		       goto vector_header_bb

       guard_bb_3:
	 if (skip_epilog) goto merge_bb_3
	 else		  goto epilog_preheader

       epilog_loop:
	 ...
	 if (exit_epilog_cond) goto merge_bb_3
	 else		       goto epilog_header_bb

   Note this function peels prolog and epilog only if it's necessary,
   as well as guards.
   Returns created epilogue or NULL.

   TODO: Guard for prefer_scalar_loop should be emitted along with
   versioning conditions if loop versioning is needed.  */
struct loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
		 tree *niters_vector, tree *step_vector,
		 tree *niters_vector_mult_vf_var, int th,
		 bool check_profitability, bool niters_no_overflow)
{
  edge e, guard_e;
  tree type = TREE_TYPE (niters), guard_cond;
  basic_block guard_bb, guard_to;
  profile_probability prob_prolog, prob_vector, prob_epilog;
  int estimated_vf;

  int prolog_peeling = 0;
  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  poly_uint64 bound_epilog = 0;
  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    bound_epilog += vf - 1;
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    bound_epilog += 1;
  bool epilog_peeling = maybe_ne (bound_epilog, 0U);
  poly_uint64 bound_scalar = bound_epilog;

  if (!prolog_peeling && !epilog_peeling)
    return NULL;

  prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
  estimated_vf = vect_vf_for_cost (loop_vinfo);
  if (estimated_vf == 2)
    estimated_vf = 3;
  prob_prolog = prob_epilog = profile_probability::guessed_always ()
			.apply_scale (estimated_vf - 1, estimated_vf);
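
  /* Illustrative example (not part of the original sources): with an
     estimated VF of 4, the prolog and epilog loops are each given a
     guessed entry probability of (4 - 1) / 4 = 75%, while the vector
     loop itself is assumed to be entered 9 times out of 10 (the 9/10
     scale above).  */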
  struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *first_loop = loop;
  bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
  create_lcssa_for_virtual_phi (loop);
  update_ssa (TODO_update_ssa_only_virtuals);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    {
      gcc_assert (!adjust_vec.exists ());
      adjust_vec.create (32);
    }
  initialize_original_copy_tables ();

  /* Record the anchor bb at which the guard should be placed if the scalar
     loop might be preferred.  */
  basic_block anchor = loop_preheader_edge (loop)->src;

  /* Generate the number of iterations for the prolog loop.  We do this here
     so that we can also get the upper bound on the number of iterations.  */
  tree niters_prolog;
  int bound_prolog = 0;
  if (prolog_peeling)
    niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
						 &bound_prolog);
  else
    niters_prolog = build_int_cst (type, 0);

  /* Prolog loop may be skipped.  */
  bool skip_prolog = (prolog_peeling != 0);
  /* Skip to epilog if scalar loop may be preferred.  It's only needed
     when we peel for epilog loop and when it hasn't been checked with
     loop versioning.  */
  bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
				  bound_prolog + bound_epilog)
		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
  /* Epilog loop must be executed if the number of iterations for epilog
     loop is known at compile time, otherwise we need to add a check at
     the end of vector loop and skip to the end of epilog loop.  */
  bool skip_epilog = (prolog_peeling < 0
		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      || !vf.is_constant ());
  /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    skip_epilog = false;
  if (skip_vector)
    {
      split_edge (loop_preheader_edge (loop));

      /* Due to the order in which we peel prolog and epilog, we first
	 propagate probability to the whole loop.  The purpose is to
	 avoid adjusting probabilities of both prolog and vector loops
	 separately.  Note in this case, the probability of epilog loop
	 needs to be scaled back later.  */
      basic_block bb_before_loop = loop_preheader_edge (loop)->src;
      if (prob_vector.initialized_p ())
	{
	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
	  scale_loop_profile (loop, prob_vector, 0);
	}
    }

  dump_user_location_t loop_loc = find_loop_location (loop);
  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  if (prolog_peeling)
    {
      e = loop_preheader_edge (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to preheader edge.\n");
	  gcc_unreachable ();
	}
      /* Peel prolog and put it on preheader edge of loop.  */
      prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
      if (!prolog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
      first_loop = prolog;
      reset_original_copy_tables ();

      /* Update the number of iterations for prolog loop.  */
      tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
      vect_set_loop_condition (prolog, NULL, niters_prolog,
			       step_prolog, NULL_TREE, false);

      /* Skip the prolog loop.  */
      if (skip_prolog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters_prolog, build_int_cst (type, 0));
	  guard_bb = loop_preheader_edge (prolog)->src;
	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
	  guard_to = split_edge (loop_preheader_edge (loop));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_prolog.invert (),
					   irred_flag);
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);

	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
	}

      /* Update init address of DRs.  */
      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
      /* Update niters for vector loop.  */
      LOOP_VINFO_NITERS (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
      LOOP_VINFO_NITERSM1 (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type,
		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
      bool new_var_p = false;
      niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
      /* It's guaranteed that vector loop bound before vectorization is at
	 least VF, so set range information for newly generated var.  */
      if (new_var_p)
	set_range_info (niters, VR_RANGE,
			wi::to_wide (build_int_cst (type, vf)),
			wi::to_wide (TYPE_MAX_VALUE (type)));

      /* Prolog iterates at most bound_prolog times, latch iterates at
	 most bound_prolog - 1 times.  */
      record_niter_bound (prolog, bound_prolog - 1, false, true);
      delete_update_ssa ();
      adjust_vec_debug_stmts ();
    }
  if (epilog_peeling)
    {
      e = single_exit (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to exit edge.\n");
	  gcc_unreachable ();
	}
      /* Peel epilog and put it on exit edge of loop.  */
      epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
      if (!epilog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);

      /* Scalar version loop may be preferred.  In this case, add guard
	 and skip to epilog.  Note this only happens when the number of
	 iterations of loop is unknown at compile time, otherwise this
	 won't be vectorized.  */
      if (skip_vector)
	{
	  /* Additional epilogue iteration is peeled if gap exists.  */
	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
						bound_prolog, bound_epilog,
						th, &bound_scalar,
						check_profitability);
	  /* Build guard against NITERSM1 since NITERS may overflow.  */
	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
	  guard_bb = anchor;
	  guard_to = split_edge (loop_preheader_edge (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_vector.invert (),
					   irred_flag);
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);

	  /* Simply propagate profile info from guard_bb to guard_to which is
	     a merge point of control flow.  */
	  guard_to->count = guard_bb->count;

	  /* Scale probability of epilog loop back.
	     FIXME: We should avoid scaling down and back up.  Profile may
	     get lost if we scale down to 0.  */
	  basic_block *bbs = get_loop_body (epilog);
	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
	    bbs[i]->count = bbs[i]->count.apply_scale
				 (bbs[i]->count,
				  bbs[i]->count.apply_probability
				    (prob_vector));
	  free (bbs);
	}

      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
      tree niters_vector_mult_vf;
      /* If loop is peeled for non-zero constant times, now niters refers to
	 orig_niters - prolog_peeling, it won't overflow even if the
	 orig_niters does.  */
      niters_no_overflow |= (prolog_peeling > 0);
      vect_gen_vector_loop_niters (loop_vinfo, niters,
				   niters_vector, step_vector,
				   niters_no_overflow);
      if (!integer_onep (*step_vector))
	{
	  /* On exit from the loop we will have an easy way of calculating
	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
	     here.  */
	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
	}
      else
	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
					     &niters_vector_mult_vf);
      /* Update IVs of original loop as if they were advanced by
	 niters_vector_mult_vf steps.  */
      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
      edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
					update_e);

      if (skip_epilog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters, niters_vector_mult_vf);
	  guard_bb = single_exit (loop)->dest;
	  guard_to = split_edge (single_exit (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
					   skip_vector ? anchor : guard_bb,
					   prob_epilog.invert (),
					   irred_flag);
	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
					      single_exit (epilog));
	  /* Only need to handle basic block before epilog loop if it's not
	     the guard_bb, which is the case when skip_vector is true.  */
	  if (guard_bb != bb_before_epilog)
	    {
	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();

	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
	    }
	  scale_loop_profile (epilog, prob_epilog, 0);
	}
      else
	slpeel_update_phi_nodes_for_lcssa (epilog);

      unsigned HOST_WIDE_INT bound;
      if (bound_scalar.is_constant (&bound))
	{
	  gcc_assert (bound != 0);
	  /* -1 to convert loop iterations to latch iterations.  */
	  record_niter_bound (epilog, bound - 1, false, true);
	}

      delete_update_ssa ();
      adjust_vec_debug_stmts ();
    }

  adjust_vec.release ();
  free_original_copy_tables ();

  return epilog;
}
/* Function vect_create_cond_for_niters_checks.

   Create a conditional expression that represents the run-time checks for
   the loop's niter.  The loop is guaranteed to terminate if the run-time
   checks pass.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
		to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the
   if statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
{
  tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);

  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */

static void
chain_cond_expr (tree *cond_expr, tree part_cond_expr)
{
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
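
/* Illustrative example (not part of the original sources): starting from
   *cond_expr == NULL_TREE, chaining the checks A, B and C in turn leaves
   *cond_expr as (A && B) && C; an already existing condition simply has
   the new check ANDed onto it.  */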
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment must
   be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
		LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
		LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
			 expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test can be done
	as a&(n-1) == 0.  For example, for 16 byte vectors the test is
	a&0xf == 0.  */
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
				   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  vec<stmt_vec_info> may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  stmt_vec_info stmt_info;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask + 1)) == 0));

  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      bool negative = tree_int_cst_compare
	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
      tree offset = negative
	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
					      offset);
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "addr2int%d", i);
      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
	{
	  /* create: or_tmp = or_tmp | addr_tmp */
	  sprintf (tmp_name, "orptrs%d", i);
	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
						tmp_name);
	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
					 or_tmp_name, addr_tmp_name);
	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
	  or_tmp_name = new_or_tmp_name;
	}
      else
	or_tmp_name = addr_tmp_name;
    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
				  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  chain_cond_expr (cond_expr, part_cond_expr);
}
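
/* Illustrative example (not part of the original sources): with 16-byte
   vectors the mask is 0xf.  ORing the candidate addresses first lets a
   single AND test all of them, e.g. for addresses 0x1000 and 0x1008:
   (0x1000 | 0x1008) & 0xf == 0x8 != 0, so at least one access is
   misaligned and the scalar version of the loop must be used.  */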
/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
   create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
   Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and this new condition are true.  Treat a null *COND_EXPR as "true".  */

static void
vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  unsigned int i;
  vec_object_pair *pair;
  FOR_EACH_VEC_ELT (pairs, i, pair)
    {
      tree addr1 = build_fold_addr_expr (pair->first);
      tree addr2 = build_fold_addr_expr (pair->second);
      tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
					 addr1, addr2);
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
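
/* Illustrative example (not part of the original sources): for two
   recorded pairs <A1, B1> and <A2, B2> the chained condition reads
   (&A1 != &B1) && (&A2 != &B2); the vectorized loop is entered only when
   every such pair of objects is known at run time to be distinct.  */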
/* Create an expression that is true when all lower-bound conditions for
   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */

static void
vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    {
      tree expr = lower_bounds[i].expr;
      tree type = unsigned_type_for (TREE_TYPE (expr));
      expr = fold_convert (type, expr);
      poly_uint64 bound = lower_bounds[i].min_value;
      if (!lower_bounds[i].unsigned_p)
	{
	  expr = fold_build2 (PLUS_EXPR, type, expr,
			      build_int_cstu (type, bound - 1));
	  bound += bound - 1;
	}
      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
					 build_int_cstu (type, bound));
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
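
/* Illustrative example (not part of the original sources): for signed
   quantities the test above checks |EXPR| >= BOUND with one unsigned
   comparison.  E.g. for BOUND = 4 it becomes (unsigned) EXPR + 3 >= 7:
   the values -3 .. 3 wrap to 0 .. 6 and fail, while EXPR >= 4 or
   EXPR <= -4 passes.  */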
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.  If it is NULL, then the function
		is used to return the number of alias checks.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
		to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
{
  vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);

  if (comp_alias_ddrs.is_empty ())
    return;

  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
			       &comp_alias_ddrs, cond_expr);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "created %u versioning for alias checks.\n",
		     comp_alias_ddrs.length ());
}
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and alignment tests pass.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   The versioning precondition(s) are placed in *COND_EXPR and
   *COND_EXPR_STMT_LIST.  */
void
vect_loop_versioning (loop_vec_info loop_vinfo,
		      unsigned int th, bool check_profitability,
		      poly_uint64 versioning_threshold)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);

  if (check_profitability)
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));

  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
					is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
  initialize_original_copy_tables ();
  if (scalar_loop)
    {
      edge scalar_e;
      basic_block preheader, scalar_preheader;

      /* We don't want to scale SCALAR_LOOP's frequencies, we need to
	 scale LOOP's frequencies instead.  */
      nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
			    prob, prob.invert (), prob, prob.invert (), true);
      scale_loop_frequencies (loop, prob);
      /* CONDITION_BB was created above SCALAR_LOOP's preheader,
	 while we need to move it above LOOP's preheader.  */
      e = loop_preheader_edge (loop);
      scalar_e = loop_preheader_edge (scalar_loop);
      /* The vector loop preheader might not be empty, since new
	 invariants could have been created while analyzing the loop.  */
      gcc_assert (single_pred_p (e->src));
      gcc_assert (empty_block_p (scalar_e->src)
		  && single_pred_p (scalar_e->src));
      gcc_assert (single_pred_p (condition_bb));
      preheader = e->src;
      scalar_preheader = scalar_e->src;
      scalar_e = find_edge (condition_bb, scalar_preheader);
      e = single_pred_edge (preheader);
      redirect_edge_and_branch_force (single_pred_edge (condition_bb),
				      scalar_preheader);
      redirect_edge_and_branch_force (scalar_e, preheader);
      redirect_edge_and_branch_force (e, condition_bb);
      set_immediate_dominator (CDI_DOMINATORS, condition_bb,
			       single_pred (condition_bb));
      set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
			       single_pred (scalar_preheader));
      set_immediate_dominator (CDI_DOMINATORS, preheader,
			       single_pred (preheader));
    }
  else
    nloop = loop_version (loop, cond_expr, &condition_bb,
			  prob, prob.invert (), prob, prob.invert (), true);
  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
	 niter information which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization to enhance "
			 "alignment\n");
    }
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.
     If loop versioning wasn't done from loop, but scalar_loop instead,
     merge_bb will already have just a single successor.  */

  merge_bb = single_exit (loop)->dest;
  if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
    {
      gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
      new_exit_bb = split_edge (single_exit (loop));
      new_exit_e = single_exit (loop);
      e = EDGE_SUCC (new_exit_bb, 0);

      for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  tree new_res;
	  orig_phi = gsi.phi ();
	  new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  new_phi = create_phi_node (new_res, new_exit_bb);
	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	  add_phi_arg (new_phi, arg, new_exit_e,
		       gimple_phi_arg_location_from_edge (orig_phi, e));
	  adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	}
    }

  /* End loop-exit-fixes after versioning.  */

  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
			     GSI_SAME_STMT);
    }
  update_ssa (TODO_update_ssa);
}