1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
62 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
63 tree vectype
, unsigned HOST_WIDE_INT count
)
65 machine_mode mode
, array_mode
;
68 mode
= TYPE_MODE (vectype
);
69 if (!targetm
.array_mode (mode
, count
).exists (&array_mode
))
71 poly_uint64 bits
= count
* GET_MODE_BITSIZE (mode
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 if (!int_mode_for_size (bits
, limit_p
).exists (&array_mode
))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode
), count
);
83 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
87 "cannot use %s<%s><%s>\n", name
,
88 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE
, vect_location
,
94 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
95 GET_MODE_NAME (mode
));
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
105 simd_clone_call_p (gimple
*stmt
)
107 gcall
*call
= dyn_cast
<gcall
*> (stmt
);
111 tree fndecl
= NULL_TREE
;
112 if (gimple_call_internal_p (call
, IFN_MASK_CALL
))
113 fndecl
= TREE_OPERAND (gimple_call_arg (stmt
, 0), 0);
115 fndecl
= gimple_call_fndecl (stmt
);
117 if (fndecl
== NULL_TREE
)
120 cgraph_node
*node
= cgraph_node::get (fndecl
);
121 if (node
&& node
->simd_clones
!= NULL
)
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info
, tree scalar_type
)
149 HOST_WIDE_INT lhs
, rhs
;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
156 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
158 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
161 scalar_type
= TREE_TYPE (gimple_assign_lhs (assign
));
162 if (gimple_assign_cast_p (assign
)
163 || gimple_assign_rhs_code (assign
) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign
) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign
) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign
) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign
) == FLOAT_EXPR
)
169 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
171 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
173 scalar_type
= rhs_type
;
176 else if (cgraph_node
*node
= simd_clone_call_p (stmt_info
->stmt
))
178 auto clone
= node
->simd_clones
->simdclone
;
179 for (unsigned int i
= 0; i
< clone
->nargs
; ++i
)
181 if (clone
->args
[i
].arg_type
== SIMD_CLONE_ARG_TYPE_VECTOR
)
183 tree arg_scalar_type
= TREE_TYPE (clone
->args
[i
].vector_type
);
184 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type
));
187 scalar_type
= arg_scalar_type
;
193 else if (gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
196 if (gimple_call_internal_p (call
))
198 internal_fn ifn
= gimple_call_internal_fn (call
);
199 if (internal_load_fn_p (ifn
))
200 /* For loads the LHS type does the trick. */
202 else if (internal_store_fn_p (ifn
))
204 /* For stores use the tyep of the stored value. */
205 i
= internal_fn_stored_value_index (ifn
);
206 scalar_type
= TREE_TYPE (gimple_call_arg (call
, i
));
209 else if (internal_fn_mask_index (ifn
) == 0)
212 if (i
< gimple_call_num_args (call
))
214 tree rhs_type
= TREE_TYPE (gimple_call_arg (call
, i
));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type
)))
217 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
219 scalar_type
= rhs_type
;
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
233 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
235 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
237 if ((unsigned) param_vect_max_version_for_alias_checks
== 0)
238 return opt_result::failure_at (vect_location
,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
244 = runtime_alias_check_p (ddr
, loop
,
245 optimize_loop_nest_for_speed_p (loop
));
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
256 vect_check_nonzero_value (loop_vec_info loop_vinfo
, tree value
)
258 const vec
<tree
> &checks
= LOOP_VINFO_CHECK_NONZERO (loop_vinfo
);
259 for (unsigned int i
= 0; i
< checks
.length(); ++i
)
260 if (checks
[i
] == value
)
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE
, vect_location
,
265 "need run-time check that %T is nonzero\n",
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo
).safe_push (value
);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
275 vect_preserves_scalar_order_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
)
277 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
278 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
285 /* STMT_A and STMT_B belong to overlapping groups. All loads are
286 emitted at the position of the first scalar load.
287 Stores in a group are emitted at the position of the last scalar store.
288 Compute that position and check whether the resulting order matches
290 stmt_vec_info il_a
= DR_GROUP_FIRST_ELEMENT (stmtinfo_a
);
293 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a
)))
294 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
295 s
= DR_GROUP_NEXT_ELEMENT (s
))
296 il_a
= get_later_stmt (il_a
, s
);
297 else /* DR_IS_READ */
298 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
299 s
= DR_GROUP_NEXT_ELEMENT (s
))
300 if (get_later_stmt (il_a
, s
) == il_a
)
305 stmt_vec_info il_b
= DR_GROUP_FIRST_ELEMENT (stmtinfo_b
);
308 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b
)))
309 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
310 s
= DR_GROUP_NEXT_ELEMENT (s
))
311 il_b
= get_later_stmt (il_b
, s
);
312 else /* DR_IS_READ */
313 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
314 s
= DR_GROUP_NEXT_ELEMENT (s
))
315 if (get_later_stmt (il_b
, s
) == il_b
)
320 bool a_after_b
= (get_later_stmt (stmtinfo_a
, stmtinfo_b
) == stmtinfo_a
);
321 return (get_later_stmt (il_a
, il_b
) == il_a
) == a_after_b
;
324 /* A subroutine of vect_analyze_data_ref_dependence. Handle
325 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
326 distances. These distances are conservatively correct but they don't
327 reflect a guaranteed dependence.
329 Return true if this function does all the work necessary to avoid
330 an alias or false if the caller should use the dependence distances
331 to limit the vectorization factor in the usual way. LOOP_DEPTH is
332 the depth of the loop described by LOOP_VINFO and the other arguments
333 are as for vect_analyze_data_ref_dependence. */
336 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
337 loop_vec_info loop_vinfo
,
338 int loop_depth
, unsigned int *max_vf
)
340 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
341 for (lambda_vector
&dist_v
: DDR_DIST_VECTS (ddr
))
343 int dist
= dist_v
[loop_depth
];
344 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
346 /* If the user asserted safelen >= DIST consecutive iterations
347 can be executed concurrently, assume independence.
349 ??? An alternative would be to add the alias check even
350 in this case, and vectorize the fallback loop with the
351 maximum VF set to safelen. However, if the user has
352 explicitly given a length, it's less likely that that
354 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
356 if ((unsigned int) loop
->safelen
< *max_vf
)
357 *max_vf
= loop
->safelen
;
358 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
362 /* For dependence distances of 2 or more, we have the option
363 of limiting VF or checking for an alias at runtime.
364 Prefer to check at runtime if we can, to avoid limiting
365 the VF unnecessarily when the bases are in fact independent.
367 Note that the alias checks will be removed if the VF ends up
368 being small enough. */
369 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
370 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
371 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a
->stmt
)
372 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b
->stmt
)
373 && vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
));
380 /* Function vect_analyze_data_ref_dependence.
382 FIXME: I needed to change the sense of the returned flag.
384 Return FALSE if there (might) exist a dependence between a memory-reference
385 DRA and a memory-reference DRB. When versioning for alias may check a
386 dependence at run-time, return TRUE. Adjust *MAX_VF according to
387 the data dependence. */
390 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
391 loop_vec_info loop_vinfo
,
392 unsigned int *max_vf
)
395 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
396 struct data_reference
*dra
= DDR_A (ddr
);
397 struct data_reference
*drb
= DDR_B (ddr
);
398 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (dra
);
399 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (drb
);
400 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
401 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
402 lambda_vector dist_v
;
403 unsigned int loop_depth
;
405 /* If user asserted safelen consecutive iterations can be
406 executed concurrently, assume independence. */
407 auto apply_safelen
= [&]()
409 if (loop
->safelen
>= 2)
411 if ((unsigned int) loop
->safelen
< *max_vf
)
412 *max_vf
= loop
->safelen
;
413 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
419 /* In loop analysis all data references should be vectorizable. */
420 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
421 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
424 /* Independent data accesses. */
425 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
426 return opt_result::success ();
429 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
430 return opt_result::success ();
432 /* We do not have to consider dependences between accesses that belong
433 to the same group, unless the stride could be smaller than the
435 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
436 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
437 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
438 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
439 return opt_result::success ();
441 /* Even if we have an anti-dependence then, as the vectorized loop covers at
442 least two scalar iterations, there is always also a true dependence.
443 As the vectorizer does not re-order loads and stores we can ignore
444 the anti-dependence if TBAA can disambiguate both DRs similar to the
445 case with known negative distance anti-dependences (positive
446 distance anti-dependences would violate TBAA constraints). */
447 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
448 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
449 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
450 get_alias_set (DR_REF (drb
))))
451 return opt_result::success ();
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
456 if (apply_safelen ())
457 return opt_result::success ();
459 return opt_result::failure_at
461 "possible alias involving gather/scatter between %T and %T\n",
462 DR_REF (dra
), DR_REF (drb
));
465 /* Unknown data dependence. */
466 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
468 if (apply_safelen ())
469 return opt_result::success ();
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
473 "versioning for alias required: "
474 "can't determine dependence between %T and %T\n",
475 DR_REF (dra
), DR_REF (drb
));
477 /* Add to list of ddrs that need to be tested at run-time. */
478 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
481 /* Known data dependence. */
482 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
484 if (apply_safelen ())
485 return opt_result::success ();
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
489 "versioning for alias required: "
490 "bad dist vector for %T and %T\n",
491 DR_REF (dra
), DR_REF (drb
));
492 /* Add to list of ddrs that need to be tested at run-time. */
493 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
496 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
498 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
499 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
501 return opt_result::success ();
503 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
505 int dist
= dist_v
[loop_depth
];
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE
, vect_location
,
509 "dependence distance = %d.\n", dist
);
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE
, vect_location
,
515 "dependence distance == 0 between %T and %T\n",
516 DR_REF (dra
), DR_REF (drb
));
518 /* When we perform grouped accesses and perform implicit CSE
519 by detecting equal accesses and doing disambiguation with
520 runtime alias tests like for
528 where we will end up loading { a[i], a[i+1] } once, make
529 sure that inserting group loads before the first load and
530 stores after the last store will do the right thing.
531 Similar for groups like
535 where loads from the group interleave with the store. */
536 if (!vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
537 return opt_result::failure_at (stmtinfo_a
->stmt
,
538 "READ_WRITE dependence"
539 " in interleaving.\n");
541 if (loop
->safelen
< 2)
543 tree indicator
= dr_zero_step_indicator (dra
);
544 if (!indicator
|| integer_zerop (indicator
))
545 return opt_result::failure_at (stmtinfo_a
->stmt
,
546 "access also has a zero step\n");
547 else if (TREE_CODE (indicator
) != INTEGER_CST
)
548 vect_check_nonzero_value (loop_vinfo
, indicator
);
553 if (dist
> 0 && DDR_REVERSED_P (ddr
))
555 /* If DDR_REVERSED_P the order of the data-refs in DDR was
556 reversed (to make distance vector positive), and the actual
557 distance is negative. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE
, vect_location
,
560 "dependence distance negative.\n");
561 /* When doing outer loop vectorization, we need to check if there is
562 a backward dependence at the inner loop level if the dependence
563 at the outer loop is reversed. See PR81740. */
564 if (nested_in_vect_loop_p (loop
, stmtinfo_a
)
565 || nested_in_vect_loop_p (loop
, stmtinfo_b
))
567 unsigned inner_depth
= index_in_loop_nest (loop
->inner
->num
,
568 DDR_LOOP_NEST (ddr
));
569 if (dist_v
[inner_depth
] < 0)
570 return opt_result::failure_at (stmtinfo_a
->stmt
,
571 "not vectorized, dependence "
572 "between data-refs %T and %T\n",
573 DR_REF (dra
), DR_REF (drb
));
575 /* Record a negative dependence distance to later limit the
576 amount of stmt copying / unrolling we can perform.
577 Only need to handle read-after-write dependence. */
579 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
580 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
581 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
585 unsigned int abs_dist
= abs (dist
);
586 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
588 /* The dependence distance requires reduction of the maximal
589 vectorization factor. */
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE
, vect_location
,
593 "adjusting maximal vectorization factor to %i\n",
597 if (abs_dist
>= *max_vf
)
599 /* Dependence distance does not create dependence, as far as
600 vectorization is concerned, in this case. */
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE
, vect_location
,
603 "dependence distance >= VF.\n");
607 return opt_result::failure_at (stmtinfo_a
->stmt
,
608 "not vectorized, possible dependence "
609 "between data-refs %T and %T\n",
610 DR_REF (dra
), DR_REF (drb
));
613 return opt_result::success ();
616 /* Funcion vect_analyze_early_break_dependences.
618 Examime all the data references in the loop and make sure that if we have
619 mulitple exits that we are able to safely move stores such that they become
620 safe for vectorization. The function also calculates the place where to move
621 the instructions to and computes what the new vUSE chain should be.
623 This works in tandem with the CFG that will be produced by
624 slpeel_tree_duplicate_loop_to_edge_cfg later on.
626 This function tries to validate whether an early break vectorization
627 is possible for the current instruction sequence. Returns True i
628 possible, otherwise False.
631 - Any memory access must be to a fixed size buffer.
632 - There must not be any loads and stores to the same object.
633 - Multiple loads are allowed as long as they don't alias.
636 This implemementation is very conservative. Any overlappig loads/stores
637 that take place before the early break statement gets rejected aside from
654 is which is the common case. */
657 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo
)
659 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
661 /* List of all load data references found during traversal. */
662 auto_vec
<data_reference
*> bases
;
663 basic_block dest_bb
= NULL
;
665 hash_set
<gimple
*> visited
;
666 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
667 class loop
*loop_nest
= loop_outer (loop
);
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE
, vect_location
,
671 "loop contains multiple exits, analyzing"
672 " statement dependencies.\n");
674 /* Since we don't support general control flow, the location we'll move the
675 side-effects to is always the latch connected exit. When we support
676 general control flow we can do better but for now this is fine. */
677 dest_bb
= single_pred (loop
->latch
);
678 basic_block bb
= dest_bb
;
682 /* If the destination block is also the header then we have nothing to do. */
683 if (!single_pred_p (bb
))
686 bb
= single_pred (bb
);
687 gimple_stmt_iterator gsi
= gsi_last_bb (bb
);
689 /* Now analyze all the remaining statements and try to determine which
690 instructions are allowed/needed to be moved. */
691 while (!gsi_end_p (gsi
))
693 gimple
*stmt
= gsi_stmt (gsi
);
695 if (!gimple_has_ops (stmt
)
696 || is_gimple_debug (stmt
))
699 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (stmt
);
700 auto dr_ref
= STMT_VINFO_DATA_REF (stmt_vinfo
);
704 /* We currently only support statically allocated objects due to
705 not having first-faulting loads support or peeling for
706 alignment support. Compute the size of the referenced object
707 (it could be dynamically allocated). */
708 tree obj
= DR_BASE_ADDRESS (dr_ref
);
709 if (!obj
|| TREE_CODE (obj
) != ADDR_EXPR
)
711 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
713 "early breaks only supported on statically"
714 " allocated objects.\n");
715 return opt_result::failure_at (stmt
,
716 "can't safely apply code motion to "
717 "dependencies of %G to vectorize "
718 "the early exit.\n", stmt
);
721 tree refop
= TREE_OPERAND (obj
, 0);
722 tree refbase
= get_base_address (refop
);
723 if (!refbase
|| !DECL_P (refbase
) || !DECL_SIZE (refbase
)
724 || TREE_CODE (DECL_SIZE (refbase
)) != INTEGER_CST
)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
728 "early breaks only supported on"
729 " statically allocated objects.\n");
730 return opt_result::failure_at (stmt
,
731 "can't safely apply code motion to "
732 "dependencies of %G to vectorize "
733 "the early exit.\n", stmt
);
736 /* Check if vector accesses to the object will be within bounds.
737 must be a constant or assume loop will be versioned or niters
738 bounded by VF so accesses are within range. */
739 if (!ref_within_array_bound (stmt
, DR_REF (dr_ref
)))
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
743 "early breaks not supported: vectorization "
744 "would %s beyond size of obj.",
745 DR_IS_READ (dr_ref
) ? "read" : "write");
746 return opt_result::failure_at (stmt
,
747 "can't safely apply code motion to "
748 "dependencies of %G to vectorize "
749 "the early exit.\n", stmt
);
752 if (DR_IS_READ (dr_ref
))
753 bases
.safe_push (dr_ref
);
754 else if (DR_IS_WRITE (dr_ref
))
756 /* We are moving writes down in the CFG. To be sure that this
757 is valid after vectorization we have to check all the loads
758 we are sinking the stores past to see if any of them may
759 alias or are the same object.
761 Same objects will not be an issue because unless the store
762 is marked volatile the value can be forwarded. If the
763 store is marked volatile we don't vectorize the loop
766 That leaves the check for aliasing. We don't really need
767 to care about the stores aliasing with each other since the
768 stores are moved in order so the effects are still observed
769 correctly. This leaves the check for WAR dependencies
770 which we would be introducing here if the DR can alias.
771 The check is quadratic in loads/stores but I have not found
772 a better API to do this. I believe all loads and stores
773 must be checked. We also must check them when we
774 encountered the store, since we don't care about loads past
777 for (auto dr_read
: bases
)
778 if (dr_may_alias_p (dr_ref
, dr_read
, loop_nest
))
780 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
783 "early breaks not supported: "
784 "overlapping loads and stores "
785 "found before the break "
788 return opt_result::failure_at (stmt
,
789 "can't safely apply code motion to dependencies"
790 " to vectorize the early exit. %G may alias with"
791 " %G\n", stmt
, dr_read
->stmt
);
795 if (gimple_vdef (stmt
))
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE
, vect_location
,
799 "==> recording stmt %G", stmt
);
801 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo
).safe_push (stmt
);
803 else if (gimple_vuse (stmt
))
805 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).safe_insert (0, stmt
);
806 if (dump_enabled_p ())
807 dump_printf_loc (MSG_NOTE
, vect_location
,
808 "marked statement for vUSE update: %G", stmt
);
812 while (bb
!= loop
->header
);
814 /* We don't allow outer -> inner loop transitions which should have been
815 trapped already during loop form analysis. */
816 gcc_assert (dest_bb
->loop_father
== loop
);
818 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo
) = dest_bb
;
820 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).is_empty ())
822 /* All uses shall be updated to that of the first load. Entries are
823 stored in reverse order. */
824 tree vuse
= gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).last ());
825 for (auto g
: LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
))
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE
, vect_location
,
829 "will update use: %T, mem_ref: %G", vuse
, g
);
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE
, vect_location
,
835 "recorded statements to be moved to BB %d\n",
836 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo
)->index
);
838 return opt_result::success ();
841 /* Function vect_analyze_data_ref_dependences.
843 Examine all the data references in the loop, and make sure there do not
844 exist any data dependences between them. Set *MAX_VF according to
845 the maximum vectorization factor the data dependences allow. */
848 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
849 unsigned int *max_vf
)
852 struct data_dependence_relation
*ddr
;
854 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
856 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
858 LOOP_VINFO_DDRS (loop_vinfo
)
859 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
860 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
861 /* We do not need read-read dependences. */
862 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
863 &LOOP_VINFO_DDRS (loop_vinfo
),
864 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
869 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
871 /* For epilogues we either have no aliases or alias versioning
872 was applied to original loop. Therefore we may just get max_vf
873 using VF of original loop. */
874 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
875 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
877 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
880 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
885 /* If we have early break statements in the loop, check to see if they
886 are of a form we can vectorizer. */
887 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
888 return vect_analyze_early_break_dependences (loop_vinfo
);
890 return opt_result::success ();
894 /* Function vect_slp_analyze_data_ref_dependence.
896 Return TRUE if there (might) exist a dependence between a memory-reference
897 DRA and a memory-reference DRB for VINFO. When versioning for alias
898 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
899 according to the data dependence. */
902 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
903 struct data_dependence_relation
*ddr
)
905 struct data_reference
*dra
= DDR_A (ddr
);
906 struct data_reference
*drb
= DDR_B (ddr
);
907 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
908 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
910 /* We need to check dependences of statements marked as unvectorizable
911 as well, they still can prohibit vectorization. */
913 /* Independent data accesses. */
914 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
920 /* Read-read is OK. */
921 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
924 /* If dra and drb are part of the same interleaving chain consider
926 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
927 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
928 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
931 /* Unknown data dependence. */
932 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
934 if (dump_enabled_p ())
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
936 "can't determine dependence between %T and %T\n",
937 DR_REF (dra
), DR_REF (drb
));
939 else if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE
, vect_location
,
941 "determined dependence between %T and %T\n",
942 DR_REF (dra
), DR_REF (drb
));
948 /* Analyze dependences involved in the transform of a store SLP NODE. */
951 vect_slp_analyze_store_dependences (vec_info
*vinfo
, slp_tree node
)
953 /* This walks over all stmts involved in the SLP store done
954 in NODE verifying we can sink them up to the last stmt in the
956 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
957 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info
)));
959 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
961 stmt_vec_info access_info
962 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
963 if (access_info
== last_access_info
)
965 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
967 bool ref_initialized_p
= false;
968 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
969 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
971 gimple
*stmt
= gsi_stmt (gsi
);
972 if (! gimple_vuse (stmt
))
975 /* If we couldn't record a (single) data reference for this
976 stmt we have to resort to the alias oracle. */
977 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
978 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
981 /* We are moving a store - this means
982 we cannot use TBAA for disambiguation. */
983 if (!ref_initialized_p
)
984 ao_ref_init (&ref
, DR_REF (dr_a
));
985 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
986 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
991 gcc_assert (!gimple_visited_p (stmt
));
993 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
995 bool dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
996 free_dependence_relation (ddr
);
1004 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1005 contain the vector of scalar stores of this instance if we are
1006 disambiguating the loads. */
1009 vect_slp_analyze_load_dependences (vec_info
*vinfo
, slp_tree node
,
1010 vec
<stmt_vec_info
> stores
,
1011 stmt_vec_info last_store_info
)
1013 /* This walks over all stmts involved in the SLP load done
1014 in NODE verifying we can hoist them up to the first stmt in the
1016 stmt_vec_info first_access_info
= vect_find_first_scalar_stmt_in_slp (node
);
1017 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info
)));
1019 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
1021 stmt_vec_info access_info
1022 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
1023 if (access_info
== first_access_info
)
1025 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
1027 bool ref_initialized_p
= false;
1028 hash_set
<stmt_vec_info
> grp_visited
;
1029 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
1030 gsi_stmt (gsi
) != first_access_info
->stmt
; gsi_prev (&gsi
))
1032 gimple
*stmt
= gsi_stmt (gsi
);
1033 if (! gimple_vdef (stmt
))
1036 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
1038 /* If we run into a store of this same instance (we've just
1039 marked those) then delay dependence checking until we run
1040 into the last store because this is where it will have
1041 been sunk to (and we verified that we can do that already). */
1042 if (gimple_visited_p (stmt
))
1044 if (stmt_info
!= last_store_info
)
1047 for (stmt_vec_info
&store_info
: stores
)
1049 data_reference
*store_dr
= STMT_VINFO_DATA_REF (store_info
);
1050 ddr_p ddr
= initialize_data_dependence_relation
1051 (dr_a
, store_dr
, vNULL
);
1053 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1054 free_dependence_relation (ddr
);
1061 auto check_hoist
= [&] (stmt_vec_info stmt_info
) -> bool
1063 /* We are hoisting a load - this means we can use TBAA for
1065 if (!ref_initialized_p
)
1066 ao_ref_init (&ref
, DR_REF (dr_a
));
1067 if (stmt_may_clobber_ref_p_1 (stmt_info
->stmt
, &ref
, true))
1069 /* If we couldn't record a (single) data reference for this
1070 stmt we have to give up now. */
1071 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
1074 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
1077 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1078 free_dependence_relation (ddr
);
1082 /* No dependence. */
1085 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1087 /* When we run into a store group we have to honor
1088 that earlier stores might be moved here. We don't
1089 know exactly which and where to since we lack a
1090 back-mapping from DR to SLP node, so assume all
1091 earlier stores are sunk here. It's enough to
1092 consider the last stmt of a group for this.
1093 ??? Both this and the fact that we disregard that
1094 the conflicting instance might be removed later
1095 is overly conservative. */
1096 if (!grp_visited
.add (DR_GROUP_FIRST_ELEMENT (stmt_info
)))
1097 for (auto store_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1099 store_info
= DR_GROUP_NEXT_ELEMENT (store_info
))
1100 if ((store_info
== stmt_info
1101 || get_later_stmt (store_info
, stmt_info
) == stmt_info
)
1102 && !check_hoist (store_info
))
1107 if (!check_hoist (stmt_info
))
1116 /* Function vect_analyze_data_ref_dependences.
1118 Examine all the data references in the basic-block, and make sure there
1119 do not exist any data dependences between them. Set *MAX_VF according to
1120 the maximum vectorization factor the data dependences allow. */
1123 vect_slp_analyze_instance_dependence (vec_info
*vinfo
, slp_instance instance
)
1125 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1127 /* The stores of this instance are at the root of the SLP tree. */
1128 slp_tree store
= NULL
;
1129 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
)
1130 store
= SLP_INSTANCE_TREE (instance
);
1132 /* Verify we can sink stores to the vectorized stmt insert location. */
1133 stmt_vec_info last_store_info
= NULL
;
1136 if (! vect_slp_analyze_store_dependences (vinfo
, store
))
1139 /* Mark stores in this instance and remember the last one. */
1140 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
1141 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
1142 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
1147 /* Verify we can sink loads to the vectorized stmt insert location,
1148 special-casing stores of this instance. */
1149 for (slp_tree
&load
: SLP_INSTANCE_LOADS (instance
))
1150 if (! vect_slp_analyze_load_dependences (vinfo
, load
,
1152 ? SLP_TREE_SCALAR_STMTS (store
)
1153 : vNULL
, last_store_info
))
1159 /* Unset the visited flag. */
1161 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
1162 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
1167 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1171 dr_misalignment (dr_vec_info
*dr_info
, tree vectype
, poly_int64 offset
)
1173 HOST_WIDE_INT diff
= 0;
1174 /* Alignment is only analyzed for the first element of a DR group,
1175 use that but adjust misalignment by the offset of the access. */
1176 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
))
1178 dr_vec_info
*first_dr
1179 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
));
1180 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1181 INTEGER_CSTs and the first element in the group has the lowest
1183 diff
= (TREE_INT_CST_LOW (DR_INIT (dr_info
->dr
))
1184 - TREE_INT_CST_LOW (DR_INIT (first_dr
->dr
)));
1185 gcc_assert (diff
>= 0);
1189 int misalign
= dr_info
->misalignment
;
1190 gcc_assert (misalign
!= DR_MISALIGNMENT_UNINITIALIZED
);
1191 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
1194 /* If the access is only aligned for a vector type with smaller alignment
1195 requirement the access has unknown misalignment. */
1196 if (maybe_lt (dr_info
->target_alignment
* BITS_PER_UNIT
,
1197 targetm
.vectorize
.preferred_vector_alignment (vectype
)))
1198 return DR_MISALIGNMENT_UNKNOWN
;
1200 /* Apply the offset from the DR group start and the externally supplied
1201 offset which can for example result from a negative stride access. */
1202 poly_int64 misalignment
= misalign
+ diff
+ offset
;
1204 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1205 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1206 unsigned HOST_WIDE_INT target_alignment_c
1207 = dr_info
->target_alignment
.to_constant ();
1208 if (!known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1209 return DR_MISALIGNMENT_UNKNOWN
;
1213 /* Record the base alignment guarantee given by DRB, which occurs
1217 vect_record_base_alignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
1218 innermost_loop_behavior
*drb
)
1221 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> &entry
1222 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
1223 if (!existed
|| entry
.second
->base_alignment
< drb
->base_alignment
)
1225 entry
= std::make_pair (stmt_info
, drb
);
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE
, vect_location
,
1228 "recording new base alignment for %T\n"
1230 " misalignment: %d\n"
1233 drb
->base_alignment
,
1234 drb
->base_misalignment
,
1239 /* If the region we're going to vectorize is reached, all unconditional
1240 data references occur at least once. We can therefore pool the base
1241 alignment guarantees from each unconditional reference. Do this by
1242 going through all the data references in VINFO and checking whether
1243 the containing statement makes the reference unconditionally. If so,
1244 record the alignment of the base address in VINFO so that it can be
1245 used for all other references with the same base. */
1248 vect_record_base_alignments (vec_info
*vinfo
)
1250 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1251 class loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
1252 for (data_reference
*dr
: vinfo
->shared
->datarefs
)
1254 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
1255 stmt_vec_info stmt_info
= dr_info
->stmt
;
1256 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
1257 && STMT_VINFO_VECTORIZABLE (stmt_info
)
1258 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1260 vect_record_base_alignment (vinfo
, stmt_info
, &DR_INNERMOST (dr
));
1262 /* If DR is nested in the loop that is being vectorized, we can also
1263 record the alignment of the base wrt the outer loop. */
1264 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
1265 vect_record_base_alignment
1266 (vinfo
, stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
1271 /* Function vect_compute_data_ref_alignment
1273 Compute the misalignment of the data reference DR_INFO when vectorizing
1277 1. initialized misalignment info for DR_INFO
1279 FOR NOW: No analysis is actually performed. Misalignment is calculated
1280 only for trivial cases. TODO. */
1283 vect_compute_data_ref_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1286 stmt_vec_info stmt_info
= dr_info
->stmt
;
1287 vec_base_alignments
*base_alignments
= &vinfo
->base_alignments
;
1288 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1289 class loop
*loop
= NULL
;
1290 tree ref
= DR_REF (dr_info
->dr
);
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_NOTE
, vect_location
,
1294 "vect_compute_data_ref_alignment:\n");
1297 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1299 /* Initialize misalignment to unknown. */
1300 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1302 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1305 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
1306 bool step_preserves_misalignment_p
;
1308 poly_uint64 vector_alignment
1309 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
1311 SET_DR_TARGET_ALIGNMENT (dr_info
, vector_alignment
);
1313 /* If the main loop has peeled for alignment we have no way of knowing
1314 whether the data accesses in the epilogues are aligned. We can't at
1315 compile time answer the question whether we have entered the main loop or
1316 not. Fixes PR 92351. */
1319 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1321 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
) != 0)
1325 unsigned HOST_WIDE_INT vect_align_c
;
1326 if (!vector_alignment
.is_constant (&vect_align_c
))
1329 /* No step for BB vectorization. */
1332 gcc_assert (integer_zerop (drb
->step
));
1333 step_preserves_misalignment_p
= true;
1336 /* In case the dataref is in an inner-loop of the loop that is being
1337 vectorized (LOOP), we use the base and misalignment information
1338 relative to the outer-loop (LOOP). This is ok only if the misalignment
1339 stays the same throughout the execution of the inner-loop, which is why
1340 we have to check that the stride of the dataref in the inner-loop evenly
1341 divides by the vector alignment. */
1342 else if (nested_in_vect_loop_p (loop
, stmt_info
))
1344 step_preserves_misalignment_p
1345 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
1347 if (dump_enabled_p ())
1349 if (step_preserves_misalignment_p
)
1350 dump_printf_loc (MSG_NOTE
, vect_location
,
1351 "inner step divides the vector alignment.\n");
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1354 "inner step doesn't divide the vector"
1359 /* Similarly we can only use base and misalignment information relative to
1360 an innermost loop if the misalignment stays the same throughout the
1361 execution of the loop. As above, this is the case if the stride of
1362 the dataref evenly divides by the alignment. */
1365 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1366 step_preserves_misalignment_p
1367 = multiple_p (DR_STEP_ALIGNMENT (dr_info
->dr
) * vf
, vect_align_c
);
1369 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1371 "step doesn't divide the vector alignment.\n");
1374 unsigned int base_alignment
= drb
->base_alignment
;
1375 unsigned int base_misalignment
= drb
->base_misalignment
;
1377 /* Calculate the maximum of the pooled base address alignment and the
1378 alignment that we can compute for DR itself. */
1379 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> *entry
1380 = base_alignments
->get (drb
->base_address
);
1382 && base_alignment
< (*entry
).second
->base_alignment
1384 || (dominated_by_p (CDI_DOMINATORS
, gimple_bb (stmt_info
->stmt
),
1385 gimple_bb (entry
->first
->stmt
))
1386 && (gimple_bb (stmt_info
->stmt
) != gimple_bb (entry
->first
->stmt
)
1387 || (entry
->first
->dr_aux
.group
<= dr_info
->group
)))))
1389 base_alignment
= entry
->second
->base_alignment
;
1390 base_misalignment
= entry
->second
->base_misalignment
;
1393 if (drb
->offset_alignment
< vect_align_c
1394 || !step_preserves_misalignment_p
1395 /* We need to know whether the step wrt the vectorized loop is
1396 negative when computing the starting misalignment below. */
1397 || TREE_CODE (drb
->step
) != INTEGER_CST
)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1401 "Unknown alignment for access: %T\n", ref
);
1405 if (base_alignment
< vect_align_c
)
1407 unsigned int max_alignment
;
1408 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
1409 if (max_alignment
< vect_align_c
1410 || !vect_can_force_dr_alignment_p (base
,
1411 vect_align_c
* BITS_PER_UNIT
))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE
, vect_location
,
1415 "can't force alignment of ref: %T\n", ref
);
1419 /* Force the alignment of the decl.
1420 NOTE: This is the only change to the code we make during
1421 the analysis phase, before deciding to vectorize the loop. */
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE
, vect_location
,
1424 "force alignment of %T\n", ref
);
1426 dr_info
->base_decl
= base
;
1427 dr_info
->base_misaligned
= true;
1428 base_misalignment
= 0;
1430 poly_int64 misalignment
1431 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
1433 unsigned int const_misalignment
;
1434 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1438 "Non-constant misalignment for access: %T\n", ref
);
1442 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
1444 if (dump_enabled_p ())
1445 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1446 "misalign = %d bytes of ref %T\n",
1447 const_misalignment
, ref
);
1452 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1453 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1454 is made aligned via peeling. */
1457 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info
*dr_info
,
1458 dr_vec_info
*dr_peel_info
)
1460 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info
),
1461 DR_TARGET_ALIGNMENT (dr_info
)))
1463 poly_offset_int diff
1464 = (wi::to_poly_offset (DR_INIT (dr_peel_info
->dr
))
1465 - wi::to_poly_offset (DR_INIT (dr_info
->dr
)));
1466 if (known_eq (diff
, 0)
1467 || multiple_p (diff
, DR_TARGET_ALIGNMENT (dr_info
)))
1473 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1474 aligned via peeling. */
1477 vect_dr_aligned_if_peeled_dr_is (dr_vec_info
*dr_info
,
1478 dr_vec_info
*dr_peel_info
)
1480 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info
->dr
),
1481 DR_BASE_ADDRESS (dr_peel_info
->dr
), 0)
1482 || !operand_equal_p (DR_OFFSET (dr_info
->dr
),
1483 DR_OFFSET (dr_peel_info
->dr
), 0)
1484 || !operand_equal_p (DR_STEP (dr_info
->dr
),
1485 DR_STEP (dr_peel_info
->dr
), 0))
1488 return vect_dr_aligned_if_related_peeled_dr_is (dr_info
, dr_peel_info
);
1491 /* Compute the value for dr_info->misalign so that the access appears
1492 aligned. This is used by peeling to compensate for dr_misalignment
1493 applying the offset for negative step. */
1496 vect_dr_misalign_for_aligned_access (dr_vec_info
*dr_info
)
1498 if (tree_int_cst_sgn (DR_STEP (dr_info
->dr
)) >= 0)
1501 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1502 poly_int64 misalignment
1503 = ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1504 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1506 unsigned HOST_WIDE_INT target_alignment_c
;
1508 if (!dr_info
->target_alignment
.is_constant (&target_alignment_c
)
1509 || !known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1510 return DR_MISALIGNMENT_UNKNOWN
;
1514 /* Function vect_update_misalignment_for_peel.
1515 Sets DR_INFO's misalignment
1516 - to 0 if it has the same alignment as DR_PEEL_INFO,
1517 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1518 - to -1 (unknown) otherwise.
1520 DR_INFO - the data reference whose misalignment is to be adjusted.
1521 DR_PEEL_INFO - the data reference whose misalignment is being made
1522 zero in the vector loop by the peel.
1523 NPEEL - the number of iterations in the peel loop if the misalignment
1524 of DR_PEEL_INFO is known at compile time. */
1527 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1528 dr_vec_info
*dr_peel_info
, int npeel
)
1530 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1531 if (vect_dr_aligned_if_peeled_dr_is (dr_info
, dr_peel_info
))
1533 SET_DR_MISALIGNMENT (dr_info
,
1534 vect_dr_misalign_for_aligned_access (dr_peel_info
));
1538 unsigned HOST_WIDE_INT alignment
;
1539 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1540 && known_alignment_for_access_p (dr_info
,
1541 STMT_VINFO_VECTYPE (dr_info
->stmt
))
1542 && known_alignment_for_access_p (dr_peel_info
,
1543 STMT_VINFO_VECTYPE (dr_peel_info
->stmt
)))
1545 int misal
= dr_info
->misalignment
;
1546 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1547 misal
&= alignment
- 1;
1548 set_dr_misalignment (dr_info
, misal
);
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1554 "to unknown (-1).\n");
1555 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1558 /* Return true if alignment is relevant for DR_INFO. */
1561 vect_relevant_for_alignment_p (dr_vec_info
*dr_info
)
1563 stmt_vec_info stmt_info
= dr_info
->stmt
;
1565 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1568 /* For interleaving, only the alignment of the first access matters. */
1569 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1570 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1573 /* Scatter-gather and invariant accesses continue to address individual
1574 scalars, so vector-level alignment is irrelevant. */
1575 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1576 || integer_zerop (DR_STEP (dr_info
->dr
)))
1579 /* Strided accesses perform only component accesses, alignment is
1580 irrelevant for them. */
1581 if (STMT_VINFO_STRIDED_P (stmt_info
)
1582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1588 /* Given an memory reference EXP return whether its alignment is less
1592 not_size_aligned (tree exp
)
1594 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1597 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1598 > get_object_alignment (exp
));
1601 /* Function vector_alignment_reachable_p
1603 Return true if vector alignment for DR_INFO is reachable by peeling
1604 a few loop iterations. Return false otherwise. */
1607 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
1609 stmt_vec_info stmt_info
= dr_info
->stmt
;
1610 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1614 /* For interleaved access we peel only if number of iterations in
1615 the prolog loop ({VF - misalignment}), is a multiple of the
1616 number of the interleaved accesses. */
1617 int elem_size
, mis_in_elements
;
1619 /* FORNOW: handle only known alignment. */
1620 if (!known_alignment_for_access_p (dr_info
, vectype
))
1623 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1624 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1625 elem_size
= vector_element_size (vector_size
, nelements
);
1626 mis_in_elements
= dr_misalignment (dr_info
, vectype
) / elem_size
;
1628 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1632 /* If misalignment is known at the compile time then allow peeling
1633 only if natural alignment is reachable through peeling. */
1634 if (known_alignment_for_access_p (dr_info
, vectype
)
1635 && !aligned_access_p (dr_info
, vectype
))
1637 HOST_WIDE_INT elmsize
=
1638 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1639 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_NOTE
, vect_location
,
1642 "data size = %wd. misalignment = %d.\n", elmsize
,
1643 dr_misalignment (dr_info
, vectype
));
1645 if (dr_misalignment (dr_info
, vectype
) % elmsize
)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1649 "data size does not divide the misalignment.\n");
1654 if (!known_alignment_for_access_p (dr_info
, vectype
))
1656 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1657 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1660 "Unknown misalignment, %snaturally aligned\n",
1661 is_packed
? "not " : "");
1662 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1669 /* Calculate the cost of the memory access represented by DR_INFO. */
1672 vect_get_data_access_cost (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1673 dr_alignment_support alignment_support_scheme
,
1675 unsigned int *inside_cost
,
1676 unsigned int *outside_cost
,
1677 stmt_vector_for_cost
*body_cost_vec
,
1678 stmt_vector_for_cost
*prologue_cost_vec
)
1680 stmt_vec_info stmt_info
= dr_info
->stmt
;
1681 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1684 if (PURE_SLP_STMT (stmt_info
))
1687 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1689 if (DR_IS_READ (dr_info
->dr
))
1690 vect_get_load_cost (vinfo
, stmt_info
, ncopies
, alignment_support_scheme
,
1691 misalignment
, true, inside_cost
,
1692 outside_cost
, prologue_cost_vec
, body_cost_vec
, false);
1694 vect_get_store_cost (vinfo
,stmt_info
, ncopies
, alignment_support_scheme
,
1695 misalignment
, inside_cost
, body_cost_vec
);
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_NOTE
, vect_location
,
1699 "vect_get_data_access_cost: inside_cost = %d, "
1700 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1704 typedef struct _vect_peel_info
1706 dr_vec_info
*dr_info
;
1711 typedef struct _vect_peel_extended_info
1714 struct _vect_peel_info peel_info
;
1715 unsigned int inside_cost
;
1716 unsigned int outside_cost
;
1717 } *vect_peel_extended_info
;
1720 /* Peeling hashtable helpers. */
1722 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1724 static inline hashval_t
hash (const _vect_peel_info
*);
1725 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1729 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1731 return (hashval_t
) peel_info
->npeel
;
1735 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1737 return (a
->npeel
== b
->npeel
);
1741 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1744 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1745 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1746 int npeel
, bool supportable_if_not_aligned
)
1748 struct _vect_peel_info elem
, *slot
;
1749 _vect_peel_info
**new_slot
;
1752 slot
= peeling_htab
->find (&elem
);
1757 slot
= XNEW (struct _vect_peel_info
);
1758 slot
->npeel
= npeel
;
1759 slot
->dr_info
= dr_info
;
1761 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1765 /* If this DR is not supported with unknown misalignment then bias
1766 this slot when the cost model is disabled. */
1767 if (!supportable_if_not_aligned
1768 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1769 slot
->count
+= VECT_MAX_COST
;
1773 /* Traverse peeling hash table to find peeling option that aligns maximum
1774 number of data accesses. */
1777 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1778 _vect_peel_extended_info
*max
)
1780 vect_peel_info elem
= *slot
;
1782 if (elem
->count
> max
->peel_info
.count
1783 || (elem
->count
== max
->peel_info
.count
1784 && max
->peel_info
.npeel
> elem
->npeel
))
1786 max
->peel_info
.npeel
= elem
->npeel
;
1787 max
->peel_info
.count
= elem
->count
;
1788 max
->peel_info
.dr_info
= elem
->dr_info
;
1794 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1795 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1796 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1800 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1801 dr_vec_info
*dr0_info
,
1802 unsigned int *inside_cost
,
1803 unsigned int *outside_cost
,
1804 stmt_vector_for_cost
*body_cost_vec
,
1805 stmt_vector_for_cost
*prologue_cost_vec
,
1808 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1810 bool dr0_alignment_known_p
1812 && known_alignment_for_access_p (dr0_info
,
1813 STMT_VINFO_VECTYPE (dr0_info
->stmt
)));
1815 for (data_reference
*dr
: datarefs
)
1817 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1818 if (!vect_relevant_for_alignment_p (dr_info
))
1821 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1822 dr_alignment_support alignment_support_scheme
;
1824 unsigned HOST_WIDE_INT alignment
;
1826 bool negative
= tree_int_cst_compare (DR_STEP (dr_info
->dr
),
1827 size_zero_node
) < 0;
1830 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1831 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1834 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1835 else if (dr_info
== dr0_info
1836 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
1838 else if (!dr0_alignment_known_p
1839 || !known_alignment_for_access_p (dr_info
, vectype
)
1840 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
1841 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
1844 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1845 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1846 misalignment
&= alignment
- 1;
1848 alignment_support_scheme
1849 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
1852 vect_get_data_access_cost (loop_vinfo
, dr_info
,
1853 alignment_support_scheme
, misalignment
,
1854 inside_cost
, outside_cost
,
1855 body_cost_vec
, prologue_cost_vec
);
1859 /* Traverse peeling hash table and calculate cost for each peeling option.
1860 Find the one with the lowest cost. */
1863 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1864 _vect_peel_extended_info
*min
)
1866 vect_peel_info elem
= *slot
;
1868 unsigned int inside_cost
= 0, outside_cost
= 0;
1869 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (min
->vinfo
);
1870 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1873 prologue_cost_vec
.create (2);
1874 body_cost_vec
.create (2);
1875 epilogue_cost_vec
.create (2);
1877 vect_get_peeling_costs_all_drs (loop_vinfo
, elem
->dr_info
, &inside_cost
,
1878 &outside_cost
, &body_cost_vec
,
1879 &prologue_cost_vec
, elem
->npeel
);
1881 body_cost_vec
.release ();
1883 outside_cost
+= vect_get_known_peeling_cost
1884 (loop_vinfo
, elem
->npeel
, &dummy
,
1885 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1886 &prologue_cost_vec
, &epilogue_cost_vec
);
1888 /* Prologue and epilogue costs are added to the target model later.
1889 These costs depend only on the scalar iteration cost, the
1890 number of peeling iterations finally chosen, and the number of
1891 misaligned statements. So discard the information found here. */
1892 prologue_cost_vec
.release ();
1893 epilogue_cost_vec
.release ();
1895 if (inside_cost
< min
->inside_cost
1896 || (inside_cost
== min
->inside_cost
1897 && outside_cost
< min
->outside_cost
))
1899 min
->inside_cost
= inside_cost
;
1900 min
->outside_cost
= outside_cost
;
1901 min
->peel_info
.dr_info
= elem
->dr_info
;
1902 min
->peel_info
.npeel
= elem
->npeel
;
1903 min
->peel_info
.count
= elem
->count
;
1910 /* Choose best peeling option by traversing peeling hash table and either
1911 choosing an option with the lowest cost (if cost model is enabled) or the
1912 option that aligns as many accesses as possible. */
1914 static struct _vect_peel_extended_info
1915 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1916 loop_vec_info loop_vinfo
)
1918 struct _vect_peel_extended_info res
;
1920 res
.peel_info
.dr_info
= NULL
;
1921 res
.vinfo
= loop_vinfo
;
1923 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1925 res
.inside_cost
= INT_MAX
;
1926 res
.outside_cost
= INT_MAX
;
1927 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1928 vect_peeling_hash_get_lowest_cost
> (&res
);
1932 res
.peel_info
.count
= 0;
1933 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1934 vect_peeling_hash_get_most_frequent
> (&res
);
1935 res
.inside_cost
= 0;
1936 res
.outside_cost
= 0;
1942 /* Return true if the new peeling NPEEL is supported. */
1945 vect_peeling_supportable (loop_vec_info loop_vinfo
, dr_vec_info
*dr0_info
,
1948 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1949 enum dr_alignment_support supportable_dr_alignment
;
1951 bool dr0_alignment_known_p
1952 = known_alignment_for_access_p (dr0_info
,
1953 STMT_VINFO_VECTYPE (dr0_info
->stmt
));
1955 /* Ensure that all data refs can be vectorized after the peel. */
1956 for (data_reference
*dr
: datarefs
)
1958 if (dr
== dr0_info
->dr
)
1961 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1962 if (!vect_relevant_for_alignment_p (dr_info
)
1963 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
1966 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1968 unsigned HOST_WIDE_INT alignment
;
1969 if (!dr0_alignment_known_p
1970 || !known_alignment_for_access_p (dr_info
, vectype
)
1971 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
1972 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
1975 misalignment
= dr_misalignment (dr_info
, vectype
);
1976 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1977 misalignment
&= alignment
- 1;
1979 supportable_dr_alignment
1980 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
1982 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
1989 /* Compare two data-references DRA and DRB to group them into chunks
1990 with related alignment. */
1993 dr_align_group_sort_cmp (const void *dra_
, const void *drb_
)
1995 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
1996 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
1999 /* Stabilize sort. */
2003 /* Ordering of DRs according to base. */
2004 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2005 DR_BASE_ADDRESS (drb
));
2009 /* And according to DR_OFFSET. */
2010 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2014 /* And after step. */
2015 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2019 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2020 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2022 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2026 /* Function vect_enhance_data_refs_alignment
2028 This pass will use loop versioning and loop peeling in order to enhance
2029 the alignment of data references in the loop.
2031 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2032 original loop is to be vectorized. Any other loops that are created by
2033 the transformations performed in this pass - are not supposed to be
2034 vectorized. This restriction will be relaxed.
2036 This pass will require a cost model to guide it whether to apply peeling
2037 or versioning or a combination of the two. For example, the scheme that
2038 intel uses when given a loop with several memory accesses, is as follows:
2039 choose one memory access ('p') which alignment you want to force by doing
2040 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2041 other accesses are not necessarily aligned, or (2) use loop versioning to
2042 generate one loop in which all accesses are aligned, and another loop in
2043 which only 'p' is necessarily aligned.
2045 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2046 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
2047 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2049 Devising a cost model is the most critical aspect of this work. It will
2050 guide us on which access to peel for, whether to use loop versioning, how
2051 many versions to create, etc. The cost model will probably consist of
2052 generic considerations as well as target specific considerations (on
2053 powerpc for example, misaligned stores are more painful than misaligned
2056 Here are the general steps involved in alignment enhancements:
2058 -- original loop, before alignment analysis:
2059 for (i=0; i<N; i++){
2060 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2061 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2064 -- After vect_compute_data_refs_alignment:
2065 for (i=0; i<N; i++){
2066 x = q[i]; # DR_MISALIGNMENT(q) = 3
2067 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2070 -- Possibility 1: we do loop versioning:
2072 for (i=0; i<N; i++){ # loop 1A
2073 x = q[i]; # DR_MISALIGNMENT(q) = 3
2074 p[i] = y; # DR_MISALIGNMENT(p) = 0
2078 for (i=0; i<N; i++){ # loop 1B
2079 x = q[i]; # DR_MISALIGNMENT(q) = 3
2080 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2084 -- Possibility 2: we do loop peeling:
2085 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2089 for (i = 3; i < N; i++){ # loop 2A
2090 x = q[i]; # DR_MISALIGNMENT(q) = 0
2091 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2094 -- Possibility 3: combination of loop peeling and versioning:
2095 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2100 for (i = 3; i<N; i++){ # loop 3A
2101 x = q[i]; # DR_MISALIGNMENT(q) = 0
2102 p[i] = y; # DR_MISALIGNMENT(p) = 0
2106 for (i = 3; i<N; i++){ # loop 3B
2107 x = q[i]; # DR_MISALIGNMENT(q) = 0
2108 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2112 These loops are later passed to loop_transform to be vectorized. The
2113 vectorizer will use the alignment information to guide the transformation
2114 (whether to generate regular loads/stores, or with special handling for
2118 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
2120 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2121 dr_vec_info
*first_store
= NULL
;
2122 dr_vec_info
*dr0_info
= NULL
;
2123 struct data_reference
*dr
;
2125 bool do_peeling
= false;
2126 bool do_versioning
= false;
2127 unsigned int npeel
= 0;
2128 bool one_misalignment_known
= false;
2129 bool one_misalignment_unknown
= false;
2130 bool one_dr_unsupportable
= false;
2131 dr_vec_info
*unsupportable_dr_info
= NULL
;
2132 unsigned int dr0_same_align_drs
= 0, first_store_same_align_drs
= 0;
2133 hash_table
<peel_info_hasher
> peeling_htab (1);
2135 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2137 /* Reset data so we can safely be called multiple times. */
2138 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2139 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
2141 if (LOOP_VINFO_DATAREFS (loop_vinfo
).is_empty ())
2142 return opt_result::success ();
2144 /* Sort the vector of datarefs so DRs that have the same or dependent
2145 alignment are next to each other. */
2146 auto_vec
<data_reference_p
> datarefs
2147 = LOOP_VINFO_DATAREFS (loop_vinfo
).copy ();
2148 datarefs
.qsort (dr_align_group_sort_cmp
);
2150 /* Compute the number of DRs that become aligned when we peel
2151 a dataref so it becomes aligned. */
2152 auto_vec
<unsigned> n_same_align_refs (datarefs
.length ());
2153 n_same_align_refs
.quick_grow_cleared (datarefs
.length ());
2155 for (i0
= 0; i0
< datarefs
.length (); ++i0
)
2156 if (DR_BASE_ADDRESS (datarefs
[i0
]))
2158 for (i
= i0
+ 1; i
<= datarefs
.length (); ++i
)
2160 if (i
== datarefs
.length ()
2161 || !operand_equal_p (DR_BASE_ADDRESS (datarefs
[i0
]),
2162 DR_BASE_ADDRESS (datarefs
[i
]), 0)
2163 || !operand_equal_p (DR_OFFSET (datarefs
[i0
]),
2164 DR_OFFSET (datarefs
[i
]), 0)
2165 || !operand_equal_p (DR_STEP (datarefs
[i0
]),
2166 DR_STEP (datarefs
[i
]), 0))
2168 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2169 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2170 will get known misalignment if we align one of the refs
2171 with the largest DR_TARGET_ALIGNMENT. */
2172 for (unsigned j
= i0
; j
< i
; ++j
)
2174 dr_vec_info
*dr_infoj
= loop_vinfo
->lookup_dr (datarefs
[j
]);
2175 for (unsigned k
= i0
; k
< i
; ++k
)
2179 dr_vec_info
*dr_infok
= loop_vinfo
->lookup_dr (datarefs
[k
]);
2180 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok
,
2182 n_same_align_refs
[j
]++;
2189 /* While cost model enhancements are expected in the future, the high level
2190 view of the code at this time is as follows:
2192 A) If there is a misaligned access then see if peeling to align
2193 this access can make all data references satisfy
2194 vect_supportable_dr_alignment. If so, update data structures
2195 as needed and return true.
2197 B) If peeling wasn't possible and there is a data reference with an
2198 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2199 then see if loop versioning checks can be used to make all data
2200 references satisfy vect_supportable_dr_alignment. If so, update
2201 data structures as needed and return true.
2203 C) If neither peeling nor versioning were successful then return false if
2204 any data reference does not satisfy vect_supportable_dr_alignment.
2206 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2208 Note, Possibility 3 above (which is peeling and versioning together) is not
2209 being done at this time. */
2211 /* (1) Peeling to force alignment. */
2213 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2215 + How many accesses will become aligned due to the peeling
2216 - How many accesses will become unaligned due to the peeling,
2217 and the cost of misaligned accesses.
2218 - The cost of peeling (the extra runtime checks, the increase
2221 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2223 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2224 if (!vect_relevant_for_alignment_p (dr_info
))
2227 stmt_vec_info stmt_info
= dr_info
->stmt
;
2228 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2229 do_peeling
= vector_alignment_reachable_p (dr_info
);
2232 if (known_alignment_for_access_p (dr_info
, vectype
))
2234 unsigned int npeel_tmp
= 0;
2235 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
2236 size_zero_node
) < 0;
2238 /* If known_alignment_for_access_p then we have set
2239 DR_MISALIGNMENT which is only done if we know it at compiler
2240 time, so it is safe to assume target alignment is constant.
2242 unsigned int target_align
=
2243 DR_TARGET_ALIGNMENT (dr_info
).to_constant ();
2244 unsigned HOST_WIDE_INT dr_size
= vect_get_scalar_dr_size (dr_info
);
2247 off
= (TYPE_VECTOR_SUBPARTS (vectype
) - 1) * -dr_size
;
2248 unsigned int mis
= dr_misalignment (dr_info
, vectype
, off
);
2249 mis
= negative
? mis
: -mis
;
2251 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
2253 /* For multiple types, it is possible that the bigger type access
2254 will have more than one peeling option. E.g., a loop with two
2255 types: one of size (vector size / 4), and the other one of
2256 size (vector size / 8). Vectorization factor will 8. If both
2257 accesses are misaligned by 3, the first one needs one scalar
2258 iteration to be aligned, and the second one needs 5. But the
2259 first one will be aligned also by peeling 5 scalar
2260 iterations, and in that case both accesses will be aligned.
2261 Hence, except for the immediate peeling amount, we also want
2262 to try to add full vector size, while we don't exceed
2263 vectorization factor.
2264 We do this automatically for cost model, since we calculate
2265 cost for every peeling option. */
2266 poly_uint64 nscalars
= npeel_tmp
;
2267 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
2269 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2270 nscalars
= (STMT_SLP_TYPE (stmt_info
)
2271 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
2274 /* Save info about DR in the hash table. Also include peeling
2275 amounts according to the explanation above. Indicate
2276 the alignment status when the ref is not aligned.
2277 ??? Rather than using unknown alignment here we should
2278 prune all entries from the peeling hashtable which cause
2279 DRs to be not supported. */
2280 bool supportable_if_not_aligned
2281 = vect_supportable_dr_alignment
2282 (loop_vinfo
, dr_info
, vectype
, DR_MISALIGNMENT_UNKNOWN
);
2283 while (known_le (npeel_tmp
, nscalars
))
2285 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
2287 supportable_if_not_aligned
);
2288 npeel_tmp
+= MAX (1, target_align
/ dr_size
);
2291 one_misalignment_known
= true;
2295 /* If we don't know any misalignment values, we prefer
2296 peeling for data-ref that has the maximum number of data-refs
2297 with the same alignment, unless the target prefers to align
2298 stores over load. */
2299 unsigned same_align_drs
= n_same_align_refs
[i
];
2301 || dr0_same_align_drs
< same_align_drs
)
2303 dr0_same_align_drs
= same_align_drs
;
2306 /* For data-refs with the same number of related
2307 accesses prefer the one where the misalign
2308 computation will be invariant in the outermost loop. */
2309 else if (dr0_same_align_drs
== same_align_drs
)
2311 class loop
*ivloop0
, *ivloop
;
2312 ivloop0
= outermost_invariant_loop_for_expr
2313 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
2314 ivloop
= outermost_invariant_loop_for_expr
2315 (loop
, DR_BASE_ADDRESS (dr
));
2316 if ((ivloop
&& !ivloop0
)
2317 || (ivloop
&& ivloop0
2318 && flow_loop_nested_p (ivloop
, ivloop0
)))
2322 one_misalignment_unknown
= true;
2324 /* Check for data refs with unsupportable alignment that
2326 enum dr_alignment_support supportable_dr_alignment
2327 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2328 DR_MISALIGNMENT_UNKNOWN
);
2329 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2331 one_dr_unsupportable
= true;
2332 unsupportable_dr_info
= dr_info
;
2335 if (!first_store
&& DR_IS_WRITE (dr
))
2337 first_store
= dr_info
;
2338 first_store_same_align_drs
= same_align_drs
;
2344 if (!aligned_access_p (dr_info
, vectype
))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2348 "vector alignment may not be reachable\n");
2354 /* Check if we can possibly peel the loop. */
2355 if (!vect_can_advance_ivs_p (loop_vinfo
)
2356 || !slpeel_can_duplicate_loop_p (loop
, LOOP_VINFO_IV_EXIT (loop_vinfo
),
2357 loop_preheader_edge (loop
))
2359 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
2362 struct _vect_peel_extended_info peel_for_known_alignment
;
2363 struct _vect_peel_extended_info peel_for_unknown_alignment
;
2364 struct _vect_peel_extended_info best_peel
;
2366 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
2367 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
2368 peel_for_unknown_alignment
.peel_info
.count
= 0;
2371 && one_misalignment_unknown
)
2373 /* Check if the target requires to prefer stores over loads, i.e., if
2374 misaligned stores are more expensive than misaligned loads (taking
2375 drs with same alignment into account). */
2376 unsigned int load_inside_cost
= 0;
2377 unsigned int load_outside_cost
= 0;
2378 unsigned int store_inside_cost
= 0;
2379 unsigned int store_outside_cost
= 0;
2380 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
2382 stmt_vector_for_cost dummy
;
2384 vect_get_peeling_costs_all_drs (loop_vinfo
, dr0_info
,
2387 &dummy
, &dummy
, estimated_npeels
);
2393 vect_get_peeling_costs_all_drs (loop_vinfo
, first_store
,
2395 &store_outside_cost
,
2402 store_inside_cost
= INT_MAX
;
2403 store_outside_cost
= INT_MAX
;
2406 if (load_inside_cost
> store_inside_cost
2407 || (load_inside_cost
== store_inside_cost
2408 && load_outside_cost
> store_outside_cost
))
2410 dr0_info
= first_store
;
2411 dr0_same_align_drs
= first_store_same_align_drs
;
2412 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
2413 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
2417 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
2418 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
2421 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2422 prologue_cost_vec
.create (2);
2423 epilogue_cost_vec
.create (2);
2426 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
2427 (loop_vinfo
, estimated_npeels
, &dummy2
,
2428 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2429 &prologue_cost_vec
, &epilogue_cost_vec
);
2431 prologue_cost_vec
.release ();
2432 epilogue_cost_vec
.release ();
2434 peel_for_unknown_alignment
.peel_info
.count
= dr0_same_align_drs
+ 1;
2437 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
2438 peel_for_unknown_alignment
.peel_info
.dr_info
= dr0_info
;
2440 best_peel
= peel_for_unknown_alignment
;
2442 peel_for_known_alignment
.inside_cost
= INT_MAX
;
2443 peel_for_known_alignment
.outside_cost
= INT_MAX
;
2444 peel_for_known_alignment
.peel_info
.count
= 0;
2445 peel_for_known_alignment
.peel_info
.dr_info
= NULL
;
2447 if (do_peeling
&& one_misalignment_known
)
2449 /* Peeling is possible, but there is no data access that is not supported
2450 unless aligned. So we try to choose the best possible peeling from
2452 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
2453 (&peeling_htab
, loop_vinfo
);
2456 /* Compare costs of peeling for known and unknown alignment. */
2457 if (peel_for_known_alignment
.peel_info
.dr_info
!= NULL
2458 && peel_for_unknown_alignment
.inside_cost
2459 >= peel_for_known_alignment
.inside_cost
)
2461 best_peel
= peel_for_known_alignment
;
2463 /* If the best peeling for known alignment has NPEEL == 0, perform no
2464 peeling at all except if there is an unsupportable dr that we can
2466 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
2470 /* If there is an unsupportable data ref, prefer this over all choices so far
2471 since we'd have to discard a chosen peeling except when it accidentally
2472 aligned the unsupportable data ref. */
2473 if (one_dr_unsupportable
)
2474 dr0_info
= unsupportable_dr_info
;
2475 else if (do_peeling
)
2477 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2478 TODO: Use nopeel_outside_cost or get rid of it? */
2479 unsigned nopeel_inside_cost
= 0;
2480 unsigned nopeel_outside_cost
= 0;
2482 stmt_vector_for_cost dummy
;
2484 vect_get_peeling_costs_all_drs (loop_vinfo
, NULL
, &nopeel_inside_cost
,
2485 &nopeel_outside_cost
, &dummy
, &dummy
, 0);
2488 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2489 costs will be recorded. */
2490 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2491 prologue_cost_vec
.create (2);
2492 epilogue_cost_vec
.create (2);
2495 nopeel_outside_cost
+= vect_get_known_peeling_cost
2496 (loop_vinfo
, 0, &dummy2
,
2497 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2498 &prologue_cost_vec
, &epilogue_cost_vec
);
2500 prologue_cost_vec
.release ();
2501 epilogue_cost_vec
.release ();
2503 npeel
= best_peel
.peel_info
.npeel
;
2504 dr0_info
= best_peel
.peel_info
.dr_info
;
2506 /* If no peeling is not more expensive than the best peeling we
2507 have so far, don't perform any peeling. */
2508 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
2514 stmt_vec_info stmt_info
= dr0_info
->stmt
;
2515 if (known_alignment_for_access_p (dr0_info
,
2516 STMT_VINFO_VECTYPE (stmt_info
)))
2518 bool negative
= tree_int_cst_compare (DR_STEP (dr0_info
->dr
),
2519 size_zero_node
) < 0;
2522 /* Since it's known at compile time, compute the number of
2523 iterations in the peeled loop (the peeling factor) for use in
2524 updating DR_MISALIGNMENT values. The peeling factor is the
2525 vectorization factor minus the misalignment as an element
2527 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2530 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2531 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2533 = dr_misalignment (dr0_info
, vectype
, off
);
2534 mis
= negative
? mis
: -mis
;
2535 /* If known_alignment_for_access_p then we have set
2536 DR_MISALIGNMENT which is only done if we know it at compiler
2537 time, so it is safe to assume target alignment is constant.
2539 unsigned int target_align
=
2540 DR_TARGET_ALIGNMENT (dr0_info
).to_constant ();
2541 npeel
= ((mis
& (target_align
- 1))
2542 / vect_get_scalar_dr_size (dr0_info
));
2545 /* For interleaved data access every iteration accesses all the
2546 members of the group, therefore we divide the number of iterations
2547 by the group size. */
2548 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2549 npeel
/= DR_GROUP_SIZE (stmt_info
);
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE
, vect_location
,
2553 "Try peeling by %d\n", npeel
);
2556 /* Ensure that all datarefs can be vectorized after the peel. */
2557 if (!vect_peeling_supportable (loop_vinfo
, dr0_info
, npeel
))
2560 /* Check if all datarefs are supportable and log. */
2563 && known_alignment_for_access_p (dr0_info
,
2564 STMT_VINFO_VECTYPE (stmt_info
)))
2565 return opt_result::success ();
2567 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2570 unsigned max_allowed_peel
2571 = param_vect_max_peeling_for_alignment
;
2572 if (loop_cost_model (loop
) <= VECT_COST_MODEL_CHEAP
)
2573 max_allowed_peel
= 0;
2574 if (max_allowed_peel
!= (unsigned)-1)
2576 unsigned max_peel
= npeel
;
2579 poly_uint64 target_align
= DR_TARGET_ALIGNMENT (dr0_info
);
2580 unsigned HOST_WIDE_INT target_align_c
;
2581 if (target_align
.is_constant (&target_align_c
))
2583 target_align_c
/ vect_get_scalar_dr_size (dr0_info
) - 1;
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_NOTE
, vect_location
,
2589 "Disable peeling, max peels set and vector"
2590 " alignment unknown\n");
2593 if (max_peel
> max_allowed_peel
)
2596 if (dump_enabled_p ())
2597 dump_printf_loc (MSG_NOTE
, vect_location
,
2598 "Disable peeling, max peels reached: %d\n", max_peel
);
2603 /* Cost model #2 - if peeling may result in a remaining loop not
2604 iterating enough to be vectorized then do not peel. Since this
2605 is a cost heuristic rather than a correctness decision, use the
2606 most likely runtime value for variable vectorization factors. */
2608 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2610 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2611 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2612 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2613 < assumed_vf
+ max_peel
)
2619 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2620 If the misalignment of DR_i is identical to that of dr0 then set
2621 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2622 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2623 by the peeling factor times the element size of DR_i (MOD the
2624 vectorization factor times the size). Otherwise, the
2625 misalignment of DR_i must be set to unknown. */
2626 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2627 if (dr
!= dr0_info
->dr
)
2629 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2630 if (!vect_relevant_for_alignment_p (dr_info
))
2633 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
2636 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0_info
;
2638 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2640 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = -1;
2641 SET_DR_MISALIGNMENT (dr0_info
,
2642 vect_dr_misalign_for_aligned_access (dr0_info
));
2643 if (dump_enabled_p ())
2645 dump_printf_loc (MSG_NOTE
, vect_location
,
2646 "Alignment of access forced using peeling.\n");
2647 dump_printf_loc (MSG_NOTE
, vect_location
,
2648 "Peeling for alignment will be applied.\n");
2651 /* The inside-loop cost will be accounted for in vectorizable_load
2652 and vectorizable_store correctly with adjusted alignments.
2653 Drop the body_cst_vec on the floor here. */
2654 return opt_result::success ();
2658 /* (2) Versioning to force alignment. */
2660 /* Try versioning if:
2661 1) optimize loop for speed and the cost-model is not cheap
2662 2) there is at least one unsupported misaligned data ref with an unknown
2664 3) all misaligned data refs with a known misalignment are supported, and
2665 4) the number of runtime alignment checks is within reason. */
2668 = (optimize_loop_nest_for_speed_p (loop
)
2669 && !loop
->inner
/* FORNOW */
2670 && loop_cost_model (loop
) > VECT_COST_MODEL_CHEAP
);
2674 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2676 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2677 if (!vect_relevant_for_alignment_p (dr_info
))
2680 stmt_vec_info stmt_info
= dr_info
->stmt
;
2681 if (STMT_VINFO_STRIDED_P (stmt_info
))
2683 do_versioning
= false;
2687 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2688 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
2689 size_zero_node
) < 0;
2692 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2693 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2695 if ((misalignment
= dr_misalignment (dr_info
, vectype
, off
)) == 0)
2698 enum dr_alignment_support supportable_dr_alignment
2699 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2701 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2703 if (misalignment
!= DR_MISALIGNMENT_UNKNOWN
2704 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2705 >= (unsigned) param_vect_max_version_for_alignment_checks
))
2707 do_versioning
= false;
2711 /* At present we don't support versioning for alignment
2712 with variable VF, since there's no guarantee that the
2713 VF is a power of two. We could relax this if we added
2714 a way of enforcing a power-of-two size. */
2715 unsigned HOST_WIDE_INT size
;
2716 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2718 do_versioning
= false;
2722 /* Forcing alignment in the first iteration is no good if
2723 we don't keep it across iterations. For now, just disable
2724 versioning in this case.
2725 ?? We could actually unroll the loop to achieve the required
2726 overall step alignment, and forcing the alignment could be
2727 done by doing some iterations of the non-vectorized loop. */
2728 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
2729 * DR_STEP_ALIGNMENT (dr
),
2730 DR_TARGET_ALIGNMENT (dr_info
)))
2732 do_versioning
= false;
2736 /* The rightmost bits of an aligned address must be zeros.
2737 Construct the mask needed for this test. For example,
2738 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2739 mask must be 15 = 0xf. */
2740 int mask
= size
- 1;
2742 /* FORNOW: use the same mask to test all potentially unaligned
2743 references in the loop. */
2744 if (LOOP_VINFO_PTR_MASK (loop_vinfo
)
2745 && LOOP_VINFO_PTR_MASK (loop_vinfo
) != mask
)
2747 do_versioning
= false;
2751 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2752 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (stmt_info
);
2756 /* Versioning requires at least one misaligned data reference. */
2757 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2758 do_versioning
= false;
2759 else if (!do_versioning
)
2760 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2765 const vec
<stmt_vec_info
> &may_misalign_stmts
2766 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2767 stmt_vec_info stmt_info
;
2769 /* It can now be assumed that the data references in the statements
2770 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2771 of the loop being vectorized. */
2772 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt_info
)
2774 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
2775 SET_DR_MISALIGNMENT (dr_info
,
2776 vect_dr_misalign_for_aligned_access (dr_info
));
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_NOTE
, vect_location
,
2779 "Alignment of access forced using versioning.\n");
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_NOTE
, vect_location
,
2784 "Versioning for alignment will be applied.\n");
2786 /* Peeling and versioning can't be done together at this time. */
2787 gcc_assert (! (do_peeling
&& do_versioning
));
2789 return opt_result::success ();
2792 /* This point is reached if neither peeling nor versioning is being done. */
2793 gcc_assert (! (do_peeling
|| do_versioning
));
2795 return opt_result::success ();
2799 /* Function vect_analyze_data_refs_alignment
2801 Analyze the alignment of the data-references in the loop.
2802 Return FALSE if a data reference is found that cannot be vectorized. */
2805 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
)
2807 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2809 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2810 struct data_reference
*dr
;
2813 vect_record_base_alignments (loop_vinfo
);
2814 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2816 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2817 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
))
2819 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
)
2820 && DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
) != dr_info
->stmt
)
2822 vect_compute_data_ref_alignment (loop_vinfo
, dr_info
,
2823 STMT_VINFO_VECTYPE (dr_info
->stmt
));
2827 return opt_result::success ();
2831 /* Analyze alignment of DRs of stmts in NODE. */
2834 vect_slp_analyze_node_alignment (vec_info
*vinfo
, slp_tree node
)
2836 /* Alignment is maintained in the first element of the group. */
2837 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2838 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
2839 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2840 tree vectype
= SLP_TREE_VECTYPE (node
);
2841 poly_uint64 vector_alignment
2842 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
2844 if (dr_info
->misalignment
== DR_MISALIGNMENT_UNINITIALIZED
)
2845 vect_compute_data_ref_alignment (vinfo
, dr_info
, SLP_TREE_VECTYPE (node
));
2846 /* Re-analyze alignment when we're facing a vectorization with a bigger
2847 alignment requirement. */
2848 else if (known_lt (dr_info
->target_alignment
, vector_alignment
))
2850 poly_uint64 old_target_alignment
= dr_info
->target_alignment
;
2851 int old_misalignment
= dr_info
->misalignment
;
2852 vect_compute_data_ref_alignment (vinfo
, dr_info
, SLP_TREE_VECTYPE (node
));
2853 /* But keep knowledge about a smaller alignment. */
2854 if (old_misalignment
!= DR_MISALIGNMENT_UNKNOWN
2855 && dr_info
->misalignment
== DR_MISALIGNMENT_UNKNOWN
)
2857 dr_info
->target_alignment
= old_target_alignment
;
2858 dr_info
->misalignment
= old_misalignment
;
2861 /* When we ever face unordered target alignments the first one wins in terms
2862 of analyzing and the other will become unknown in dr_misalignment. */
2866 /* Function vect_slp_analyze_instance_alignment
2868 Analyze the alignment of the data-references in the SLP instance.
2869 Return FALSE if a data reference is found that cannot be vectorized. */
2872 vect_slp_analyze_instance_alignment (vec_info
*vinfo
,
2873 slp_instance instance
)
2875 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2879 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2880 if (! vect_slp_analyze_node_alignment (vinfo
, node
))
2883 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2884 && ! vect_slp_analyze_node_alignment
2885 (vinfo
, SLP_INSTANCE_TREE (instance
)))
2892 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2893 accesses of legal size, step, etc. Detect gaps, single element
2894 interleaving, and other special cases. Set grouped access info.
2895 Collect groups of strided stores for further use in SLP analysis.
2896 Worker for vect_analyze_group_access. */
2899 vect_analyze_group_access_1 (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2901 data_reference
*dr
= dr_info
->dr
;
2902 tree step
= DR_STEP (dr
);
2903 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2904 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2905 stmt_vec_info stmt_info
= dr_info
->stmt
;
2906 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2907 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
2908 HOST_WIDE_INT dr_step
= -1;
2909 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2910 bool slp_impossible
= false;
2912 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2913 size of the interleaving group (including gaps). */
2914 if (tree_fits_shwi_p (step
))
2916 dr_step
= tree_to_shwi (step
);
2917 /* Check that STEP is a multiple of type size. Otherwise there is
2918 a non-element-sized gap at the end of the group which we
2919 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2920 ??? As we can handle non-constant step fine here we should
2921 simply remove uses of DR_GROUP_GAP between the last and first
2922 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2923 simply not include that gap. */
2924 if ((dr_step
% type_size
) != 0)
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE
, vect_location
,
2928 "Step %T is not a multiple of the element size"
2933 groupsize
= absu_hwi (dr_step
) / type_size
;
2938 /* Not consecutive access is possible only if it is a part of interleaving. */
2939 if (!DR_GROUP_FIRST_ELEMENT (stmt_info
))
2941 /* Check if it this DR is a part of interleaving, and is a single
2942 element of the group that is accessed in the loop. */
2944 /* Gaps are supported only for loads. STEP must be a multiple of the type
2947 && (dr_step
% type_size
) == 0
2949 /* This could be UINT_MAX but as we are generating code in a very
2950 inefficient way we have to cap earlier.
2951 See PR91403 for example. */
2952 && groupsize
<= 4096)
2954 DR_GROUP_FIRST_ELEMENT (stmt_info
) = stmt_info
;
2955 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2956 DR_GROUP_GAP (stmt_info
) = groupsize
- 1;
2957 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_NOTE
, vect_location
,
2959 "Detected single element interleaving %T"
2966 if (dump_enabled_p ())
2967 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2968 "not consecutive access %G", stmt_info
->stmt
);
2972 /* Mark the statement as unvectorizable. */
2973 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
2977 if (dump_enabled_p ())
2978 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2979 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2983 if (DR_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
)
2985 /* First stmt in the interleaving chain. Check the chain. */
2986 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2987 struct data_reference
*data_ref
= dr
;
2988 unsigned int count
= 1;
2989 tree prev_init
= DR_INIT (data_ref
);
2990 HOST_WIDE_INT diff
, gaps
= 0;
2992 /* By construction, all group members have INTEGER_CST DR_INITs. */
2995 /* We never have the same DR multiple times. */
2996 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref
),
2997 DR_INIT (STMT_VINFO_DATA_REF (next
))) != 0);
2999 data_ref
= STMT_VINFO_DATA_REF (next
);
3001 /* All group members have the same STEP by construction. */
3002 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
3004 /* Check that the distance between two accesses is equal to the type
3005 size. Otherwise, we have gaps. */
3006 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
3007 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
3008 if (diff
< 1 || diff
> UINT_MAX
)
3010 /* For artificial testcases with array accesses with large
3011 constant indices we can run into overflow issues which
3012 can end up fooling the groupsize constraint below so
3013 check the individual gaps (which are represented as
3014 unsigned int) as well. */
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3017 "interleaved access with gap larger "
3018 "than representable\n");
3023 /* FORNOW: SLP of accesses with gaps is not supported. */
3024 slp_impossible
= true;
3025 if (DR_IS_WRITE (data_ref
))
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3029 "interleaved store with gaps\n");
3036 last_accessed_element
+= diff
;
3038 /* Store the gap from the previous member of the group. If there is no
3039 gap in the access, DR_GROUP_GAP is always 1. */
3040 DR_GROUP_GAP (next
) = diff
;
3042 prev_init
= DR_INIT (data_ref
);
3043 next
= DR_GROUP_NEXT_ELEMENT (next
);
3044 /* Count the number of data-refs in the chain. */
3049 groupsize
= count
+ gaps
;
3051 /* This could be UINT_MAX but as we are generating code in a very
3052 inefficient way we have to cap earlier. See PR78699 for example. */
3053 if (groupsize
> 4096)
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3057 "group is too large\n");
3061 /* Check that the size of the interleaving is equal to count for stores,
3062 i.e., that there are no gaps. */
3063 if (groupsize
!= count
3064 && !DR_IS_READ (dr
))
3067 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3070 /* If there is a gap after the last load in the group it is the
3071 difference between the groupsize and the last accessed
3073 When there is no gap, this difference should be 0. */
3074 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
3076 DR_GROUP_SIZE (stmt_info
) = groupsize
;
3077 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE
, vect_location
,
3080 "Detected interleaving ");
3081 if (DR_IS_READ (dr
))
3082 dump_printf (MSG_NOTE
, "load ");
3083 else if (STMT_VINFO_STRIDED_P (stmt_info
))
3084 dump_printf (MSG_NOTE
, "strided store ");
3086 dump_printf (MSG_NOTE
, "store ");
3087 dump_printf (MSG_NOTE
, "of size %u\n",
3088 (unsigned)groupsize
);
3089 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
3090 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
3093 if (DR_GROUP_GAP (next
) != 1)
3094 dump_printf_loc (MSG_NOTE
, vect_location
,
3095 "\t<gap of %d elements>\n",
3096 DR_GROUP_GAP (next
) - 1);
3097 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
3098 next
= DR_GROUP_NEXT_ELEMENT (next
);
3100 if (DR_GROUP_GAP (stmt_info
) != 0)
3101 dump_printf_loc (MSG_NOTE
, vect_location
,
3102 "\t<gap of %d elements>\n",
3103 DR_GROUP_GAP (stmt_info
));
3106 /* SLP: create an SLP data structure for every interleaving group of
3107 stores for further analysis in vect_analyse_slp. */
3108 if (DR_IS_WRITE (dr
) && !slp_impossible
)
3111 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
3113 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
3120 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3121 accesses of legal size, step, etc. Detect gaps, single element
3122 interleaving, and other special cases. Set grouped access info.
3123 Collect groups of strided stores for further use in SLP analysis. */
3126 vect_analyze_group_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3128 if (!vect_analyze_group_access_1 (vinfo
, dr_info
))
3130 /* Dissolve the group if present. */
3131 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
3134 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
3135 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3136 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
3144 /* Analyze the access pattern of the data-reference DR_INFO.
3145 In case of non-consecutive accesses call vect_analyze_group_access() to
3146 analyze groups of accesses. */
3149 vect_analyze_data_ref_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3151 data_reference
*dr
= dr_info
->dr
;
3152 tree step
= DR_STEP (dr
);
3153 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
3154 stmt_vec_info stmt_info
= dr_info
->stmt
;
3155 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
3156 class loop
*loop
= NULL
;
3158 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
3162 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3164 if (loop_vinfo
&& !step
)
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3168 "bad data-ref access in loop\n");
3172 /* Allow loads with zero step in inner-loop vectorization. */
3173 if (loop_vinfo
&& integer_zerop (step
))
3175 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3176 if (!nested_in_vect_loop_p (loop
, stmt_info
))
3177 return DR_IS_READ (dr
);
3178 /* Allow references with zero step for outer loops marked
3179 with pragma omp simd only - it guarantees absence of
3180 loop-carried dependencies between inner loop iterations. */
3181 if (loop
->safelen
< 2)
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE
, vect_location
,
3185 "zero step in inner loop of nest\n");
3190 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
3192 /* Interleaved accesses are not yet supported within outer-loop
3193 vectorization for references in the inner-loop. */
3194 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3196 /* For the rest of the analysis we use the outer-loop step. */
3197 step
= STMT_VINFO_DR_STEP (stmt_info
);
3198 if (integer_zerop (step
))
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_NOTE
, vect_location
,
3202 "zero step in outer loop.\n");
3203 return DR_IS_READ (dr
);
3208 if (TREE_CODE (step
) == INTEGER_CST
)
3210 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
3211 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
3213 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
3215 /* Mark that it is not interleaving. */
3216 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3221 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE
, vect_location
,
3225 "grouped access in outer loop.\n");
3230 /* Assume this is a DR handled by non-constant strided load case. */
3231 if (TREE_CODE (step
) != INTEGER_CST
)
3232 return (STMT_VINFO_STRIDED_P (stmt_info
)
3233 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3234 || vect_analyze_group_access (vinfo
, dr_info
)));
3236 /* Not consecutive access - check if it's a part of interleaving group. */
3237 return vect_analyze_group_access (vinfo
, dr_info
);
3240 /* Compare two data-references DRA and DRB to group them into chunks
3241 suitable for grouping. */
3244 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
3246 dr_vec_info
*dra_info
= *(dr_vec_info
**)const_cast<void *>(dra_
);
3247 dr_vec_info
*drb_info
= *(dr_vec_info
**)const_cast<void *>(drb_
);
3248 data_reference_p dra
= dra_info
->dr
;
3249 data_reference_p drb
= drb_info
->dr
;
3252 /* Stabilize sort. */
3256 /* Different group IDs lead never belong to the same group. */
3257 if (dra_info
->group
!= drb_info
->group
)
3258 return dra_info
->group
< drb_info
->group
? -1 : 1;
3260 /* Ordering of DRs according to base. */
3261 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
3262 DR_BASE_ADDRESS (drb
));
3266 /* And according to DR_OFFSET. */
3267 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
3271 /* Put reads before writes. */
3272 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
3273 return DR_IS_READ (dra
) ? -1 : 1;
3275 /* Then sort after access size. */
3276 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
3277 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
3281 /* And after step. */
3282 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
3286 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3287 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
3289 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
3293 /* If OP is the result of a conversion, return the unconverted value,
3294 otherwise return null. */
3297 strip_conversion (tree op
)
3299 if (TREE_CODE (op
) != SSA_NAME
)
3301 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
3302 if (!is_gimple_assign (stmt
)
3303 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
3305 return gimple_assign_rhs1 (stmt
);
3308 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3309 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3310 be grouped in SLP mode. */
3313 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
,
3316 if (gimple_assign_single_p (stmt1_info
->stmt
))
3317 return gimple_assign_single_p (stmt2_info
->stmt
);
3319 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
3320 if (call1
&& gimple_call_internal_p (call1
))
3322 /* Check for two masked loads or two masked stores. */
3323 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
3324 if (!call2
|| !gimple_call_internal_p (call2
))
3326 internal_fn ifn
= gimple_call_internal_fn (call1
);
3327 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
3329 if (ifn
!= gimple_call_internal_fn (call2
))
3332 /* Check that the masks are the same. Cope with casts of masks,
3333 like those created by build_mask_conversion. */
3334 tree mask1
= gimple_call_arg (call1
, 2);
3335 tree mask2
= gimple_call_arg (call2
, 2);
3336 if (!operand_equal_p (mask1
, mask2
, 0) && !allow_slp_p
)
3338 mask1
= strip_conversion (mask1
);
3341 mask2
= strip_conversion (mask2
);
3344 if (!operand_equal_p (mask1
, mask2
, 0))
3353 /* Function vect_analyze_data_ref_accesses.
3355 Analyze the access pattern of all the data references in the loop.
3357 FORNOW: the only access pattern that is considered vectorizable is a
3358 simple step 1 (consecutive) access.
3360 FORNOW: handle only arrays and pointer accesses. */
3363 vect_analyze_data_ref_accesses (vec_info
*vinfo
,
3364 vec
<int> *dataref_groups
)
3367 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
3369 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3371 if (datarefs
.is_empty ())
3372 return opt_result::success ();
3374 /* Sort the array of datarefs to make building the interleaving chains
3375 linear. Don't modify the original vector's order, it is needed for
3376 determining what dependencies are reversed. */
3377 vec
<dr_vec_info
*> datarefs_copy
;
3378 datarefs_copy
.create (datarefs
.length ());
3379 for (unsigned i
= 0; i
< datarefs
.length (); i
++)
3381 dr_vec_info
*dr_info
= vinfo
->lookup_dr (datarefs
[i
]);
3382 /* If the caller computed DR grouping use that, otherwise group by
3385 dr_info
->group
= (*dataref_groups
)[i
];
3387 dr_info
->group
= gimple_bb (DR_STMT (datarefs
[i
]))->index
;
3388 datarefs_copy
.quick_push (dr_info
);
3390 datarefs_copy
.qsort (dr_group_sort_cmp
);
3391 hash_set
<stmt_vec_info
> to_fixup
;
3393 /* Build the interleaving chains. */
3394 for (i
= 0; i
< datarefs_copy
.length () - 1;)
3396 dr_vec_info
*dr_info_a
= datarefs_copy
[i
];
3397 data_reference_p dra
= dr_info_a
->dr
;
3398 int dra_group_id
= dr_info_a
->group
;
3399 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
3400 stmt_vec_info lastinfo
= NULL
;
3401 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
3402 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
3407 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
3409 dr_vec_info
*dr_info_b
= datarefs_copy
[i
];
3410 data_reference_p drb
= dr_info_b
->dr
;
3411 int drb_group_id
= dr_info_b
->group
;
3412 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
3413 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
3414 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
3417 /* ??? Imperfect sorting (non-compatible types, non-modulo
3418 accesses, same accesses) can lead to a group to be artificially
3419 split here as we don't just skip over those. If it really
3420 matters we can push those to a worklist and re-iterate
3421 over them. The we can just skip ahead to the next DR here. */
3423 /* DRs in a different DR group should not be put into the same
3424 interleaving group. */
3425 if (dra_group_id
!= drb_group_id
)
3428 /* Check that the data-refs have same first location (except init)
3429 and they are both either store or load (not load and store,
3430 not masked loads or stores). */
3431 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
3432 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
3433 DR_BASE_ADDRESS (drb
)) != 0
3434 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
3435 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3438 /* Check that the data-refs have the same constant size. */
3439 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3440 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3441 if (!tree_fits_uhwi_p (sza
)
3442 || !tree_fits_uhwi_p (szb
)
3443 || !tree_int_cst_equal (sza
, szb
))
3446 /* Check that the data-refs have the same step. */
3447 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3450 /* Check the types are compatible.
3451 ??? We don't distinguish this during sorting. */
3452 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3453 TREE_TYPE (DR_REF (drb
))))
3456 /* Check that the DR_INITs are compile-time constants. */
3457 if (!tree_fits_shwi_p (DR_INIT (dra
))
3458 || !tree_fits_shwi_p (DR_INIT (drb
)))
3461 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3462 just hold extra information. */
3463 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3464 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3465 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3468 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3469 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3470 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3471 HOST_WIDE_INT init_prev
3472 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]->dr
));
3473 gcc_assert (init_a
<= init_b
3474 && init_a
<= init_prev
3475 && init_prev
<= init_b
);
3477 /* Do not place the same access in the interleaving chain twice. */
3478 if (init_b
== init_prev
)
3480 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]->dr
))
3481 < gimple_uid (DR_STMT (drb
)));
3482 /* Simply link in duplicates and fix up the chain below. */
3486 /* If init_b == init_a + the size of the type * k, we have an
3487 interleaving, and DRA is accessed before DRB. */
3488 unsigned HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3489 if (type_size_a
== 0
3490 || (((unsigned HOST_WIDE_INT
)init_b
- init_a
)
3491 % type_size_a
!= 0))
3494 /* If we have a store, the accesses are adjacent. This splits
3495 groups into chunks we support (we don't support vectorization
3496 of stores with gaps). */
3497 if (!DR_IS_READ (dra
)
3498 && (((unsigned HOST_WIDE_INT
)init_b
- init_prev
)
3502 /* If the step (if not zero or non-constant) is smaller than the
3503 difference between data-refs' inits this splits groups into
3505 if (tree_fits_shwi_p (DR_STEP (dra
)))
3507 unsigned HOST_WIDE_INT step
3508 = absu_hwi (tree_to_shwi (DR_STEP (dra
)));
3510 && step
<= ((unsigned HOST_WIDE_INT
)init_b
- init_a
))
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE
, vect_location
,
3518 ? "Detected interleaving load %T and %T\n"
3519 : "Detected interleaving store %T and %T\n",
3520 DR_REF (dra
), DR_REF (drb
));
3522 /* Link the found element into the group list. */
3523 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3525 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3526 lastinfo
= stmtinfo_a
;
3528 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3529 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3530 lastinfo
= stmtinfo_b
;
3532 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3533 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3535 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3536 dump_printf_loc (MSG_NOTE
, vect_location
,
3537 "Load suitable for SLP vectorization only.\n");
3539 if (init_b
== init_prev
3540 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3541 && dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE
, vect_location
,
3543 "Queuing group with duplicate access for fixup\n");
3547 /* Fixup groups with duplicate entries by splitting it. */
3550 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3551 if (!(it
!= to_fixup
.end ()))
3553 stmt_vec_info grp
= *it
;
3554 to_fixup
.remove (grp
);
3556 /* Find the earliest duplicate group member. */
3557 unsigned first_duplicate
= -1u;
3558 stmt_vec_info next
, g
= grp
;
3559 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3561 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3562 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3563 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3564 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3567 if (first_duplicate
== -1U)
3570 /* Then move all stmts after the first duplicate to a new group.
3571 Note this is a heuristic but one with the property that *it
3572 is fixed up completely. */
3574 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3575 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3577 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3579 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3583 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3585 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3588 g
= DR_GROUP_NEXT_ELEMENT (g
);
3590 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3592 /* Fixup the new group which still may contain duplicates. */
3593 to_fixup
.add (newgroup
);
3596 dr_vec_info
*dr_info
;
3597 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr_info
)
3599 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3600 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3602 if (dump_enabled_p ())
3603 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3604 "not vectorized: complicated access pattern.\n");
3606 if (is_a
<bb_vec_info
> (vinfo
))
3608 /* Mark the statement as not vectorizable. */
3609 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3614 datarefs_copy
.release ();
3615 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3617 " complicated access pattern.\n");
3622 datarefs_copy
.release ();
3623 return opt_result::success ();
3626 /* Function vect_vfa_segment_size.
3629 DR_INFO: The data reference.
3630 LENGTH_FACTOR: segment length to consider.
3632 Return a value suitable for the dr_with_seg_len::seg_len field.
3633 This is the "distance travelled" by the pointer from the first
3634 iteration in the segment to the last. Note that it does not include
3635 the size of the access; in effect it only describes the first byte. */
3638 vect_vfa_segment_size (dr_vec_info
*dr_info
, tree length_factor
)
3640 length_factor
= size_binop (MINUS_EXPR
,
3641 fold_convert (sizetype
, length_factor
),
3643 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr_info
->dr
)),
3647 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3648 gives the worst-case number of bytes covered by the segment. */
3650 static unsigned HOST_WIDE_INT
3651 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3653 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3654 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3655 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3656 unsigned HOST_WIDE_INT access_size
= ref_size
;
3657 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3659 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3660 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3662 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3664 if (STMT_VINFO_VEC_STMTS (stmt_vinfo
).exists ()
3665 && ((misalignment
= dr_misalignment (dr_info
, vectype
)), true)
3666 && (vect_supportable_dr_alignment (vinfo
, dr_info
, vectype
, misalignment
)
3667 == dr_explicit_realign_optimized
))
3669 /* We might access a full vector's worth. */
3670 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3675 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3679 vect_vfa_align (dr_vec_info
*dr_info
)
3681 return dr_alignment (dr_info
->dr
);
3684 /* Function vect_no_alias_p.
3686 Given data references A and B with equal base and offset, see whether
3687 the alias relation can be decided at compilation time. Return 1 if
3688 it can and the references alias, 0 if it can and the references do
3689 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3690 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3691 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3694 vect_compile_time_alias (dr_vec_info
*a
, dr_vec_info
*b
,
3695 tree segment_length_a
, tree segment_length_b
,
3696 unsigned HOST_WIDE_INT access_size_a
,
3697 unsigned HOST_WIDE_INT access_size_b
)
3699 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
->dr
));
3700 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
->dr
));
3701 poly_uint64 const_length_a
;
3702 poly_uint64 const_length_b
;
3704 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3705 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3707 if (tree_int_cst_compare (DR_STEP (a
->dr
), size_zero_node
) < 0)
3709 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3710 offset_a
-= const_length_a
;
3713 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3714 if (tree_int_cst_compare (DR_STEP (b
->dr
), size_zero_node
) < 0)
3716 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3717 offset_b
-= const_length_b
;
3720 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3722 const_length_a
+= access_size_a
;
3723 const_length_b
+= access_size_b
;
3725 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3726 offset_b
, const_length_b
))
3729 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3730 offset_b
, const_length_b
))
3736 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3740 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3741 unsigned int loop_depth
, poly_uint64 vf
)
3743 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3744 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3747 /* If the dependence is exact, we should have limited the VF instead. */
3748 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3751 lambda_vector dist_v
;
3752 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3754 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3756 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3757 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3761 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_NOTE
, vect_location
,
3763 "dependence distance between %T and %T is >= VF\n",
3764 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3769 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3772 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3774 dump_printf (dump_kind
, "%s (%T) >= ",
3775 lower_bound
.unsigned_p
? "unsigned" : "abs",
3777 dump_dec (dump_kind
, lower_bound
.min_value
);
3780 /* Record that the vectorized loop requires the vec_lower_bound described
3781 by EXPR, UNSIGNED_P and MIN_VALUE. */
3784 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3785 poly_uint64 min_value
)
3787 vec
<vec_lower_bound
> &lower_bounds
3788 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3789 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3790 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3792 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3793 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3794 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3795 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3797 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3798 lower_bounds
[i
].min_value
= min_value
;
3799 if (dump_enabled_p ())
3801 dump_printf_loc (MSG_NOTE
, vect_location
,
3802 "updating run-time check to ");
3803 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3804 dump_printf (MSG_NOTE
, "\n");
3810 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3811 if (dump_enabled_p ())
3813 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3814 dump_lower_bound (MSG_NOTE
, lower_bound
);
3815 dump_printf (MSG_NOTE
, "\n");
3817 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3820 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3821 will span fewer than GAP bytes. */
3824 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3827 stmt_vec_info stmt_info
= dr_info
->stmt
;
3829 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3830 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3831 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3832 return (estimated_poly_value (gap
)
3833 <= count
* vect_get_scalar_dr_size (dr_info
));
3836 /* Return true if we know that there is no alias between DR_INFO_A and
3837 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3838 When returning true, set *LOWER_BOUND_OUT to this N. */
3841 vectorizable_with_step_bound_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
,
3842 poly_uint64
*lower_bound_out
)
3844 /* Check that there is a constant gap of known sign between DR_A
3846 data_reference
*dr_a
= dr_info_a
->dr
;
3847 data_reference
*dr_b
= dr_info_b
->dr
;
3848 poly_int64 init_a
, init_b
;
3849 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3850 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3851 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3852 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3853 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3854 || !ordered_p (init_a
, init_b
))
3857 /* Sort DR_A and DR_B by the address they access. */
3858 if (maybe_lt (init_b
, init_a
))
3860 std::swap (init_a
, init_b
);
3861 std::swap (dr_info_a
, dr_info_b
);
3862 std::swap (dr_a
, dr_b
);
3865 /* If the two accesses could be dependent within a scalar iteration,
3866 make sure that we'd retain their order. */
3867 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_info_a
), init_b
)
3868 && !vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
3871 /* There is no alias if abs (DR_STEP) is greater than or equal to
3872 the bytes spanned by the combination of the two accesses. */
3873 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_info_b
) - init_a
;
3877 /* Function vect_prune_runtime_alias_test_list.
3879 Prune a list of ddrs to be tested at run-time by versioning for alias.
3880 Merge several alias checks into one if possible.
3881 Return FALSE if resulting list of ddrs is longer then allowed by
3882 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3885 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3887 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3888 hash_set
<tree_pair_hash
> compared_objects
;
3890 const vec
<ddr_p
> &may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3891 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3892 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3893 const vec
<vec_object_pair
> &check_unequal_addrs
3894 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3895 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3896 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3902 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3904 /* Step values are irrelevant for aliasing if the number of vector
3905 iterations is equal to the number of scalar iterations (which can
3906 happen for fully-SLP loops). */
3907 bool vf_one_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3911 /* Convert the checks for nonzero steps into bound tests. */
3913 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3914 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3917 if (may_alias_ddrs
.is_empty ())
3918 return opt_result::success ();
3920 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3922 unsigned int loop_depth
3923 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3924 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3926 /* First, we collect all data ref pairs for aliasing checks. */
3927 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3929 poly_uint64 lower_bound
;
3930 tree segment_length_a
, segment_length_b
;
3931 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3932 unsigned int align_a
, align_b
;
3934 /* Ignore the alias if the VF we chose ended up being no greater
3935 than the dependence distance. */
3936 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3939 if (DDR_OBJECT_A (ddr
))
3941 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3942 if (!compared_objects
.add (new_pair
))
3944 if (dump_enabled_p ())
3945 dump_printf_loc (MSG_NOTE
, vect_location
,
3946 "checking that %T and %T"
3947 " have different addresses\n",
3948 new_pair
.first
, new_pair
.second
);
3949 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3954 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3955 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3957 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3958 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3960 bool preserves_scalar_order_p
3961 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3964 && (preserves_scalar_order_p
3965 || operand_equal_p (DR_STEP (dr_info_a
->dr
),
3966 DR_STEP (dr_info_b
->dr
))));
3968 /* Skip the pair if inter-iteration dependencies are irrelevant
3969 and intra-iteration dependencies are guaranteed to be honored. */
3971 && (preserves_scalar_order_p
3972 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_NOTE
, vect_location
,
3977 "no need for alias check between "
3978 "%T and %T when VF is 1\n",
3979 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3983 /* See whether we can handle the alias using a bounds check on
3984 the step, and whether that's likely to be the best approach.
3985 (It might not be, for example, if the minimum step is much larger
3986 than the number of bytes handled by one vector iteration.) */
3988 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3989 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3991 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3992 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3994 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3995 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3998 "%T and %T when the step %T is outside ",
3999 DR_REF (dr_info_a
->dr
),
4000 DR_REF (dr_info_b
->dr
),
4001 DR_STEP (dr_info_a
->dr
));
4003 dump_printf (MSG_NOTE
, "[0");
4006 dump_printf (MSG_NOTE
, "(");
4007 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
4009 dump_printf (MSG_NOTE
, ", ");
4010 dump_dec (MSG_NOTE
, lower_bound
);
4011 dump_printf (MSG_NOTE
, ")\n");
4013 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
4014 unsigned_p
, lower_bound
);
4018 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
4019 if (dr_group_first_a
)
4021 stmt_info_a
= dr_group_first_a
;
4022 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
4025 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
4026 if (dr_group_first_b
)
4028 stmt_info_b
= dr_group_first_b
;
4029 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
4034 segment_length_a
= size_zero_node
;
4035 segment_length_b
= size_zero_node
;
4039 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
4040 DR_STEP (dr_info_b
->dr
), 0))
4041 length_factor
= scalar_loop_iters
;
4043 length_factor
= size_int (vect_factor
);
4044 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
4045 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
4047 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
4048 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
4049 align_a
= vect_vfa_align (dr_info_a
);
4050 align_b
= vect_vfa_align (dr_info_b
);
4052 /* See whether the alias is known at compilation time. */
4053 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
4054 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
4055 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
4056 DR_OFFSET (dr_info_b
->dr
), 0)
4057 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
4058 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
4059 && poly_int_tree_p (segment_length_a
)
4060 && poly_int_tree_p (segment_length_b
))
4062 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
4067 if (res
>= 0 && dump_enabled_p ())
4069 dump_printf_loc (MSG_NOTE
, vect_location
,
4070 "can tell at compile time that %T and %T",
4071 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
4073 dump_printf (MSG_NOTE
, " do not alias\n");
4075 dump_printf (MSG_NOTE
, " alias\n");
4082 return opt_result::failure_at (stmt_info_b
->stmt
,
4084 " compilation time alias: %G%G",
4089 dr_with_seg_len
dr_a (dr_info_a
->dr
, segment_length_a
,
4090 access_size_a
, align_a
);
4091 dr_with_seg_len
dr_b (dr_info_b
->dr
, segment_length_b
,
4092 access_size_b
, align_b
);
4093 /* Canonicalize the order to be the one that's needed for accurate
4094 RAW, WAR and WAW flags, in cases where the data references are
4095 well-ordered. The order doesn't really matter otherwise,
4096 but we might as well be consistent. */
4097 if (get_later_stmt (stmt_info_a
, stmt_info_b
) == stmt_info_a
)
4098 std::swap (dr_a
, dr_b
);
4100 dr_with_seg_len_pair_t dr_with_seg_len_pair
4101 (dr_a
, dr_b
, (preserves_scalar_order_p
4102 ? dr_with_seg_len_pair_t::WELL_ORDERED
4103 : dr_with_seg_len_pair_t::REORDERED
));
4105 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
4108 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
4110 unsigned int count
= (comp_alias_ddrs
.length ()
4111 + check_unequal_addrs
.length ());
4114 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
))
4115 == VECT_COST_MODEL_VERY_CHEAP
))
4116 return opt_result::failure_at
4117 (vect_location
, "would need a runtime alias check\n");
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_NOTE
, vect_location
,
4121 "improved number of alias checks from %d to %d\n",
4122 may_alias_ddrs
.length (), count
);
4123 unsigned limit
= param_vect_max_version_for_alias_checks
;
4124 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)) == VECT_COST_MODEL_CHEAP
)
4125 limit
= param_vect_max_version_for_alias_checks
* 6 / 10;
4127 return opt_result::failure_at
4129 "number of versioning for alias run-time tests exceeds %d "
4130 "(--param vect-max-version-for-alias-checks)\n", limit
);
4132 return opt_result::success ();
4135 /* Check whether we can use an internal function for a gather load
4136 or scatter store. READ_P is true for loads and false for stores.
4137 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4138 the type of the memory elements being loaded or stored. OFFSET_TYPE
4139 is the type of the offset that is being applied to the invariant
4140 base address. SCALE is the amount by which the offset should
4141 be multiplied *after* it has been converted to address width.
4143 Return true if the function is supported, storing the function id in
4144 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
4147 vect_gather_scatter_fn_p (vec_info
*vinfo
, bool read_p
, bool masked_p
,
4148 tree vectype
, tree memory_type
, tree offset_type
,
4149 int scale
, internal_fn
*ifn_out
,
4150 tree
*offset_vectype_out
)
4152 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
4153 unsigned int element_bits
= vector_element_bits (vectype
);
4154 if (element_bits
!= memory_bits
)
4155 /* For now the vector elements must be the same width as the
4159 /* Work out which function we need. */
4160 internal_fn ifn
, alt_ifn
, alt_ifn2
;
4163 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
4164 alt_ifn
= IFN_MASK_GATHER_LOAD
;
4165 /* When target supports MASK_LEN_GATHER_LOAD, we always
4166 use MASK_LEN_GATHER_LOAD regardless whether len and
4167 mask are valid or not. */
4168 alt_ifn2
= IFN_MASK_LEN_GATHER_LOAD
;
4172 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
4173 alt_ifn
= IFN_MASK_SCATTER_STORE
;
4174 /* When target supports MASK_LEN_SCATTER_STORE, we always
4175 use MASK_LEN_SCATTER_STORE regardless whether len and
4176 mask are valid or not. */
4177 alt_ifn2
= IFN_MASK_LEN_SCATTER_STORE
;
4182 tree offset_vectype
= get_vectype_for_scalar_type (vinfo
, offset_type
);
4183 if (!offset_vectype
)
4186 /* Test whether the target supports this combination. */
4187 if (internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
4188 offset_vectype
, scale
))
4191 *offset_vectype_out
= offset_vectype
;
4195 && internal_gather_scatter_fn_supported_p (alt_ifn
, vectype
,
4201 *offset_vectype_out
= offset_vectype
;
4204 else if (internal_gather_scatter_fn_supported_p (alt_ifn2
, vectype
,
4206 offset_vectype
, scale
))
4208 *ifn_out
= alt_ifn2
;
4209 *offset_vectype_out
= offset_vectype
;
4213 if (TYPE_PRECISION (offset_type
) >= POINTER_SIZE
4214 && TYPE_PRECISION (offset_type
) >= element_bits
)
4217 offset_type
= build_nonstandard_integer_type
4218 (TYPE_PRECISION (offset_type
) * 2, TYPE_UNSIGNED (offset_type
));
4222 /* STMT_INFO is a call to an internal gather load or scatter store function.
4223 Describe the operation in INFO. */
4226 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
4227 gather_scatter_info
*info
)
4229 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
4230 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4231 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4233 info
->ifn
= gimple_call_internal_fn (call
);
4234 info
->decl
= NULL_TREE
;
4235 info
->base
= gimple_call_arg (call
, 0);
4236 info
->offset
= gimple_call_arg (call
, 1);
4237 info
->offset_dt
= vect_unknown_def_type
;
4238 info
->offset_vectype
= NULL_TREE
;
4239 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
4240 info
->element_type
= TREE_TYPE (vectype
);
4241 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
4244 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4245 gather load or scatter store. Describe the operation in *INFO if so. */
4248 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
4249 gather_scatter_info
*info
)
4251 HOST_WIDE_INT scale
= 1;
4252 poly_int64 pbitpos
, pbitsize
;
4253 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4254 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4255 tree offtype
= NULL_TREE
;
4256 tree decl
= NULL_TREE
, base
, off
;
4257 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4258 tree memory_type
= TREE_TYPE (DR_REF (dr
));
4260 int punsignedp
, reversep
, pvolatilep
= 0;
4262 tree offset_vectype
;
4263 bool masked_p
= false;
4265 /* See whether this is already a call to a gather/scatter internal function.
4266 If not, see whether it's a masked load or store. */
4267 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
4268 if (call
&& gimple_call_internal_p (call
))
4270 ifn
= gimple_call_internal_fn (call
);
4271 if (internal_gather_scatter_fn_p (ifn
))
4273 vect_describe_gather_scatter_call (stmt_info
, info
);
4276 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
4279 /* True if we should aim to use internal functions rather than
4280 built-in functions. */
4281 bool use_ifn_p
= (DR_IS_READ (dr
)
4282 ? supports_vec_gather_load_p (TYPE_MODE (vectype
))
4283 : supports_vec_scatter_store_p (TYPE_MODE (vectype
)));
4286 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4287 see if we can use the def stmt of the address. */
4289 && TREE_CODE (base
) == MEM_REF
4290 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
4291 && integer_zerop (TREE_OPERAND (base
, 1))
4292 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
4294 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
4295 if (is_gimple_assign (def_stmt
)
4296 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
4297 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
4300 /* The gather and scatter builtins need address of the form
4301 loop_invariant + vector * {1, 2, 4, 8}
4303 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4304 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4305 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4306 multiplications and additions in it. To get a vector, we need
4307 a single SSA_NAME that will be defined in the loop and will
4308 contain everything that is not loop invariant and that can be
4309 vectorized. The following code attempts to find such a preexistng
4310 SSA_NAME OFF and put the loop invariants into a tree BASE
4311 that can be gimplified before the loop. */
4312 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
4313 &punsignedp
, &reversep
, &pvolatilep
);
4317 /* PR 107346. Packed structs can have fields at offsets that are not
4318 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4319 if (!multiple_p (pbitpos
, BITS_PER_UNIT
))
4322 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
4324 if (TREE_CODE (base
) == MEM_REF
)
4326 if (!integer_zerop (TREE_OPERAND (base
, 1)))
4328 if (off
== NULL_TREE
)
4329 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
4331 off
= size_binop (PLUS_EXPR
, off
,
4332 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
4334 base
= TREE_OPERAND (base
, 0);
4337 base
= build_fold_addr_expr (base
);
4339 if (off
== NULL_TREE
)
4340 off
= size_zero_node
;
4342 /* If base is not loop invariant, either off is 0, then we start with just
4343 the constant offset in the loop invariant BASE and continue with base
4344 as OFF, otherwise give up.
4345 We could handle that case by gimplifying the addition of base + off
4346 into some SSA_NAME and use that as off, but for now punt. */
4347 if (!expr_invariant_in_loop_p (loop
, base
))
4349 if (!integer_zerop (off
))
4352 base
= size_int (pbytepos
);
4354 /* Otherwise put base + constant offset into the loop invariant BASE
4355 and continue with OFF. */
4358 base
= fold_convert (sizetype
, base
);
4359 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
4362 /* OFF at this point may be either a SSA_NAME or some tree expression
4363 from get_inner_reference. Try to peel off loop invariants from it
4364 into BASE as long as possible. */
4366 while (offtype
== NULL_TREE
)
4368 enum tree_code code
;
4369 tree op0
, op1
, add
= NULL_TREE
;
4371 if (TREE_CODE (off
) == SSA_NAME
)
4373 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
4375 if (expr_invariant_in_loop_p (loop
, off
))
4378 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
4381 op0
= gimple_assign_rhs1 (def_stmt
);
4382 code
= gimple_assign_rhs_code (def_stmt
);
4383 op1
= gimple_assign_rhs2 (def_stmt
);
4387 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
4389 code
= TREE_CODE (off
);
4390 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
4394 case POINTER_PLUS_EXPR
:
4396 if (expr_invariant_in_loop_p (loop
, op0
))
4401 add
= fold_convert (sizetype
, add
);
4403 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
4404 base
= size_binop (PLUS_EXPR
, base
, add
);
4407 if (expr_invariant_in_loop_p (loop
, op1
))
4415 if (expr_invariant_in_loop_p (loop
, op1
))
4417 add
= fold_convert (sizetype
, op1
);
4418 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
4424 if (scale
== 1 && tree_fits_shwi_p (op1
))
4426 int new_scale
= tree_to_shwi (op1
);
4427 /* Only treat this as a scaling operation if the target
4428 supports it for at least some offset type. */
4430 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4431 masked_p
, vectype
, memory_type
,
4432 signed_char_type_node
,
4435 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4436 masked_p
, vectype
, memory_type
,
4437 unsigned_char_type_node
,
4450 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
4451 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
4454 /* Don't include the conversion if the target is happy with
4455 the current offset type. */
4457 && TREE_CODE (off
) == SSA_NAME
4458 && !POINTER_TYPE_P (TREE_TYPE (off
))
4459 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4460 masked_p
, vectype
, memory_type
,
4461 TREE_TYPE (off
), scale
, &ifn
,
4465 if (TYPE_PRECISION (TREE_TYPE (op0
))
4466 == TYPE_PRECISION (TREE_TYPE (off
)))
4472 /* Include the conversion if it is widening and we're using
4473 the IFN path or the target can handle the converted from
4474 offset or the current size is not already the same as the
4475 data vector element size. */
4476 if ((TYPE_PRECISION (TREE_TYPE (op0
))
4477 < TYPE_PRECISION (TREE_TYPE (off
)))
4480 ? (targetm
.vectorize
.builtin_gather
4481 && targetm
.vectorize
.builtin_gather (vectype
,
4484 : (targetm
.vectorize
.builtin_scatter
4485 && targetm
.vectorize
.builtin_scatter (vectype
,
4488 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off
)),
4489 TYPE_SIZE (TREE_TYPE (vectype
)), 0)))
4492 offtype
= TREE_TYPE (off
);
4503 /* If at the end OFF still isn't a SSA_NAME or isn't
4504 defined in the loop, punt. */
4505 if (TREE_CODE (off
) != SSA_NAME
4506 || expr_invariant_in_loop_p (loop
, off
))
4509 if (offtype
== NULL_TREE
)
4510 offtype
= TREE_TYPE (off
);
4514 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4515 vectype
, memory_type
, offtype
, scale
,
4516 &ifn
, &offset_vectype
))
4522 if (DR_IS_READ (dr
))
4524 if (targetm
.vectorize
.builtin_gather
)
4525 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4529 if (targetm
.vectorize
.builtin_scatter
)
4530 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4533 /* The offset vector type will be read from DECL when needed. */
4534 offset_vectype
= NULL_TREE
;
4541 info
->offset_dt
= vect_unknown_def_type
;
4542 info
->offset_vectype
= offset_vectype
;
4543 info
->scale
= scale
;
4544 info
->element_type
= TREE_TYPE (vectype
);
4545 info
->memory_type
= memory_type
;
4549 /* Find the data references in STMT, analyze them with respect to LOOP and
4550 append them to DATAREFS. Return false if datarefs in this stmt cannot
4554 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4555 vec
<data_reference_p
> *datarefs
,
4556 vec
<int> *dataref_groups
, int group_id
)
4558 /* We can ignore clobbers for dataref analysis - they are removed during
4559 loop vectorization and BB vectorization checks dependences with a
4561 if (gimple_clobber_p (stmt
))
4562 return opt_result::success ();
4564 if (gimple_has_volatile_ops (stmt
))
4565 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4568 if (stmt_can_throw_internal (cfun
, stmt
))
4569 return opt_result::failure_at (stmt
,
4571 " statement can throw an exception: %G",
4574 auto_vec
<data_reference_p
, 2> refs
;
4575 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4579 if (refs
.is_empty ())
4580 return opt_result::success ();
4582 if (refs
.length () > 1)
4584 while (!refs
.is_empty ())
4585 free_data_ref (refs
.pop ());
4586 return opt_result::failure_at (stmt
,
4587 "not vectorized: more than one "
4588 "data ref in stmt: %G", stmt
);
4591 data_reference_p dr
= refs
.pop ();
4592 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4593 if (!gimple_call_internal_p (call
)
4594 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4595 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4598 return opt_result::failure_at (stmt
,
4599 "not vectorized: dr in a call %G", stmt
);
4602 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4603 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4606 return opt_result::failure_at (stmt
,
4608 " statement is an unsupported"
4609 " bitfield access %G", stmt
);
4612 if (DR_BASE_ADDRESS (dr
)
4613 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4616 return opt_result::failure_at (stmt
,
4618 " base addr of dr is a constant\n");
4621 /* Check whether this may be a SIMD lane access and adjust the
4622 DR to make it easier for us to handle it. */
4625 && (!DR_BASE_ADDRESS (dr
)
4630 struct data_reference
*newdr
4631 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4632 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4633 if (DR_BASE_ADDRESS (newdr
)
4634 && DR_OFFSET (newdr
)
4637 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4638 && integer_zerop (DR_STEP (newdr
)))
4640 tree base_address
= DR_BASE_ADDRESS (newdr
);
4641 tree off
= DR_OFFSET (newdr
);
4642 tree step
= ssize_int (1);
4643 if (integer_zerop (off
)
4644 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4646 off
= TREE_OPERAND (base_address
, 1);
4647 base_address
= TREE_OPERAND (base_address
, 0);
4650 if (TREE_CODE (off
) == MULT_EXPR
4651 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4653 step
= TREE_OPERAND (off
, 1);
4654 off
= TREE_OPERAND (off
, 0);
4657 if (CONVERT_EXPR_P (off
)
4658 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4659 < TYPE_PRECISION (TREE_TYPE (off
))))
4660 off
= TREE_OPERAND (off
, 0);
4661 if (TREE_CODE (off
) == SSA_NAME
)
4663 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4664 /* Look through widening conversion. */
4665 if (is_gimple_assign (def
)
4666 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4668 tree rhs1
= gimple_assign_rhs1 (def
);
4669 if (TREE_CODE (rhs1
) == SSA_NAME
4670 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4671 && (TYPE_PRECISION (TREE_TYPE (off
))
4672 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4673 def
= SSA_NAME_DEF_STMT (rhs1
);
4675 if (is_gimple_call (def
)
4676 && gimple_call_internal_p (def
)
4677 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4679 tree arg
= gimple_call_arg (def
, 0);
4680 tree reft
= TREE_TYPE (DR_REF (newdr
));
4681 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4682 arg
= SSA_NAME_VAR (arg
);
4683 if (arg
== loop
->simduid
4685 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4687 DR_BASE_ADDRESS (newdr
) = base_address
;
4688 DR_OFFSET (newdr
) = ssize_int (0);
4689 DR_STEP (newdr
) = step
;
4690 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4691 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4692 /* Mark as simd-lane access. */
4693 tree arg2
= gimple_call_arg (def
, 1);
4694 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4696 datarefs
->safe_push (newdr
);
4698 dataref_groups
->safe_push (group_id
);
4699 return opt_result::success ();
4704 free_data_ref (newdr
);
4707 datarefs
->safe_push (dr
);
4709 dataref_groups
->safe_push (group_id
);
4710 return opt_result::success ();
4713 /* Function vect_analyze_data_refs.
4715 Find all the data references in the loop or basic block.
4717 The general structure of the analysis of data refs in the vectorizer is as
4719 1- vect_analyze_data_refs(loop/bb): call
4720 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4721 in the loop/bb and their dependences.
4722 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4723 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4724 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4729 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4731 class loop
*loop
= NULL
;
4733 struct data_reference
*dr
;
4736 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4738 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4739 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4741 /* Go through the data-refs, check that the analysis succeeded. Update
4742 pointer from stmt_vec_info struct to DR and vectype. */
4744 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4745 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4747 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4750 gcc_assert (DR_REF (dr
));
4751 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4752 gcc_assert (!stmt_info
->dr_aux
.dr
);
4753 stmt_info
->dr_aux
.dr
= dr
;
4754 stmt_info
->dr_aux
.stmt
= stmt_info
;
4756 /* Check that analysis of the data-ref succeeded. */
4757 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4762 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4765 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4767 /* If target supports vector gather loads or scatter stores,
4768 see if they can't be used. */
4769 if (is_a
<loop_vec_info
> (vinfo
)
4770 && !nested_in_vect_loop_p (loop
, stmt_info
))
4772 if (maybe_gather
|| maybe_scatter
)
4775 gatherscatter
= GATHER
;
4777 gatherscatter
= SCATTER
;
4781 if (gatherscatter
== SG_NONE
)
4783 if (dump_enabled_p ())
4784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4785 "not vectorized: data ref analysis "
4786 "failed %G", stmt_info
->stmt
);
4787 if (is_a
<bb_vec_info
> (vinfo
))
4789 /* In BB vectorization the ref can still participate
4790 in dependence analysis, we just can't vectorize it. */
4791 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4794 return opt_result::failure_at (stmt_info
->stmt
,
4796 " data ref analysis failed: %G",
4801 /* See if this was detected as SIMD lane access. */
4802 if (dr
->aux
== (void *)-1
4803 || dr
->aux
== (void *)-2
4804 || dr
->aux
== (void *)-3
4805 || dr
->aux
== (void *)-4)
4807 if (nested_in_vect_loop_p (loop
, stmt_info
))
4808 return opt_result::failure_at (stmt_info
->stmt
,
4810 " data ref analysis failed: %G",
4812 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4813 = -(uintptr_t) dr
->aux
;
4816 tree base
= get_base_address (DR_REF (dr
));
4817 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4819 if (dump_enabled_p ())
4820 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4821 "not vectorized: base object not addressable "
4822 "for stmt: %G", stmt_info
->stmt
);
4823 if (is_a
<bb_vec_info
> (vinfo
))
4825 /* In BB vectorization the ref can still participate
4826 in dependence analysis, we just can't vectorize it. */
4827 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4830 return opt_result::failure_at (stmt_info
->stmt
,
4831 "not vectorized: base object not"
4832 " addressable for stmt: %G",
4836 if (is_a
<loop_vec_info
> (vinfo
)
4838 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4840 if (nested_in_vect_loop_p (loop
, stmt_info
))
4841 return opt_result::failure_at (stmt_info
->stmt
,
4843 "not suitable for strided load %G",
4845 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4848 /* Update DR field in stmt_vec_info struct. */
4850 /* If the dataref is in an inner-loop of the loop that is considered for
4851 for vectorization, we also want to analyze the access relative to
4852 the outer-loop (DR contains information only relative to the
4853 inner-most enclosing loop). We do that by building a reference to the
4854 first location accessed by the inner-loop, and analyze it relative to
4856 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4858 /* Build a reference to the first location accessed by the
4859 inner loop: *(BASE + INIT + OFFSET). By construction,
4860 this address must be invariant in the inner loop, so we
4861 can consider it as being used in the outer loop. */
4862 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4863 tree offset
= unshare_expr (DR_OFFSET (dr
));
4864 tree init
= unshare_expr (DR_INIT (dr
));
4865 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4867 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4868 tree init_ref
= build_fold_indirect_ref (init_addr
);
4870 if (dump_enabled_p ())
4871 dump_printf_loc (MSG_NOTE
, vect_location
,
4872 "analyze in outer loop: %T\n", init_ref
);
4875 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4876 init_ref
, loop
, stmt_info
->stmt
);
4878 /* dr_analyze_innermost already explained the failure. */
4881 if (dump_enabled_p ())
4882 dump_printf_loc (MSG_NOTE
, vect_location
,
4883 "\touter base_address: %T\n"
4884 "\touter offset from base address: %T\n"
4885 "\touter constant offset from base address: %T\n"
4886 "\touter step: %T\n"
4887 "\touter base alignment: %d\n\n"
4888 "\touter base misalignment: %d\n"
4889 "\touter offset alignment: %d\n"
4890 "\touter step alignment: %d\n",
4891 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4892 STMT_VINFO_DR_OFFSET (stmt_info
),
4893 STMT_VINFO_DR_INIT (stmt_info
),
4894 STMT_VINFO_DR_STEP (stmt_info
),
4895 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4896 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4897 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4898 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4901 /* Set vectype for STMT. */
4902 scalar_type
= TREE_TYPE (DR_REF (dr
));
4903 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4906 if (dump_enabled_p ())
4908 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4909 "not vectorized: no vectype for stmt: %G",
4911 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4912 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4914 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4917 if (is_a
<bb_vec_info
> (vinfo
))
4919 /* No vector type is fine, the ref can still participate
4920 in dependence analysis, we just can't vectorize it. */
4921 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4926 return opt_result::failure_at (stmt_info
->stmt
,
4928 " no vectype for stmt: %G"
4929 " scalar_type: %T\n",
4930 stmt_info
->stmt
, scalar_type
);
4934 if (dump_enabled_p ())
4935 dump_printf_loc (MSG_NOTE
, vect_location
,
4936 "got vectype for stmt: %G%T\n",
4937 stmt_info
->stmt
, vectype
);
4940 /* Adjust the minimal vectorization factor according to the
4942 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4943 *min_vf
= upper_bound (*min_vf
, vf
);
4945 /* Leave the BB vectorizer to pick the vector type later, based on
4946 the final dataref group size and SLP node size. */
4947 if (is_a
<loop_vec_info
> (vinfo
))
4948 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
4950 if (gatherscatter
!= SG_NONE
)
4952 gather_scatter_info gs_info
;
4953 if (!vect_check_gather_scatter (stmt_info
,
4954 as_a
<loop_vec_info
> (vinfo
),
4956 || !get_vectype_for_scalar_type (vinfo
,
4957 TREE_TYPE (gs_info
.offset
)))
4961 return opt_result::failure_at
4963 (gatherscatter
== GATHER
)
4964 ? "not vectorized: not suitable for gather load %G"
4965 : "not vectorized: not suitable for scatter store %G",
4968 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4972 /* We used to stop processing and prune the list here. Verify we no
4974 gcc_assert (i
== datarefs
.length ());
4976 return opt_result::success ();
4980 /* Function vect_get_new_vect_var.
4982 Returns a name for a new variable. The current naming scheme appends the
4983 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4984 the name of vectorizer generated variables, and appends that to NAME if
4988 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4995 case vect_simple_var
:
4998 case vect_scalar_var
:
5004 case vect_pointer_var
:
5013 char* tmp
= concat (prefix
, "_", name
, NULL
);
5014 new_vect_var
= create_tmp_reg (type
, tmp
);
5018 new_vect_var
= create_tmp_reg (type
, prefix
);
5020 return new_vect_var
;
5023 /* Like vect_get_new_vect_var but return an SSA name. */
5026 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
5033 case vect_simple_var
:
5036 case vect_scalar_var
:
5039 case vect_pointer_var
:
5048 char* tmp
= concat (prefix
, "_", name
, NULL
);
5049 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
5053 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
5055 return new_vect_var
;
5058 /* Duplicate points-to info on NAME from DR_INFO. */
5061 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
5063 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
5064 /* DR_PTR_INFO is for a base SSA name, not including constant or
5065 variable offsets in the ref so its alignment info does not apply. */
5066 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
5069 /* Function vect_create_addr_base_for_vector_ref.
5071 Create an expression that computes the address of the first memory location
5072 that will be accessed for a data reference.
5075 STMT_INFO: The statement containing the data reference.
5076 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5077 OFFSET: Optional. If supplied, it is be added to the initial address.
5078 LOOP: Specify relative to which loop-nest should the address be computed.
5079 For example, when the dataref is in an inner-loop nested in an
5080 outer-loop that is now being vectorized, LOOP can be either the
5081 outer-loop, or the inner-loop. The first memory location accessed
5082 by the following dataref ('in' points to short):
5089 if LOOP=i_loop: &in (relative to i_loop)
5090 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5093 1. Return an SSA_NAME whose value is the address of the memory location of
5094 the first vector of the data reference.
5095 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5096 these statement(s) which define the returned SSA_NAME.
5098 FORNOW: We are only handling array accesses with step 1. */
5101 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5102 gimple_seq
*new_stmt_list
,
5105 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5106 struct data_reference
*dr
= dr_info
->dr
;
5107 const char *base_name
;
5110 gimple_seq seq
= NULL
;
5112 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5113 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
5115 tree data_ref_base
= unshare_expr (drb
->base_address
);
5116 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
5117 tree init
= unshare_expr (drb
->init
);
5120 base_name
= get_name (data_ref_base
);
5123 base_offset
= ssize_int (0);
5124 init
= ssize_int (0);
5125 base_name
= get_name (DR_REF (dr
));
5128 /* Create base_offset */
5129 base_offset
= size_binop (PLUS_EXPR
,
5130 fold_convert (sizetype
, base_offset
),
5131 fold_convert (sizetype
, init
));
5135 offset
= fold_convert (sizetype
, offset
);
5136 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
5137 base_offset
, offset
);
5140 /* base + base_offset */
5142 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
5144 addr_base
= build1 (ADDR_EXPR
,
5145 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
5146 /* Strip zero offset components since we don't need
5147 them and they can confuse late diagnostics if
5148 we CSE them wrongly. See PR106904 for example. */
5149 unshare_expr (strip_zero_offset_components
5152 vect_ptr_type
= build_pointer_type (TREE_TYPE (DR_REF (dr
)));
5153 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
5154 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
5155 gimple_seq_add_seq (new_stmt_list
, seq
);
5157 if (DR_PTR_INFO (dr
)
5158 && TREE_CODE (addr_base
) == SSA_NAME
5159 /* We should only duplicate pointer info to newly created SSA names. */
5160 && SSA_NAME_VAR (addr_base
) == dest
)
5162 gcc_assert (!SSA_NAME_PTR_INFO (addr_base
));
5163 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_NOTE
, vect_location
, "created %T\n", addr_base
);
5173 /* Function vect_create_data_ref_ptr.
5175 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5176 location accessed in the loop by STMT_INFO, along with the def-use update
5177 chain to appropriately advance the pointer through the loop iterations.
5178 Also set aliasing information for the pointer. This pointer is used by
5179 the callers to this function to create a memory reference expression for
5180 vector load/store access.
5183 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5184 GIMPLE_ASSIGN <name, data-ref> or
5185 GIMPLE_ASSIGN <data-ref, name>.
5186 2. AGGR_TYPE: the type of the reference, which should be either a vector
5188 3. AT_LOOP: the loop where the vector memref is to be created.
5189 4. OFFSET (optional): a byte offset to be added to the initial address
5190 accessed by the data-ref in STMT_INFO.
5191 5. BSI: location where the new stmts are to be placed if there is no loop
5192 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5193 pointing to the initial address.
5194 8. IV_STEP (optional, defaults to NULL): the amount that should be added
5195 to the IV during each iteration of the loop. NULL says to move
5196 by one copy of AGGR_TYPE up or down, depending on the step of the
5200 1. Declare a new ptr to vector_type, and have it point to the base of the
5201 data reference (initial addressed accessed by the data reference).
5202 For example, for vector of type V8HI, the following code is generated:
5205 ap = (v8hi *)initial_address;
5207 if OFFSET is not supplied:
5208 initial_address = &a[init];
5209 if OFFSET is supplied:
5210 initial_address = &a[init] + OFFSET;
5211 if BYTE_OFFSET is supplied:
5212 initial_address = &a[init] + BYTE_OFFSET;
5214 Return the initial_address in INITIAL_ADDRESS.
5216 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5217 update the pointer in each iteration of the loop.
5219 Return the increment stmt that updates the pointer in PTR_INCR.
5221 3. Return the pointer. */
5224 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5225 tree aggr_type
, class loop
*at_loop
, tree offset
,
5226 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
5227 gimple
**ptr_incr
, bool only_init
,
5230 const char *base_name
;
5231 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5232 class loop
*loop
= NULL
;
5233 bool nested_in_vect_loop
= false;
5234 class loop
*containing_loop
= NULL
;
5238 gimple_seq new_stmt_list
= NULL
;
5242 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5243 struct data_reference
*dr
= dr_info
->dr
;
5245 gimple_stmt_iterator incr_gsi
;
5247 tree indx_before_incr
, indx_after_incr
;
5249 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
5251 gcc_assert (iv_step
!= NULL_TREE
5252 || TREE_CODE (aggr_type
) == ARRAY_TYPE
5253 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
5257 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5258 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5259 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5260 pe
= loop_preheader_edge (loop
);
5264 gcc_assert (bb_vinfo
);
5269 /* Create an expression for the first address accessed by this load
5271 base_name
= get_name (DR_BASE_ADDRESS (dr
));
5273 if (dump_enabled_p ())
5275 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
5276 dump_printf_loc (MSG_NOTE
, vect_location
,
5277 "create %s-pointer variable to type: %T",
5278 get_tree_code_name (TREE_CODE (aggr_type
)),
5280 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
5281 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
5282 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
5283 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
5284 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
5285 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
5287 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
5288 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
5291 /* (1) Create the new aggregate-pointer variable.
5292 Vector and array types inherit the alias set of their component
5293 type by default so we need to use a ref-all pointer if the data
5294 reference does not conflict with the created aggregated data
5295 reference because it is not addressable. */
5296 bool need_ref_all
= false;
5297 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5298 get_alias_set (DR_REF (dr
))))
5299 need_ref_all
= true;
5300 /* Likewise for any of the data references in the stmt group. */
5301 else if (DR_GROUP_SIZE (stmt_info
) > 1)
5303 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5306 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
5307 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5308 get_alias_set (DR_REF (sdr
))))
5310 need_ref_all
= true;
5313 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
5317 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
5319 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
5322 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5323 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5324 def-use update cycles for the pointer: one relative to the outer-loop
5325 (LOOP), which is what steps (3) and (4) below do. The other is relative
5326 to the inner-loop (which is the inner-most loop containing the dataref),
5327 and this is done be step (5) below.
5329 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5330 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5331 redundant. Steps (3),(4) create the following:
5334 LOOP: vp1 = phi(vp0,vp2)
5340 If there is an inner-loop nested in loop, then step (5) will also be
5341 applied, and an additional update in the inner-loop will be created:
5344 LOOP: vp1 = phi(vp0,vp2)
5346 inner: vp3 = phi(vp1,vp4)
5347 vp4 = vp3 + inner_step
5353 /* (2) Calculate the initial address of the aggregate-pointer, and set
5354 the aggregate-pointer to point to it before the loop. */
5356 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5358 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
5359 stmt_info
, &new_stmt_list
,
5365 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
5366 gcc_assert (!new_bb
);
5369 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
5372 *initial_address
= new_temp
;
5373 aggr_ptr_init
= new_temp
;
5375 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5376 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5377 inner-loop nested in LOOP (during outer-loop vectorization). */
5379 /* No update in loop is required. */
5380 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
5381 aptr
= aggr_ptr_init
;
5384 /* Accesses to invariant addresses should be handled specially
5386 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
5387 gcc_assert (!integer_zerop (step
));
5389 if (iv_step
== NULL_TREE
)
5391 /* The step of the aggregate pointer is the type size,
5392 negated for downward accesses. */
5393 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
5394 if (tree_int_cst_sgn (step
) == -1)
5395 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
5398 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5400 create_iv (aggr_ptr_init
, PLUS_EXPR
,
5401 fold_convert (aggr_ptr_type
, iv_step
),
5402 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
5403 &indx_before_incr
, &indx_after_incr
);
5404 incr
= gsi_stmt (incr_gsi
);
5406 /* Copy the points-to information if it exists. */
5407 if (DR_PTR_INFO (dr
))
5409 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5410 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5415 aptr
= indx_before_incr
;
5418 if (!nested_in_vect_loop
|| only_init
)
5422 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5423 nested in LOOP, if exists. */
5425 gcc_assert (nested_in_vect_loop
);
5428 standard_iv_increment_position (containing_loop
, &incr_gsi
,
5430 create_iv (aptr
, PLUS_EXPR
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)),
5431 aggr_ptr
, containing_loop
, &incr_gsi
, insert_after
,
5432 &indx_before_incr
, &indx_after_incr
);
5433 incr
= gsi_stmt (incr_gsi
);
5435 /* Copy the points-to information if it exists. */
5436 if (DR_PTR_INFO (dr
))
5438 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5439 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5444 return indx_before_incr
;
5451 /* Function bump_vector_ptr
5453 Increment a pointer (to a vector type) by vector-size. If requested,
5454 i.e. if PTR-INCR is given, then also connect the new increment stmt
5455 to the existing def-use update-chain of the pointer, by modifying
5456 the PTR_INCR as illustrated below:
5458 The pointer def-use update-chain before this function:
5459 DATAREF_PTR = phi (p_0, p_2)
5461 PTR_INCR: p_2 = DATAREF_PTR + step
5463 The pointer def-use update-chain after this function:
5464 DATAREF_PTR = phi (p_0, p_2)
5466 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5468 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5471 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5473 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5474 the loop. The increment amount across iterations is expected
5476 BSI - location where the new update stmt is to be placed.
5477 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5478 BUMP - optional. The offset by which to bump the pointer. If not given,
5479 the offset is assumed to be vector_size.
5481 Output: Return NEW_DATAREF_PTR as illustrated above.
5486 bump_vector_ptr (vec_info
*vinfo
,
5487 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
5488 stmt_vec_info stmt_info
, tree bump
)
5490 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5491 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5492 tree update
= TYPE_SIZE_UNIT (vectype
);
5495 use_operand_p use_p
;
5496 tree new_dataref_ptr
;
5501 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5502 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5503 else if (is_gimple_min_invariant (dataref_ptr
))
5504 /* When possible avoid emitting a separate increment stmt that will
5505 force the addressed object addressable. */
5506 return build1 (ADDR_EXPR
, TREE_TYPE (dataref_ptr
),
5507 fold_build2 (MEM_REF
,
5508 TREE_TYPE (TREE_TYPE (dataref_ptr
)),
5510 fold_convert (ptr_type_node
, update
)));
5512 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
5513 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5514 dataref_ptr
, update
);
5515 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
5516 /* Fold the increment, avoiding excessive chains use-def chains of
5517 those, leading to compile-time issues for passes until the next
5518 forwprop pass which would do this as well. */
5519 gimple_stmt_iterator fold_gsi
= gsi_for_stmt (incr_stmt
);
5520 if (fold_stmt (&fold_gsi
, follow_all_ssa_edges
))
5522 incr_stmt
= gsi_stmt (fold_gsi
);
5523 update_stmt (incr_stmt
);
5526 /* Copy the points-to information if it exists. */
5527 if (DR_PTR_INFO (dr
))
5529 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5530 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5534 return new_dataref_ptr
;
5536 /* Update the vector-pointer's cross-iteration increment. */
5537 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5539 tree use
= USE_FROM_PTR (use_p
);
5541 if (use
== dataref_ptr
)
5542 SET_USE (use_p
, new_dataref_ptr
);
5544 gcc_assert (operand_equal_p (use
, update
, 0));
5547 return new_dataref_ptr
;
5551 /* Copy memory reference info such as base/clique from the SRC reference
5552 to the DEST MEM_REF. */
5555 vect_copy_ref_info (tree dest
, tree src
)
5557 if (TREE_CODE (dest
) != MEM_REF
)
5560 tree src_base
= src
;
5561 while (handled_component_p (src_base
))
5562 src_base
= TREE_OPERAND (src_base
, 0);
5563 if (TREE_CODE (src_base
) != MEM_REF
5564 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
5567 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
5568 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
5572 /* Function vect_create_destination_var.
5574 Create a new temporary of type VECTYPE. */
5577 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5583 enum vect_var_kind kind
;
5586 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5590 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5592 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5594 name
= get_name (scalar_dest
);
5596 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5598 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5599 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5605 /* Function vect_grouped_store_supported.
5607 Returns TRUE if interleave high and interleave low permutations
5608 are supported, and FALSE otherwise. */
5611 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5613 machine_mode mode
= TYPE_MODE (vectype
);
5615 /* vect_permute_store_chain requires the group size to be equal to 3 or
5616 be a power of two. */
5617 if (count
!= 3 && exact_log2 (count
) == -1)
5619 if (dump_enabled_p ())
5620 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5621 "the size of the group of accesses"
5622 " is not a power of 2 or not eqaul to 3\n");
5626 /* Check that the permutation is supported. */
5627 if (VECTOR_MODE_P (mode
))
5632 unsigned int j0
= 0, j1
= 0, j2
= 0;
5636 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5640 "cannot handle groups of 3 stores for"
5641 " variable-length vectors\n");
5645 vec_perm_builder
sel (nelt
, nelt
, 1);
5646 sel
.quick_grow (nelt
);
5647 vec_perm_indices indices
;
5648 for (j
= 0; j
< 3; j
++)
5650 int nelt0
= ((3 - j
) * nelt
) % 3;
5651 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5652 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5653 for (i
= 0; i
< nelt
; i
++)
5655 if (3 * i
+ nelt0
< nelt
)
5656 sel
[3 * i
+ nelt0
] = j0
++;
5657 if (3 * i
+ nelt1
< nelt
)
5658 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5659 if (3 * i
+ nelt2
< nelt
)
5660 sel
[3 * i
+ nelt2
] = 0;
5662 indices
.new_vector (sel
, 2, nelt
);
5663 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5665 if (dump_enabled_p ())
5666 dump_printf (MSG_MISSED_OPTIMIZATION
,
5667 "permutation op not supported by target.\n");
5671 for (i
= 0; i
< nelt
; i
++)
5673 if (3 * i
+ nelt0
< nelt
)
5674 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5675 if (3 * i
+ nelt1
< nelt
)
5676 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5677 if (3 * i
+ nelt2
< nelt
)
5678 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5680 indices
.new_vector (sel
, 2, nelt
);
5681 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5683 if (dump_enabled_p ())
5684 dump_printf (MSG_MISSED_OPTIMIZATION
,
5685 "permutation op not supported by target.\n");
5693 /* If length is not equal to 3 then only power of 2 is supported. */
5694 gcc_assert (pow2p_hwi (count
));
5695 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5697 /* The encoding has 2 interleaved stepped patterns. */
5698 if(!multiple_p (nelt
, 2))
5700 vec_perm_builder
sel (nelt
, 2, 3);
5702 for (i
= 0; i
< 3; i
++)
5705 sel
[i
* 2 + 1] = i
+ nelt
;
5707 vec_perm_indices
indices (sel
, 2, nelt
);
5708 if (can_vec_perm_const_p (mode
, mode
, indices
))
5710 for (i
= 0; i
< 6; i
++)
5711 sel
[i
] += exact_div (nelt
, 2);
5712 indices
.new_vector (sel
, 2, nelt
);
5713 if (can_vec_perm_const_p (mode
, mode
, indices
))
5719 if (dump_enabled_p ())
5720 dump_printf (MSG_MISSED_OPTIMIZATION
,
5721 "permutation op not supported by target.\n");
5725 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5726 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5729 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5732 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5733 vec_mask_len_store_lanes_optab
, vectype
,
5735 return IFN_MASK_LEN_STORE_LANES
;
5738 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5739 vec_mask_store_lanes_optab
, vectype
,
5741 return IFN_MASK_STORE_LANES
;
5745 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5746 vec_store_lanes_optab
, vectype
, count
))
5747 return IFN_STORE_LANES
;
5753 /* Function vect_permute_store_chain.
5755 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5756 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5757 the data correctly for the stores. Return the final references for stores
5760 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5761 The input is 4 vectors each containing 8 elements. We assign a number to
5762 each element, the input sequence is:
5764 1st vec: 0 1 2 3 4 5 6 7
5765 2nd vec: 8 9 10 11 12 13 14 15
5766 3rd vec: 16 17 18 19 20 21 22 23
5767 4th vec: 24 25 26 27 28 29 30 31
5769 The output sequence should be:
5771 1st vec: 0 8 16 24 1 9 17 25
5772 2nd vec: 2 10 18 26 3 11 19 27
5773 3rd vec: 4 12 20 28 5 13 21 30
5774 4th vec: 6 14 22 30 7 15 23 31
5776 i.e., we interleave the contents of the four vectors in their order.
5778 We use interleave_high/low instructions to create such output. The input of
5779 each interleave_high/low operation is two vectors:
5782 the even elements of the result vector are obtained left-to-right from the
5783 high/low elements of the first vector. The odd elements of the result are
5784 obtained left-to-right from the high/low elements of the second vector.
5785 The output of interleave_high will be: 0 4 1 5
5786 and of interleave_low: 2 6 3 7
5789 The permutation is done in log LENGTH stages. In each stage interleave_high
5790 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5791 where the first argument is taken from the first half of DR_CHAIN and the
5792 second argument from it's second half.
5795 I1: interleave_high (1st vec, 3rd vec)
5796 I2: interleave_low (1st vec, 3rd vec)
5797 I3: interleave_high (2nd vec, 4th vec)
5798 I4: interleave_low (2nd vec, 4th vec)
5800 The output for the first stage is:
5802 I1: 0 16 1 17 2 18 3 19
5803 I2: 4 20 5 21 6 22 7 23
5804 I3: 8 24 9 25 10 26 11 27
5805 I4: 12 28 13 29 14 30 15 31
5807 The output of the second stage, i.e. the final result is:
5809 I1: 0 8 16 24 1 9 17 25
5810 I2: 2 10 18 26 3 11 19 27
5811 I3: 4 12 20 28 5 13 21 30
5812 I4: 6 14 22 30 7 15 23 31. */
5815 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> &dr_chain
,
5816 unsigned int length
,
5817 stmt_vec_info stmt_info
,
5818 gimple_stmt_iterator
*gsi
,
5819 vec
<tree
> *result_chain
)
5821 tree vect1
, vect2
, high
, low
;
5823 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5824 tree perm_mask_low
, perm_mask_high
;
5826 tree perm3_mask_low
, perm3_mask_high
;
5827 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5829 result_chain
->quick_grow (length
);
5830 memcpy (result_chain
->address (), dr_chain
.address (),
5831 length
* sizeof (tree
));
5835 /* vect_grouped_store_supported ensures that this is constant. */
5836 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5837 unsigned int j0
= 0, j1
= 0, j2
= 0;
5839 vec_perm_builder
sel (nelt
, nelt
, 1);
5840 sel
.quick_grow (nelt
);
5841 vec_perm_indices indices
;
5842 for (j
= 0; j
< 3; j
++)
5844 int nelt0
= ((3 - j
) * nelt
) % 3;
5845 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5846 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5848 for (i
= 0; i
< nelt
; i
++)
5850 if (3 * i
+ nelt0
< nelt
)
5851 sel
[3 * i
+ nelt0
] = j0
++;
5852 if (3 * i
+ nelt1
< nelt
)
5853 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5854 if (3 * i
+ nelt2
< nelt
)
5855 sel
[3 * i
+ nelt2
] = 0;
5857 indices
.new_vector (sel
, 2, nelt
);
5858 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5860 for (i
= 0; i
< nelt
; i
++)
5862 if (3 * i
+ nelt0
< nelt
)
5863 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5864 if (3 * i
+ nelt1
< nelt
)
5865 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5866 if (3 * i
+ nelt2
< nelt
)
5867 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5869 indices
.new_vector (sel
, 2, nelt
);
5870 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5872 vect1
= dr_chain
[0];
5873 vect2
= dr_chain
[1];
5875 /* Create interleaving stmt:
5876 low = VEC_PERM_EXPR <vect1, vect2,
5877 {j, nelt, *, j + 1, nelt + j + 1, *,
5878 j + 2, nelt + j + 2, *, ...}> */
5879 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5880 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5881 vect2
, perm3_mask_low
);
5882 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5885 vect2
= dr_chain
[2];
5886 /* Create interleaving stmt:
5887 low = VEC_PERM_EXPR <vect1, vect2,
5888 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5889 6, 7, nelt + j + 2, ...}> */
5890 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5891 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5892 vect2
, perm3_mask_high
);
5893 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5894 (*result_chain
)[j
] = data_ref
;
5899 /* If length is not equal to 3 then only power of 2 is supported. */
5900 gcc_assert (pow2p_hwi (length
));
5902 /* The encoding has 2 interleaved stepped patterns. */
5903 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5904 vec_perm_builder
sel (nelt
, 2, 3);
5906 for (i
= 0; i
< 3; i
++)
5909 sel
[i
* 2 + 1] = i
+ nelt
;
5911 vec_perm_indices
indices (sel
, 2, nelt
);
5912 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5914 for (i
= 0; i
< 6; i
++)
5915 sel
[i
] += exact_div (nelt
, 2);
5916 indices
.new_vector (sel
, 2, nelt
);
5917 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5919 for (i
= 0, n
= log_length
; i
< n
; i
++)
5921 for (j
= 0; j
< length
/2; j
++)
5923 vect1
= dr_chain
[j
];
5924 vect2
= dr_chain
[j
+length
/2];
5926 /* Create interleaving stmt:
5927 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5929 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5930 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5931 vect2
, perm_mask_high
);
5932 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5933 (*result_chain
)[2*j
] = high
;
5935 /* Create interleaving stmt:
5936 low = VEC_PERM_EXPR <vect1, vect2,
5937 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5939 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5940 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5941 vect2
, perm_mask_low
);
5942 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5943 (*result_chain
)[2*j
+1] = low
;
5945 memcpy (dr_chain
.address (), result_chain
->address (),
5946 length
* sizeof (tree
));
5951 /* Function vect_setup_realignment
5953 This function is called when vectorizing an unaligned load using
5954 the dr_explicit_realign[_optimized] scheme.
5955 This function generates the following code at the loop prolog:
5958 x msq_init = *(floor(p)); # prolog load
5959 realignment_token = call target_builtin;
5961 x msq = phi (msq_init, ---)
5963 The stmts marked with x are generated only for the case of
5964 dr_explicit_realign_optimized.
5966 The code above sets up a new (vector) pointer, pointing to the first
5967 location accessed by STMT_INFO, and a "floor-aligned" load using that
5968 pointer. It also generates code to compute the "realignment-token"
5969 (if the relevant target hook was defined), and creates a phi-node at the
5970 loop-header bb whose arguments are the result of the prolog-load (created
5971 by this function) and the result of a load that takes place in the loop
5972 (to be created by the caller to this function).
5974 For the case of dr_explicit_realign_optimized:
5975 The caller to this function uses the phi-result (msq) to create the
5976 realignment code inside the loop, and sets up the missing phi argument,
5979 msq = phi (msq_init, lsq)
5980 lsq = *(floor(p')); # load in loop
5981 result = realign_load (msq, lsq, realignment_token);
5983 For the case of dr_explicit_realign:
5985 msq = *(floor(p)); # load in loop
5987 lsq = *(floor(p')); # load in loop
5988 result = realign_load (msq, lsq, realignment_token);
5991 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5992 a memory location that may be unaligned.
5993 BSI - place where new code is to be inserted.
5994 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5998 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5999 target hook, if defined.
6000 Return value - the result of the loop-header phi node. */
6003 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6004 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
6005 enum dr_alignment_support alignment_support_scheme
,
6007 class loop
**at_loop
)
6009 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6010 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6011 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
6012 struct data_reference
*dr
= dr_info
->dr
;
6013 class loop
*loop
= NULL
;
6015 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
6021 tree msq_init
= NULL_TREE
;
6024 tree msq
= NULL_TREE
;
6025 gimple_seq stmts
= NULL
;
6026 bool compute_in_loop
= false;
6027 bool nested_in_vect_loop
= false;
6028 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
6029 class loop
*loop_for_initial_load
= NULL
;
6033 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6034 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
6037 gcc_assert (alignment_support_scheme
== dr_explicit_realign
6038 || alignment_support_scheme
== dr_explicit_realign_optimized
);
6040 /* We need to generate three things:
6041 1. the misalignment computation
6042 2. the extra vector load (for the optimized realignment scheme).
6043 3. the phi node for the two vectors from which the realignment is
6044 done (for the optimized realignment scheme). */
6046 /* 1. Determine where to generate the misalignment computation.
6048 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6049 calculation will be generated by this function, outside the loop (in the
6050 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6051 caller, inside the loop.
6053 Background: If the misalignment remains fixed throughout the iterations of
6054 the loop, then both realignment schemes are applicable, and also the
6055 misalignment computation can be done outside LOOP. This is because we are
6056 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6057 are a multiple of VS (the Vector Size), and therefore the misalignment in
6058 different vectorized LOOP iterations is always the same.
6059 The problem arises only if the memory access is in an inner-loop nested
6060 inside LOOP, which is now being vectorized using outer-loop vectorization.
6061 This is the only case when the misalignment of the memory access may not
6062 remain fixed throughout the iterations of the inner-loop (as explained in
6063 detail in vect_supportable_dr_alignment). In this case, not only is the
6064 optimized realignment scheme not applicable, but also the misalignment
6065 computation (and generation of the realignment token that is passed to
6066 REALIGN_LOAD) have to be done inside the loop.
6068 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6069 or not, which in turn determines if the misalignment is computed inside
6070 the inner-loop, or outside LOOP. */
6072 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
6074 compute_in_loop
= true;
6075 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
6079 /* 2. Determine where to generate the extra vector load.
6081 For the optimized realignment scheme, instead of generating two vector
6082 loads in each iteration, we generate a single extra vector load in the
6083 preheader of the loop, and in each iteration reuse the result of the
6084 vector load from the previous iteration. In case the memory access is in
6085 an inner-loop nested inside LOOP, which is now being vectorized using
6086 outer-loop vectorization, we need to determine whether this initial vector
6087 load should be generated at the preheader of the inner-loop, or can be
6088 generated at the preheader of LOOP. If the memory access has no evolution
6089 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6090 to be generated inside LOOP (in the preheader of the inner-loop). */
6092 if (nested_in_vect_loop
)
6094 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
6095 bool invariant_in_outerloop
=
6096 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
6097 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
6100 loop_for_initial_load
= loop
;
6102 *at_loop
= loop_for_initial_load
;
6104 tree vuse
= NULL_TREE
;
6105 if (loop_for_initial_load
)
6107 pe
= loop_preheader_edge (loop_for_initial_load
);
6108 if (gphi
*vphi
= get_virtual_phi (loop_for_initial_load
->header
))
6109 vuse
= PHI_ARG_DEF_FROM_EDGE (vphi
, pe
);
6112 vuse
= gimple_vuse (gsi_stmt (*gsi
));
6114 /* 3. For the case of the optimized realignment, create the first vector
6115 load at the loop preheader. */
6117 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
6119 /* Create msq_init = *(floor(p1)) in the loop preheader */
6122 gcc_assert (!compute_in_loop
);
6123 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
6124 ptr
= vect_create_data_ref_ptr (vinfo
, stmt_info
, vectype
,
6125 loop_for_initial_load
, NULL_TREE
,
6126 &init_addr
, NULL
, &inc
, true);
6127 if (TREE_CODE (ptr
) == SSA_NAME
)
6128 new_temp
= copy_ssa_name (ptr
);
6130 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
6131 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
6132 tree type
= TREE_TYPE (ptr
);
6133 new_stmt
= gimple_build_assign
6134 (new_temp
, BIT_AND_EXPR
, ptr
,
6135 fold_build2 (MINUS_EXPR
, type
,
6136 build_int_cst (type
, 0),
6137 build_int_cst (type
, align
)));
6138 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6139 gcc_assert (!new_bb
);
6141 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
6142 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
6143 vect_copy_ref_info (data_ref
, DR_REF (dr
));
6144 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
6145 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6146 gimple_assign_set_lhs (new_stmt
, new_temp
);
6147 gimple_set_vuse (new_stmt
, vuse
);
6150 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6151 gcc_assert (!new_bb
);
6154 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6156 msq_init
= gimple_assign_lhs (new_stmt
);
6159 /* 4. Create realignment token using a target builtin, if available.
6160 It is done either inside the containing loop, or before LOOP (as
6161 determined above). */
6163 if (targetm
.vectorize
.builtin_mask_for_load
)
6168 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
6171 /* Generate the INIT_ADDR computation outside LOOP. */
6172 init_addr
= vect_create_addr_base_for_vector_ref (vinfo
,
6177 pe
= loop_preheader_edge (loop
);
6178 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
6179 gcc_assert (!new_bb
);
6182 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
6185 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
6186 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
6188 vect_create_destination_var (scalar_dest
,
6189 gimple_call_return_type (new_stmt
));
6190 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6191 gimple_call_set_lhs (new_stmt
, new_temp
);
6193 if (compute_in_loop
)
6194 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6197 /* Generate the misalignment computation outside LOOP. */
6198 pe
= loop_preheader_edge (loop
);
6199 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6200 gcc_assert (!new_bb
);
6203 *realignment_token
= gimple_call_lhs (new_stmt
);
6205 /* The result of the CALL_EXPR to this builtin is determined from
6206 the value of the parameter and no global variables are touched
6207 which makes the builtin a "const" function. Requiring the
6208 builtin to have the "const" attribute makes it unnecessary
6209 to call mark_call_clobbered. */
6210 gcc_assert (TREE_READONLY (builtin_decl
));
6213 if (alignment_support_scheme
== dr_explicit_realign
)
6216 gcc_assert (!compute_in_loop
);
6217 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
6220 /* 5. Create msq = phi <msq_init, lsq> in loop */
6222 pe
= loop_preheader_edge (containing_loop
);
6223 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
6224 msq
= make_ssa_name (vec_dest
);
6225 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
6226 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
6232 /* Function vect_grouped_load_supported.
6234 COUNT is the size of the load group (the number of statements plus the
6235 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6236 only one statement, with a gap of COUNT - 1.
6238 Returns true if a suitable permute exists. */
6241 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
6242 unsigned HOST_WIDE_INT count
)
6244 machine_mode mode
= TYPE_MODE (vectype
);
6246 /* If this is single-element interleaving with an element distance
6247 that leaves unused vector loads around punt - we at least create
6248 very sub-optimal code in that case (and blow up memory,
6250 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6254 "single-element interleaving not supported "
6255 "for not adjacent vector loads\n");
6259 /* vect_permute_load_chain requires the group size to be equal to 3 or
6260 be a power of two. */
6261 if (count
!= 3 && exact_log2 (count
) == -1)
6263 if (dump_enabled_p ())
6264 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6265 "the size of the group of accesses"
6266 " is not a power of 2 or not equal to 3\n");
6270 /* Check that the permutation is supported. */
6271 if (VECTOR_MODE_P (mode
))
6277 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
6279 if (dump_enabled_p ())
6280 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6281 "cannot handle groups of 3 loads for"
6282 " variable-length vectors\n");
6286 vec_perm_builder
sel (nelt
, nelt
, 1);
6287 sel
.quick_grow (nelt
);
6288 vec_perm_indices indices
;
6290 for (k
= 0; k
< 3; k
++)
6292 for (i
= 0; i
< nelt
; i
++)
6293 if (3 * i
+ k
< 2 * nelt
)
6297 indices
.new_vector (sel
, 2, nelt
);
6298 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6300 if (dump_enabled_p ())
6301 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6302 "shuffle of 3 loads is not supported by"
6306 for (i
= 0, j
= 0; i
< nelt
; i
++)
6307 if (3 * i
+ k
< 2 * nelt
)
6310 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6311 indices
.new_vector (sel
, 2, nelt
);
6312 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6316 "shuffle of 3 loads is not supported by"
6325 /* If length is not equal to 3 then only power of 2 is supported. */
6326 gcc_assert (pow2p_hwi (count
));
6327 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
6329 /* The encoding has a single stepped pattern. */
6330 vec_perm_builder
sel (nelt
, 1, 3);
6332 for (i
= 0; i
< 3; i
++)
6334 vec_perm_indices
indices (sel
, 2, nelt
);
6335 if (can_vec_perm_const_p (mode
, mode
, indices
))
6337 for (i
= 0; i
< 3; i
++)
6339 indices
.new_vector (sel
, 2, nelt
);
6340 if (can_vec_perm_const_p (mode
, mode
, indices
))
6346 if (dump_enabled_p ())
6347 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6348 "extract even/odd not supported by target\n");
6352 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6353 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6356 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
6359 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6360 vec_mask_len_load_lanes_optab
, vectype
,
6362 return IFN_MASK_LEN_LOAD_LANES
;
6365 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6366 vec_mask_load_lanes_optab
, vectype
,
6368 return IFN_MASK_LOAD_LANES
;
6372 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab
,
6374 return IFN_LOAD_LANES
;
6379 /* Function vect_permute_load_chain.
6381 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6382 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6383 the input data correctly. Return the final references for loads in
6386 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6387 The input is 4 vectors each containing 8 elements. We assign a number to each
6388 element, the input sequence is:
6390 1st vec: 0 1 2 3 4 5 6 7
6391 2nd vec: 8 9 10 11 12 13 14 15
6392 3rd vec: 16 17 18 19 20 21 22 23
6393 4th vec: 24 25 26 27 28 29 30 31
6395 The output sequence should be:
6397 1st vec: 0 4 8 12 16 20 24 28
6398 2nd vec: 1 5 9 13 17 21 25 29
6399 3rd vec: 2 6 10 14 18 22 26 30
6400 4th vec: 3 7 11 15 19 23 27 31
6402 i.e., the first output vector should contain the first elements of each
6403 interleaving group, etc.
6405 We use extract_even/odd instructions to create such output. The input of
6406 each extract_even/odd operation is two vectors
6410 and the output is the vector of extracted even/odd elements. The output of
6411 extract_even will be: 0 2 4 6
6412 and of extract_odd: 1 3 5 7
6415 The permutation is done in log LENGTH stages. In each stage extract_even
6416 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6417 their order. In our example,
6419 E1: extract_even (1st vec, 2nd vec)
6420 E2: extract_odd (1st vec, 2nd vec)
6421 E3: extract_even (3rd vec, 4th vec)
6422 E4: extract_odd (3rd vec, 4th vec)
6424 The output for the first stage will be:
6426 E1: 0 2 4 6 8 10 12 14
6427 E2: 1 3 5 7 9 11 13 15
6428 E3: 16 18 20 22 24 26 28 30
6429 E4: 17 19 21 23 25 27 29 31
6431 In order to proceed and create the correct sequence for the next stage (or
6432 for the correct output, if the second stage is the last one, as in our
6433 example), we first put the output of extract_even operation and then the
6434 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6435 The input for the second stage is:
6437 1st vec (E1): 0 2 4 6 8 10 12 14
6438 2nd vec (E3): 16 18 20 22 24 26 28 30
6439 3rd vec (E2): 1 3 5 7 9 11 13 15
6440 4th vec (E4): 17 19 21 23 25 27 29 31
6442 The output of the second stage:
6444 E1: 0 4 8 12 16 20 24 28
6445 E2: 2 6 10 14 18 22 26 30
6446 E3: 1 5 9 13 17 21 25 29
6447 E4: 3 7 11 15 19 23 27 31
6449 And RESULT_CHAIN after reordering:
6451 1st vec (E1): 0 4 8 12 16 20 24 28
6452 2nd vec (E3): 1 5 9 13 17 21 25 29
6453 3rd vec (E2): 2 6 10 14 18 22 26 30
6454 4th vec (E4): 3 7 11 15 19 23 27 31. */
6457 vect_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6458 unsigned int length
,
6459 stmt_vec_info stmt_info
,
6460 gimple_stmt_iterator
*gsi
,
6461 vec
<tree
> *result_chain
)
6463 tree data_ref
, first_vect
, second_vect
;
6464 tree perm_mask_even
, perm_mask_odd
;
6465 tree perm3_mask_low
, perm3_mask_high
;
6467 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6468 unsigned int i
, j
, log_length
= exact_log2 (length
);
6470 result_chain
->quick_grow (length
);
6471 memcpy (result_chain
->address (), dr_chain
.address (),
6472 length
* sizeof (tree
));
6476 /* vect_grouped_load_supported ensures that this is constant. */
6477 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6480 vec_perm_builder
sel (nelt
, nelt
, 1);
6481 sel
.quick_grow (nelt
);
6482 vec_perm_indices indices
;
6483 for (k
= 0; k
< 3; k
++)
6485 for (i
= 0; i
< nelt
; i
++)
6486 if (3 * i
+ k
< 2 * nelt
)
6490 indices
.new_vector (sel
, 2, nelt
);
6491 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
6493 for (i
= 0, j
= 0; i
< nelt
; i
++)
6494 if (3 * i
+ k
< 2 * nelt
)
6497 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6498 indices
.new_vector (sel
, 2, nelt
);
6499 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6501 first_vect
= dr_chain
[0];
6502 second_vect
= dr_chain
[1];
6504 /* Create interleaving stmt (low part of):
6505 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6507 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
6508 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6509 second_vect
, perm3_mask_low
);
6510 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6512 /* Create interleaving stmt (high part of):
6513 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6515 first_vect
= data_ref
;
6516 second_vect
= dr_chain
[2];
6517 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6518 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6519 second_vect
, perm3_mask_high
);
6520 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6521 (*result_chain
)[k
] = data_ref
;
6526 /* If length is not equal to 3 then only power of 2 is supported. */
6527 gcc_assert (pow2p_hwi (length
));
6529 /* The encoding has a single stepped pattern. */
6530 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6531 vec_perm_builder
sel (nelt
, 1, 3);
6533 for (i
= 0; i
< 3; ++i
)
6535 vec_perm_indices
indices (sel
, 2, nelt
);
6536 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
6538 for (i
= 0; i
< 3; ++i
)
6540 indices
.new_vector (sel
, 2, nelt
);
6541 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
6543 for (i
= 0; i
< log_length
; i
++)
6545 for (j
= 0; j
< length
; j
+= 2)
6547 first_vect
= dr_chain
[j
];
6548 second_vect
= dr_chain
[j
+1];
6550 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6551 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
6552 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6553 first_vect
, second_vect
,
6555 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6556 (*result_chain
)[j
/2] = data_ref
;
6558 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6559 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
6560 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6561 first_vect
, second_vect
,
6563 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6564 (*result_chain
)[j
/2+length
/2] = data_ref
;
6566 memcpy (dr_chain
.address (), result_chain
->address (),
6567 length
* sizeof (tree
));
6572 /* Function vect_shift_permute_load_chain.
6574 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6575 sequence of stmts to reorder the input data accordingly.
6576 Return the final references for loads in RESULT_CHAIN.
6577 Return true if successed, false otherwise.
6579 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6580 The input is 3 vectors each containing 8 elements. We assign a
6581 number to each element, the input sequence is:
6583 1st vec: 0 1 2 3 4 5 6 7
6584 2nd vec: 8 9 10 11 12 13 14 15
6585 3rd vec: 16 17 18 19 20 21 22 23
6587 The output sequence should be:
6589 1st vec: 0 3 6 9 12 15 18 21
6590 2nd vec: 1 4 7 10 13 16 19 22
6591 3rd vec: 2 5 8 11 14 17 20 23
6593 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6595 First we shuffle all 3 vectors to get correct elements order:
6597 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6598 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6599 3rd vec: (16 19 22) (17 20 23) (18 21)
6601 Next we unite and shift vector 3 times:
6604 shift right by 6 the concatenation of:
6605 "1st vec" and "2nd vec"
6606 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6607 "2nd vec" and "3rd vec"
6608 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6609 "3rd vec" and "1st vec"
6610 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6613 So that now new vectors are:
6615 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6616 2nd vec: (10 13) (16 19 22) (17 20 23)
6617 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6620 shift right by 5 the concatenation of:
6621 "1st vec" and "3rd vec"
6622 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6623 "2nd vec" and "1st vec"
6624 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6625 "3rd vec" and "2nd vec"
6626 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6629 So that now new vectors are:
6631 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6632 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6633 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6636 shift right by 5 the concatenation of:
6637 "1st vec" and "1st vec"
6638 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6639 shift right by 3 the concatenation of:
6640 "2nd vec" and "2nd vec"
6641 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6644 So that now all vectors are READY:
6645 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6646 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6647 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6649 This algorithm is faster than one in vect_permute_load_chain if:
6650 1. "shift of a concatination" is faster than general permutation.
6652 2. The TARGET machine can't execute vector instructions in parallel.
6653 This is because each step of the algorithm depends on previous.
6654 The algorithm in vect_permute_load_chain is much more parallel.
6656 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6660 vect_shift_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6661 unsigned int length
,
6662 stmt_vec_info stmt_info
,
6663 gimple_stmt_iterator
*gsi
,
6664 vec
<tree
> *result_chain
)
6666 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6667 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6668 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6671 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6672 machine_mode vmode
= TYPE_MODE (vectype
);
6674 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6676 unsigned HOST_WIDE_INT nelt
, vf
;
6677 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6678 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6679 /* Not supported for variable-length vectors. */
6682 vec_perm_builder
sel (nelt
, nelt
, 1);
6683 sel
.quick_grow (nelt
);
6685 result_chain
->quick_grow (length
);
6686 memcpy (result_chain
->address (), dr_chain
.address (),
6687 length
* sizeof (tree
));
6689 if (pow2p_hwi (length
) && vf
> 4)
6691 unsigned int j
, log_length
= exact_log2 (length
);
6692 for (i
= 0; i
< nelt
/ 2; ++i
)
6694 for (i
= 0; i
< nelt
/ 2; ++i
)
6695 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6696 vec_perm_indices
indices (sel
, 2, nelt
);
6697 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6701 "shuffle of 2 fields structure is not \
6702 supported by target\n");
6705 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6707 for (i
= 0; i
< nelt
/ 2; ++i
)
6709 for (i
= 0; i
< nelt
/ 2; ++i
)
6710 sel
[nelt
/ 2 + i
] = i
* 2;
6711 indices
.new_vector (sel
, 2, nelt
);
6712 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6714 if (dump_enabled_p ())
6715 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6716 "shuffle of 2 fields structure is not \
6717 supported by target\n");
6720 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6722 /* Generating permutation constant to shift all elements.
6723 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6724 for (i
= 0; i
< nelt
; i
++)
6725 sel
[i
] = nelt
/ 2 + i
;
6726 indices
.new_vector (sel
, 2, nelt
);
6727 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6729 if (dump_enabled_p ())
6730 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6731 "shift permutation is not supported by target\n");
6734 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6736 /* Generating permutation constant to select vector from 2.
6737 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6738 for (i
= 0; i
< nelt
/ 2; i
++)
6740 for (i
= nelt
/ 2; i
< nelt
; i
++)
6742 indices
.new_vector (sel
, 2, nelt
);
6743 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6747 "select is not supported by target\n");
6750 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6752 for (i
= 0; i
< log_length
; i
++)
6754 for (j
= 0; j
< length
; j
+= 2)
6756 first_vect
= dr_chain
[j
];
6757 second_vect
= dr_chain
[j
+ 1];
6759 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6760 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6761 first_vect
, first_vect
,
6763 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6766 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6767 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6768 second_vect
, second_vect
,
6770 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6773 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6774 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6775 vect
[0], vect
[1], shift1_mask
);
6776 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6777 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6779 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6780 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6781 vect
[0], vect
[1], select_mask
);
6782 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6783 (*result_chain
)[j
/2] = data_ref
;
6785 memcpy (dr_chain
.address (), result_chain
->address (),
6786 length
* sizeof (tree
));
6790 if (length
== 3 && vf
> 2)
6792 unsigned int k
= 0, l
= 0;
6794 /* Generating permutation constant to get all elements in rigth order.
6795 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6796 for (i
= 0; i
< nelt
; i
++)
6798 if (3 * k
+ (l
% 3) >= nelt
)
6801 l
+= (3 - (nelt
% 3));
6803 sel
[i
] = 3 * k
+ (l
% 3);
6806 vec_perm_indices
indices (sel
, 2, nelt
);
6807 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6811 "shuffle of 3 fields structure is not \
6812 supported by target\n");
6815 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6817 /* Generating permutation constant to shift all elements.
6818 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6819 for (i
= 0; i
< nelt
; i
++)
6820 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6821 indices
.new_vector (sel
, 2, nelt
);
6822 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6824 if (dump_enabled_p ())
6825 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6826 "shift permutation is not supported by target\n");
6829 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6831 /* Generating permutation constant to shift all elements.
6832 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6833 for (i
= 0; i
< nelt
; i
++)
6834 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6835 indices
.new_vector (sel
, 2, nelt
);
6836 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6840 "shift permutation is not supported by target\n");
6843 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6845 /* Generating permutation constant to shift all elements.
6846 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6847 for (i
= 0; i
< nelt
; i
++)
6848 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6849 indices
.new_vector (sel
, 2, nelt
);
6850 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6854 "shift permutation is not supported by target\n");
6857 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6859 /* Generating permutation constant to shift all elements.
6860 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6861 for (i
= 0; i
< nelt
; i
++)
6862 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6863 indices
.new_vector (sel
, 2, nelt
);
6864 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6868 "shift permutation is not supported by target\n");
6871 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6873 for (k
= 0; k
< 3; k
++)
6875 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6876 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6877 dr_chain
[k
], dr_chain
[k
],
6879 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6883 for (k
= 0; k
< 3; k
++)
6885 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6886 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6887 vect
[k
% 3], vect
[(k
+ 1) % 3],
6889 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6890 vect_shift
[k
] = data_ref
;
6893 for (k
= 0; k
< 3; k
++)
6895 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6896 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6897 vect_shift
[(4 - k
) % 3],
6898 vect_shift
[(3 - k
) % 3],
6900 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6904 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6906 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6907 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6908 vect
[0], shift3_mask
);
6909 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6910 (*result_chain
)[nelt
% 3] = data_ref
;
6912 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6913 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6914 vect
[1], shift4_mask
);
6915 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6916 (*result_chain
)[0] = data_ref
;
6922 /* Function vect_transform_grouped_load.
6924 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6925 to perform their permutation and ascribe the result vectorized statements to
6926 the scalar statements.
6930 vect_transform_grouped_load (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6932 int size
, gimple_stmt_iterator
*gsi
)
6935 vec
<tree
> result_chain
= vNULL
;
6937 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6938 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6939 vectors, that are ready for vector computation. */
6940 result_chain
.create (size
);
6942 /* If reassociation width for vector type is 2 or greater target machine can
6943 execute 2 or more vector instructions in parallel. Otherwise try to
6944 get chain for loads group using vect_shift_permute_load_chain. */
6945 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
6946 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6948 || !vect_shift_permute_load_chain (vinfo
, dr_chain
, size
, stmt_info
,
6949 gsi
, &result_chain
))
6950 vect_permute_load_chain (vinfo
, dr_chain
,
6951 size
, stmt_info
, gsi
, &result_chain
);
6952 vect_record_grouped_load_vectors (vinfo
, stmt_info
, result_chain
);
6953 result_chain
.release ();
6956 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6957 generated as part of the vectorization of STMT_INFO. Assign the statement
6958 for each vector to the associated scalar statement. */
6961 vect_record_grouped_load_vectors (vec_info
*, stmt_vec_info stmt_info
,
6962 vec
<tree
> result_chain
)
6964 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6965 unsigned int i
, gap_count
;
6968 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6969 Since we scan the chain starting from it's first node, their order
6970 corresponds the order of data-refs in RESULT_CHAIN. */
6971 stmt_vec_info next_stmt_info
= first_stmt_info
;
6973 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
6975 if (!next_stmt_info
)
6978 /* Skip the gaps. Loads created for the gaps will be removed by dead
6979 code elimination pass later. No need to check for the first stmt in
6980 the group, since it always exists.
6981 DR_GROUP_GAP is the number of steps in elements from the previous
6982 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6983 correspond to the gaps. */
6984 if (next_stmt_info
!= first_stmt_info
6985 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
6991 /* ??? The following needs cleanup after the removal of
6992 DR_GROUP_SAME_DR_STMT. */
6995 gimple
*new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
6996 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6997 copies, and we put the new vector statement last. */
6998 STMT_VINFO_VEC_STMTS (next_stmt_info
).safe_push (new_stmt
);
7000 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
7006 /* Function vect_force_dr_alignment_p.
7008 Returns whether the alignment of a DECL can be forced to be aligned
7009 on ALIGNMENT bit boundary. */
7012 vect_can_force_dr_alignment_p (const_tree decl
, poly_uint64 alignment
)
7017 if (decl_in_symtab_p (decl
)
7018 && !symtab_node::get (decl
)->can_increase_alignment_p ())
7021 if (TREE_STATIC (decl
))
7022 return (known_le (alignment
,
7023 (unsigned HOST_WIDE_INT
) MAX_OFILE_ALIGNMENT
));
7025 return (known_le (alignment
, (unsigned HOST_WIDE_INT
) MAX_STACK_ALIGNMENT
));
7028 /* Return whether the data reference DR_INFO is supported with respect to its
7030 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7031 it is aligned, i.e., check if it is possible to vectorize it with different
7034 enum dr_alignment_support
7035 vect_supportable_dr_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
7036 tree vectype
, int misalignment
)
7038 data_reference
*dr
= dr_info
->dr
;
7039 stmt_vec_info stmt_info
= dr_info
->stmt
;
7040 machine_mode mode
= TYPE_MODE (vectype
);
7041 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7042 class loop
*vect_loop
= NULL
;
7043 bool nested_in_vect_loop
= false;
7045 if (misalignment
== 0)
7048 /* For now assume all conditional loads/stores support unaligned
7049 access without any special code. */
7050 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
7051 if (gimple_call_internal_p (stmt
)
7052 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
7053 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
7054 return dr_unaligned_supported
;
7058 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7059 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
7062 /* Possibly unaligned access. */
7064 /* We can choose between using the implicit realignment scheme (generating
7065 a misaligned_move stmt) and the explicit realignment scheme (generating
7066 aligned loads with a REALIGN_LOAD). There are two variants to the
7067 explicit realignment scheme: optimized, and unoptimized.
7068 We can optimize the realignment only if the step between consecutive
7069 vector loads is equal to the vector size. Since the vector memory
7070 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7071 is guaranteed that the misalignment amount remains the same throughout the
7072 execution of the vectorized loop. Therefore, we can create the
7073 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7074 at the loop preheader.
7076 However, in the case of outer-loop vectorization, when vectorizing a
7077 memory access in the inner-loop nested within the LOOP that is now being
7078 vectorized, while it is guaranteed that the misalignment of the
7079 vectorized memory access will remain the same in different outer-loop
7080 iterations, it is *not* guaranteed that is will remain the same throughout
7081 the execution of the inner-loop. This is because the inner-loop advances
7082 with the original scalar step (and not in steps of VS). If the inner-loop
7083 step happens to be a multiple of VS, then the misalignment remains fixed
7084 and we can use the optimized realignment scheme. For example:
7090 When vectorizing the i-loop in the above example, the step between
7091 consecutive vector loads is 1, and so the misalignment does not remain
7092 fixed across the execution of the inner-loop, and the realignment cannot
7093 be optimized (as illustrated in the following pseudo vectorized loop):
7095 for (i=0; i<N; i+=4)
7096 for (j=0; j<M; j++){
7097 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7098 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7099 // (assuming that we start from an aligned address).
7102 We therefore have to use the unoptimized realignment scheme:
7104 for (i=0; i<N; i+=4)
7105 for (j=k; j<M; j+=4)
7106 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7107 // that the misalignment of the initial address is
7110 The loop can then be vectorized as follows:
7112 for (k=0; k<4; k++){
7113 rt = get_realignment_token (&vp[k]);
7114 for (i=0; i<N; i+=4){
7116 for (j=k; j<M; j+=4){
7118 va = REALIGN_LOAD <v1,v2,rt>;
7125 if (DR_IS_READ (dr
))
7127 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
7128 && (!targetm
.vectorize
.builtin_mask_for_load
7129 || targetm
.vectorize
.builtin_mask_for_load ()))
7131 /* If we are doing SLP then the accesses need not have the
7132 same alignment, instead it depends on the SLP group size. */
7134 && STMT_SLP_TYPE (stmt_info
)
7135 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
7136 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7138 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
7139 TYPE_VECTOR_SUBPARTS (vectype
))))
7141 else if (!loop_vinfo
7142 || (nested_in_vect_loop
7143 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
7144 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
7145 return dr_explicit_realign
;
7147 return dr_explicit_realign_optimized
;
7151 bool is_packed
= false;
7152 tree type
= TREE_TYPE (DR_REF (dr
));
7153 if (misalignment
== DR_MISALIGNMENT_UNKNOWN
)
7154 is_packed
= not_size_aligned (DR_REF (dr
));
7155 if (targetm
.vectorize
.support_vector_misalignment (mode
, type
, misalignment
,
7157 return dr_unaligned_supported
;
7160 return dr_unaligned_unsupported
;