/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
#include "gimple-fold.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s[%wu]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
/* Helper function to identify a simd clone call.  If this is a call to a
   function with simd clones then return the corresponding cgraph_node,
   otherwise return NULL.  */

static cgraph_node*
simd_clone_call_p (gimple *stmt)
{
  gcall *call = dyn_cast <gcall *> (stmt);
  if (!call)
    return NULL;

  tree fndecl = NULL_TREE;
  if (gimple_call_internal_p (call, IFN_MASK_CALL))
    fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
  else
    fndecl = gimple_call_fndecl (stmt);

  if (fndecl == NULL_TREE)
    return NULL;

  cgraph_node *node = cgraph_node::get (fndecl);
  if (node && node->simd_clones != NULL)
    return node;

  return NULL;
}
/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */
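/* Illustrative sketch (not part of the original sources): for the gimple
   promotion statement

     short _1;  int _2;
     _2 = (int) _1;

   the lhs type is 'int' but the rhs type 'short' is smaller, so the
   function below returns 'short', and it is the 'short' elements that
   determine the VF.  */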
tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
{
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign)
    {
      scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
      if (gimple_assign_cast_p (assign)
          || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
        {
          tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

          rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
          if (rhs < lhs)
            scalar_type = rhs_type;
        }
    }
  else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
    {
      auto clone = node->simd_clones->simdclone;
      for (unsigned int i = 0; i < clone->nargs; ++i)
        if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
          {
            tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
            rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
            if (rhs < lhs)
              {
                scalar_type = arg_scalar_type;
                lhs = rhs;
              }
          }
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn))
            /* For loads the LHS type does the trick.  */
            i = ~0U;
          else if (internal_store_fn_p (ifn))
            {
              /* For stores use the type of the stored value.  */
              i = internal_fn_stored_value_index (ifn);
              scalar_type = TREE_TYPE (gimple_call_arg (call, i));
              i = ~0U;
            }
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  return scalar_type;
}
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static opt_result
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (vect_location,
                                   "will not create alias checks, as"
                                   " --param vect-max-version-for-alias-checks"
                                   " == 0\n");

  opt_result res
    = runtime_alias_check_p (ddr, loop,
                             optimize_loop_nest_for_speed_p (loop));
  if (!res)
    return res;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return opt_result::success ();
}
/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length(); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "need run-time check that %T is nonzero\n",
                     value);
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}
/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads are
     emitted at the position of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.  */
  stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (il_a)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          il_a = get_later_stmt (il_a, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_a, s) == il_a)
            il_a = s;
    }
  else
    il_a = stmtinfo_a;

  stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (il_b)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          il_b = get_later_stmt (il_b, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_b, s) == il_b)
            il_b = s;
    }
  else
    il_b = stmtinfo_b;

  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
}
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */
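/* Illustrative sketch (not from the original sources): in

     for (int i = 2; i < n; i++)
       p[i] = q[i - 2] + 1;

   the distance vector says "2" *if* p and q refer to the same object.
   Rather than unconditionally limiting the VF to 2, the code below prefers
   to emit a runtime alias check so that independent bases keep the full
   VF.  */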
336 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
337 loop_vec_info loop_vinfo
,
338 int loop_depth
, unsigned int *max_vf
)
340 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
341 for (lambda_vector
&dist_v
: DDR_DIST_VECTS (ddr
))
343 int dist
= dist_v
[loop_depth
];
344 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
346 /* If the user asserted safelen >= DIST consecutive iterations
347 can be executed concurrently, assume independence.
349 ??? An alternative would be to add the alias check even
350 in this case, and vectorize the fallback loop with the
351 maximum VF set to safelen. However, if the user has
352 explicitly given a length, it's less likely that that
354 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
356 if ((unsigned int) loop
->safelen
< *max_vf
)
357 *max_vf
= loop
->safelen
;
358 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
362 /* For dependence distances of 2 or more, we have the option
363 of limiting VF or checking for an alias at runtime.
364 Prefer to check at runtime if we can, to avoid limiting
365 the VF unnecessarily when the bases are in fact independent.
367 Note that the alias checks will be removed if the VF ends up
368 being small enough. */
369 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
370 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
371 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a
->stmt
)
372 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b
->stmt
)
373 && vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
));
/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */
390 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
391 loop_vec_info loop_vinfo
,
392 unsigned int *max_vf
)
395 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
396 struct data_reference
*dra
= DDR_A (ddr
);
397 struct data_reference
*drb
= DDR_B (ddr
);
398 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (dra
);
399 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (drb
);
400 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
401 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
402 lambda_vector dist_v
;
403 unsigned int loop_depth
;
405 /* If user asserted safelen consecutive iterations can be
406 executed concurrently, assume independence. */
407 auto apply_safelen
= [&]()
409 if (loop
->safelen
>= 2)
411 if ((unsigned int) loop
->safelen
< *max_vf
)
412 *max_vf
= loop
->safelen
;
413 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
419 /* In loop analysis all data references should be vectorizable. */
420 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
421 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
424 /* Independent data accesses. */
425 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
426 return opt_result::success ();
429 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
430 return opt_result::success ();
432 /* We do not have to consider dependences between accesses that belong
433 to the same group, unless the stride could be smaller than the
435 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
436 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
437 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
438 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
439 return opt_result::success ();
441 /* Even if we have an anti-dependence then, as the vectorized loop covers at
442 least two scalar iterations, there is always also a true dependence.
443 As the vectorizer does not re-order loads and stores we can ignore
444 the anti-dependence if TBAA can disambiguate both DRs similar to the
445 case with known negative distance anti-dependences (positive
446 distance anti-dependences would violate TBAA constraints). */
447 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
448 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
449 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
450 get_alias_set (DR_REF (drb
))))
451 return opt_result::success ();
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
456 if (apply_safelen ())
457 return opt_result::success ();
459 return opt_result::failure_at
461 "possible alias involving gather/scatter between %T and %T\n",
462 DR_REF (dra
), DR_REF (drb
));
465 /* Unknown data dependence. */
466 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
468 if (apply_safelen ())
469 return opt_result::success ();
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
473 "versioning for alias required: "
474 "can't determine dependence between %T and %T\n",
475 DR_REF (dra
), DR_REF (drb
));
477 /* Add to list of ddrs that need to be tested at run-time. */
478 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
481 /* Known data dependence. */
482 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
484 if (apply_safelen ())
485 return opt_result::success ();
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
489 "versioning for alias required: "
490 "bad dist vector for %T and %T\n",
491 DR_REF (dra
), DR_REF (drb
));
492 /* Add to list of ddrs that need to be tested at run-time. */
493 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
496 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
498 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
499 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
501 return opt_result::success ();
503 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
505 int dist
= dist_v
[loop_depth
];
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE
, vect_location
,
509 "dependence distance = %d.\n", dist
);
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE
, vect_location
,
515 "dependence distance == 0 between %T and %T\n",
516 DR_REF (dra
), DR_REF (drb
));
518 /* When we perform grouped accesses and perform implicit CSE
519 by detecting equal accesses and doing disambiguation with
520 runtime alias tests like for
528 where we will end up loading { a[i], a[i+1] } once, make
529 sure that inserting group loads before the first load and
530 stores after the last store will do the right thing.
531 Similar for groups like
535 where loads from the group interleave with the store. */
536 if (!vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
537 return opt_result::failure_at (stmtinfo_a
->stmt
,
538 "READ_WRITE dependence"
539 " in interleaving.\n");
541 if (loop
->safelen
< 2)
543 tree indicator
= dr_zero_step_indicator (dra
);
544 if (!indicator
|| integer_zerop (indicator
))
545 return opt_result::failure_at (stmtinfo_a
->stmt
,
546 "access also has a zero step\n");
547 else if (TREE_CODE (indicator
) != INTEGER_CST
)
548 vect_check_nonzero_value (loop_vinfo
, indicator
);
553 if (dist
> 0 && DDR_REVERSED_P (ddr
))
555 /* If DDR_REVERSED_P the order of the data-refs in DDR was
556 reversed (to make distance vector positive), and the actual
557 distance is negative. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE
, vect_location
,
560 "dependence distance negative.\n");
561 /* When doing outer loop vectorization, we need to check if there is
562 a backward dependence at the inner loop level if the dependence
563 at the outer loop is reversed. See PR81740. */
564 if (nested_in_vect_loop_p (loop
, stmtinfo_a
)
565 || nested_in_vect_loop_p (loop
, stmtinfo_b
))
567 unsigned inner_depth
= index_in_loop_nest (loop
->inner
->num
,
568 DDR_LOOP_NEST (ddr
));
569 if (dist_v
[inner_depth
] < 0)
570 return opt_result::failure_at (stmtinfo_a
->stmt
,
571 "not vectorized, dependence "
572 "between data-refs %T and %T\n",
573 DR_REF (dra
), DR_REF (drb
));
575 /* Record a negative dependence distance to later limit the
576 amount of stmt copying / unrolling we can perform.
577 Only need to handle read-after-write dependence. */
579 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
580 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
581 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
585 unsigned int abs_dist
= abs (dist
);
586 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
588 /* The dependence distance requires reduction of the maximal
589 vectorization factor. */
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE
, vect_location
,
593 "adjusting maximal vectorization factor to %i\n",
597 if (abs_dist
>= *max_vf
)
599 /* Dependence distance does not create dependence, as far as
600 vectorization is concerned, in this case. */
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE
, vect_location
,
603 "dependence distance >= VF.\n");
607 return opt_result::failure_at (stmtinfo_a
->stmt
,
608 "not vectorized, possible dependence "
609 "between data-refs %T and %T\n",
610 DR_REF (dra
), DR_REF (drb
));
613 return opt_result::success ();
/* Function vect_analyze_early_break_dependences.

   Examine all the data references in the loop and make sure that if we have
   multiple exits that we are able to safely move stores such that they become
   safe for vectorization.  The function also calculates the place where to move
   the instructions to and computes what the new vUSE chain should be.

   This works in tandem with the CFG that will be produced by
   slpeel_tree_duplicate_loop_to_edge_cfg later on.

   This function tries to validate whether an early break vectorization
   is possible for the current instruction sequence.  Returns True if
   possible, otherwise False.

   Requirements:
     - Any memory access must be to a fixed size buffer.
     - There must not be any loads and stores to the same object.
     - Multiple loads are allowed as long as they don't alias.

   NOTE:
     This implementation is very conservative.  Any overlapping loads/stores
     that take place before the early break statement get rejected, aside from
     WAR dependencies; a load followed by a store to the same location before
     the break is fine, which is the common case.  */
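/* Illustrative sketch (not from the original sources): in

     for (int i = 0; i < 1024; i++)
       {
         if (a[i] == key)      /* early exit */
           break;
         b[i] = a[i] * 2;      /* store that must be sunk past the exit check */
       }

   the store to b[] is moved to the block reached only when no element of the
   current vector iteration triggers the break; the checks below verify that
   this motion cannot change which memory is read or written.  */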
657 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo
)
659 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
661 /* List of all load data references found during traversal. */
662 auto_vec
<data_reference
*> bases
;
663 basic_block dest_bb
= NULL
;
665 hash_set
<gimple
*> visited
;
666 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
667 class loop
*loop_nest
= loop_outer (loop
);
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE
, vect_location
,
671 "loop contains multiple exits, analyzing"
672 " statement dependencies.\n");
674 for (gimple
*c
: LOOP_VINFO_LOOP_CONDS (loop_vinfo
))
676 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (c
);
677 if (STMT_VINFO_TYPE (loop_cond_info
) != loop_exit_ctrl_vec_info_type
)
680 gimple_stmt_iterator gsi
= gsi_for_stmt (c
);
682 /* Now analyze all the remaining statements and try to determine which
683 instructions are allowed/needed to be moved. */
684 while (!gsi_end_p (gsi
))
686 gimple
*stmt
= gsi_stmt (gsi
);
688 if (!gimple_has_ops (stmt
)
689 || is_gimple_debug (stmt
))
692 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (stmt
);
693 auto dr_ref
= STMT_VINFO_DATA_REF (stmt_vinfo
);
697 /* We currently only support statically allocated objects due to
698 not having first-faulting loads support or peeling for
699 alignment support. Compute the size of the referenced object
700 (it could be dynamically allocated). */
701 tree obj
= DR_BASE_ADDRESS (dr_ref
);
702 if (!obj
|| TREE_CODE (obj
) != ADDR_EXPR
)
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
706 "early breaks only supported on statically"
707 " allocated objects.\n");
708 return opt_result::failure_at (c
,
709 "can't safely apply code motion to "
710 "dependencies of %G to vectorize "
711 "the early exit.\n", c
);
714 tree refop
= TREE_OPERAND (obj
, 0);
715 tree refbase
= get_base_address (refop
);
716 if (!refbase
|| !DECL_P (refbase
) || !DECL_SIZE (refbase
)
717 || TREE_CODE (DECL_SIZE (refbase
)) != INTEGER_CST
)
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
721 "early breaks only supported on"
722 " statically allocated objects.\n");
723 return opt_result::failure_at (c
,
724 "can't safely apply code motion to "
725 "dependencies of %G to vectorize "
726 "the early exit.\n", c
);
          /* Check if vector accesses to the object will be within bounds.
             The object size must be a constant, or we assume the loop will
             be versioned or the niters bounded by VF so accesses are within
             range.  */
732 if (!ref_within_array_bound (stmt
, DR_REF (dr_ref
)))
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
736 "early breaks not supported: vectorization "
737 "would %s beyond size of obj.",
738 DR_IS_READ (dr_ref
) ? "read" : "write");
739 return opt_result::failure_at (c
,
740 "can't safely apply code motion to "
741 "dependencies of %G to vectorize "
742 "the early exit.\n", c
);
745 if (DR_IS_READ (dr_ref
))
746 bases
.safe_push (dr_ref
);
747 else if (DR_IS_WRITE (dr_ref
))
749 /* We are moving writes down in the CFG. To be sure that this
750 is valid after vectorization we have to check all the loads
751 we are sinking the stores past to see if any of them may
752 alias or are the same object.
754 Same objects will not be an issue because unless the store
755 is marked volatile the value can be forwarded. If the
756 store is marked volatile we don't vectorize the loop
759 That leaves the check for aliasing. We don't really need
760 to care about the stores aliasing with each other since the
761 stores are moved in order so the effects are still observed
762 correctly. This leaves the check for WAR dependencies
763 which we would be introducing here if the DR can alias.
764 The check is quadratic in loads/stores but I have not found
765 a better API to do this. I believe all loads and stores
766 must be checked. We also must check them when we
767 encountered the store, since we don't care about loads past
770 for (auto dr_read
: bases
)
771 if (dr_may_alias_p (dr_ref
, dr_read
, loop_nest
))
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
776 "early breaks not supported: "
777 "overlapping loads and stores "
778 "found before the break "
781 return opt_result::failure_at (stmt
,
782 "can't safely apply code motion to dependencies"
783 " to vectorize the early exit. %G may alias with"
784 " %G\n", stmt
, dr_read
->stmt
);
788 if (gimple_vdef (stmt
))
790 if (dump_enabled_p ())
791 dump_printf_loc (MSG_NOTE
, vect_location
,
792 "==> recording stmt %G", stmt
);
794 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo
).safe_push (stmt
);
796 else if (gimple_vuse (stmt
))
798 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).safe_insert (0, stmt
);
799 if (dump_enabled_p ())
800 dump_printf_loc (MSG_NOTE
, vect_location
,
801 "marked statement for vUSE update: %G", stmt
);
805 /* Save destination as we go, BB are visited in order and the last one
806 is where statements should be moved to. */
808 dest_bb
= gimple_bb (c
);
811 basic_block curr_bb
= gimple_bb (c
);
812 if (dominated_by_p (CDI_DOMINATORS
, curr_bb
, dest_bb
))
817 basic_block dest_bb0
= EDGE_SUCC (dest_bb
, 0)->dest
;
818 basic_block dest_bb1
= EDGE_SUCC (dest_bb
, 1)->dest
;
819 dest_bb
= flow_bb_inside_loop_p (loop
, dest_bb0
) ? dest_bb0
: dest_bb1
;
820 /* We don't allow outer -> inner loop transitions which should have been
821 trapped already during loop form analysis. */
822 gcc_assert (dest_bb
->loop_father
== loop
);
824 gcc_assert (dest_bb
);
825 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo
) = dest_bb
;
827 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).is_empty ())
829 /* All uses shall be updated to that of the first load. Entries are
830 stored in reverse order. */
831 tree vuse
= gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).last ());
832 for (auto g
: LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
))
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_NOTE
, vect_location
,
836 "will update use: %T, mem_ref: %G", vuse
, g
);
840 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE
, vect_location
,
842 "recorded statements to be moved to BB %d\n",
843 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo
)->index
);
845 return opt_result::success ();
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */
855 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
856 unsigned int *max_vf
)
859 struct data_dependence_relation
*ddr
;
861 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
863 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
865 LOOP_VINFO_DDRS (loop_vinfo
)
866 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
867 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
868 /* We do not need read-read dependences. */
869 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
870 &LOOP_VINFO_DDRS (loop_vinfo
),
871 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
876 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
878 /* For epilogues we either have no aliases or alias versioning
879 was applied to original loop. Therefore we may just get max_vf
880 using VF of original loop. */
881 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
882 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
884 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
887 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
  /* If we have early break statements in the loop, check to see if they
     are of a form we can vectorize.  */
894 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
895 return vect_analyze_early_break_dependences (loop_vinfo
);
897 return opt_result::success ();
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
   according to the data dependence.  */
909 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
910 struct data_dependence_relation
*ddr
)
912 struct data_reference
*dra
= DDR_A (ddr
);
913 struct data_reference
*drb
= DDR_B (ddr
);
914 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
915 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
917 /* We need to check dependences of statements marked as unvectorizable
918 as well, they still can prohibit vectorization. */
920 /* Independent data accesses. */
921 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
927 /* Read-read is OK. */
928 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
931 /* If dra and drb are part of the same interleaving chain consider
933 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
934 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
935 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
938 /* Unknown data dependence. */
939 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
941 if (dump_enabled_p ())
942 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
943 "can't determine dependence between %T and %T\n",
944 DR_REF (dra
), DR_REF (drb
));
946 else if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE
, vect_location
,
948 "determined dependence between %T and %T\n",
949 DR_REF (dra
), DR_REF (drb
));
/* Analyze dependences involved in the transform of a store SLP NODE.  */
958 vect_slp_analyze_store_dependences (vec_info
*vinfo
, slp_tree node
)
960 /* This walks over all stmts involved in the SLP store done
961 in NODE verifying we can sink them up to the last stmt in the
963 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
964 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info
)));
966 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
968 stmt_vec_info access_info
969 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
970 if (access_info
== last_access_info
)
972 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
974 bool ref_initialized_p
= false;
975 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
976 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
978 gimple
*stmt
= gsi_stmt (gsi
);
979 if (! gimple_vuse (stmt
))
982 /* If we couldn't record a (single) data reference for this
983 stmt we have to resort to the alias oracle. */
984 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
985 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
988 /* We are moving a store - this means
989 we cannot use TBAA for disambiguation. */
990 if (!ref_initialized_p
)
991 ao_ref_init (&ref
, DR_REF (dr_a
));
992 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
993 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
998 gcc_assert (!gimple_visited_p (stmt
));
1000 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
1002 bool dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1003 free_dependence_relation (ddr
);
/* Analyze dependences involved in the transform of a load SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */
1016 vect_slp_analyze_load_dependences (vec_info
*vinfo
, slp_tree node
,
1017 vec
<stmt_vec_info
> stores
,
1018 stmt_vec_info last_store_info
)
1020 /* This walks over all stmts involved in the SLP load done
1021 in NODE verifying we can hoist them up to the first stmt in the
1023 stmt_vec_info first_access_info
= vect_find_first_scalar_stmt_in_slp (node
);
1024 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info
)));
1026 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
1028 stmt_vec_info access_info
1029 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
1030 if (access_info
== first_access_info
)
1032 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
1034 bool ref_initialized_p
= false;
1035 hash_set
<stmt_vec_info
> grp_visited
;
1036 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
1037 gsi_stmt (gsi
) != first_access_info
->stmt
; gsi_prev (&gsi
))
1039 gimple
*stmt
= gsi_stmt (gsi
);
1040 if (! gimple_vdef (stmt
))
1043 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
1045 /* If we run into a store of this same instance (we've just
1046 marked those) then delay dependence checking until we run
1047 into the last store because this is where it will have
1048 been sunk to (and we verified that we can do that already). */
1049 if (gimple_visited_p (stmt
))
1051 if (stmt_info
!= last_store_info
)
1054 for (stmt_vec_info
&store_info
: stores
)
1056 data_reference
*store_dr
= STMT_VINFO_DATA_REF (store_info
);
1057 ddr_p ddr
= initialize_data_dependence_relation
1058 (dr_a
, store_dr
, vNULL
);
1060 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1061 free_dependence_relation (ddr
);
1068 auto check_hoist
= [&] (stmt_vec_info stmt_info
) -> bool
1070 /* We are hoisting a load - this means we can use TBAA for
1072 if (!ref_initialized_p
)
1073 ao_ref_init (&ref
, DR_REF (dr_a
));
1074 if (stmt_may_clobber_ref_p_1 (stmt_info
->stmt
, &ref
, true))
1076 /* If we couldn't record a (single) data reference for this
1077 stmt we have to give up now. */
1078 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
1081 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
1084 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1085 free_dependence_relation (ddr
);
1089 /* No dependence. */
1092 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1094 /* When we run into a store group we have to honor
1095 that earlier stores might be moved here. We don't
1096 know exactly which and where to since we lack a
1097 back-mapping from DR to SLP node, so assume all
1098 earlier stores are sunk here. It's enough to
1099 consider the last stmt of a group for this.
1100 ??? Both this and the fact that we disregard that
1101 the conflicting instance might be removed later
1102 is overly conservative. */
1103 if (!grp_visited
.add (DR_GROUP_FIRST_ELEMENT (stmt_info
)))
1104 for (auto store_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1106 store_info
= DR_GROUP_NEXT_ELEMENT (store_info
))
1107 if ((store_info
== stmt_info
1108 || get_later_stmt (store_info
, stmt_info
) == stmt_info
)
1109 && !check_hoist (store_info
))
1114 if (!check_hoist (stmt_info
))
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them.  */
1130 vect_slp_analyze_instance_dependence (vec_info
*vinfo
, slp_instance instance
)
1132 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1134 /* The stores of this instance are at the root of the SLP tree. */
1135 slp_tree store
= NULL
;
1136 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
)
1137 store
= SLP_INSTANCE_TREE (instance
);
1139 /* Verify we can sink stores to the vectorized stmt insert location. */
1140 stmt_vec_info last_store_info
= NULL
;
1143 if (! vect_slp_analyze_store_dependences (vinfo
, store
))
1146 /* Mark stores in this instance and remember the last one. */
1147 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
1148 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
1149 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
1154 /* Verify we can sink loads to the vectorized stmt insert location,
1155 special-casing stores of this instance. */
1156 for (slp_tree
&load
: SLP_INSTANCE_LOADS (instance
))
1157 if (! vect_slp_analyze_load_dependences (vinfo
, load
,
1159 ? SLP_TREE_SCALAR_STMTS (store
)
1160 : vNULL
, last_store_info
))
1166 /* Unset the visited flag. */
1168 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
1169 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
/* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
   applied.  */
1178 dr_misalignment (dr_vec_info
*dr_info
, tree vectype
, poly_int64 offset
)
1180 HOST_WIDE_INT diff
= 0;
1181 /* Alignment is only analyzed for the first element of a DR group,
1182 use that but adjust misalignment by the offset of the access. */
1183 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
))
1185 dr_vec_info
*first_dr
1186 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
));
1187 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1188 INTEGER_CSTs and the first element in the group has the lowest
1190 diff
= (TREE_INT_CST_LOW (DR_INIT (dr_info
->dr
))
1191 - TREE_INT_CST_LOW (DR_INIT (first_dr
->dr
)));
1192 gcc_assert (diff
>= 0);
1196 int misalign
= dr_info
->misalignment
;
1197 gcc_assert (misalign
!= DR_MISALIGNMENT_UNINITIALIZED
);
1198 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
1201 /* If the access is only aligned for a vector type with smaller alignment
1202 requirement the access has unknown misalignment. */
1203 if (maybe_lt (dr_info
->target_alignment
* BITS_PER_UNIT
,
1204 targetm
.vectorize
.preferred_vector_alignment (vectype
)))
1205 return DR_MISALIGNMENT_UNKNOWN
;
1207 /* Apply the offset from the DR group start and the externally supplied
1208 offset which can for example result from a negative stride access. */
1209 poly_int64 misalignment
= misalign
+ diff
+ offset
;
1211 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1212 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1213 unsigned HOST_WIDE_INT target_alignment_c
1214 = dr_info
->target_alignment
.to_constant ();
1215 if (!known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1216 return DR_MISALIGNMENT_UNKNOWN
;
/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */
1224 vect_record_base_alignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
1225 innermost_loop_behavior
*drb
)
1228 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> &entry
1229 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
1230 if (!existed
|| entry
.second
->base_alignment
< drb
->base_alignment
)
1232 entry
= std::make_pair (stmt_info
, drb
);
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_NOTE
, vect_location
,
1235 "recording new base alignment for %T\n"
1237 " misalignment: %d\n"
1240 drb
->base_alignment
,
1241 drb
->base_misalignment
,
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */
1255 vect_record_base_alignments (vec_info
*vinfo
)
1257 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1258 class loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
1259 for (data_reference
*dr
: vinfo
->shared
->datarefs
)
1261 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
1262 stmt_vec_info stmt_info
= dr_info
->stmt
;
1263 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
1264 && STMT_VINFO_VECTORIZABLE (stmt_info
)
1265 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1267 vect_record_base_alignment (vinfo
, stmt_info
, &DR_INNERMOST (dr
));
1269 /* If DR is nested in the loop that is being vectorized, we can also
1270 record the alignment of the base wrt the outer loop. */
1271 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
1272 vect_record_base_alignment
1273 (vinfo
, stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO when vectorizing
   with VECTYPE.

   Output:
   1. initialized misalignment info for DR_INFO

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */
1290 vect_compute_data_ref_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1293 stmt_vec_info stmt_info
= dr_info
->stmt
;
1294 vec_base_alignments
*base_alignments
= &vinfo
->base_alignments
;
1295 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1296 class loop
*loop
= NULL
;
1297 tree ref
= DR_REF (dr_info
->dr
);
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_NOTE
, vect_location
,
1301 "vect_compute_data_ref_alignment:\n");
1304 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1306 /* Initialize misalignment to unknown. */
1307 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1309 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1312 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
1313 bool step_preserves_misalignment_p
;
1315 poly_uint64 vector_alignment
1316 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
1318 SET_DR_TARGET_ALIGNMENT (dr_info
, vector_alignment
);
1320 /* If the main loop has peeled for alignment we have no way of knowing
1321 whether the data accesses in the epilogues are aligned. We can't at
1322 compile time answer the question whether we have entered the main loop or
1323 not. Fixes PR 92351. */
1326 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1328 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
) != 0)
1332 unsigned HOST_WIDE_INT vect_align_c
;
1333 if (!vector_alignment
.is_constant (&vect_align_c
))
1336 /* No step for BB vectorization. */
1339 gcc_assert (integer_zerop (drb
->step
));
1340 step_preserves_misalignment_p
= true;
1343 /* In case the dataref is in an inner-loop of the loop that is being
1344 vectorized (LOOP), we use the base and misalignment information
1345 relative to the outer-loop (LOOP). This is ok only if the misalignment
1346 stays the same throughout the execution of the inner-loop, which is why
1347 we have to check that the stride of the dataref in the inner-loop evenly
1348 divides by the vector alignment. */
1349 else if (nested_in_vect_loop_p (loop
, stmt_info
))
1351 step_preserves_misalignment_p
1352 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
1354 if (dump_enabled_p ())
1356 if (step_preserves_misalignment_p
)
1357 dump_printf_loc (MSG_NOTE
, vect_location
,
1358 "inner step divides the vector alignment.\n");
1360 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1361 "inner step doesn't divide the vector"
1366 /* Similarly we can only use base and misalignment information relative to
1367 an innermost loop if the misalignment stays the same throughout the
1368 execution of the loop. As above, this is the case if the stride of
1369 the dataref evenly divides by the alignment. */
1372 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1373 step_preserves_misalignment_p
1374 = multiple_p (DR_STEP_ALIGNMENT (dr_info
->dr
) * vf
, vect_align_c
);
1376 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
1377 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1378 "step doesn't divide the vector alignment.\n");
1381 unsigned int base_alignment
= drb
->base_alignment
;
1382 unsigned int base_misalignment
= drb
->base_misalignment
;
1384 /* Calculate the maximum of the pooled base address alignment and the
1385 alignment that we can compute for DR itself. */
1386 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> *entry
1387 = base_alignments
->get (drb
->base_address
);
1389 && base_alignment
< (*entry
).second
->base_alignment
1391 || (dominated_by_p (CDI_DOMINATORS
, gimple_bb (stmt_info
->stmt
),
1392 gimple_bb (entry
->first
->stmt
))
1393 && (gimple_bb (stmt_info
->stmt
) != gimple_bb (entry
->first
->stmt
)
1394 || (entry
->first
->dr_aux
.group
<= dr_info
->group
)))))
1396 base_alignment
= entry
->second
->base_alignment
;
1397 base_misalignment
= entry
->second
->base_misalignment
;
1400 if (drb
->offset_alignment
< vect_align_c
1401 || !step_preserves_misalignment_p
1402 /* We need to know whether the step wrt the vectorized loop is
1403 negative when computing the starting misalignment below. */
1404 || TREE_CODE (drb
->step
) != INTEGER_CST
)
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1408 "Unknown alignment for access: %T\n", ref
);
1412 if (base_alignment
< vect_align_c
)
1414 unsigned int max_alignment
;
1415 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
1416 if (max_alignment
< vect_align_c
1417 || !vect_can_force_dr_alignment_p (base
,
1418 vect_align_c
* BITS_PER_UNIT
))
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE
, vect_location
,
1422 "can't force alignment of ref: %T\n", ref
);
1426 /* Force the alignment of the decl.
1427 NOTE: This is the only change to the code we make during
1428 the analysis phase, before deciding to vectorize the loop. */
1429 if (dump_enabled_p ())
1430 dump_printf_loc (MSG_NOTE
, vect_location
,
1431 "force alignment of %T\n", ref
);
1433 dr_info
->base_decl
= base
;
1434 dr_info
->base_misaligned
= true;
1435 base_misalignment
= 0;
1437 poly_int64 misalignment
1438 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
1440 unsigned int const_misalignment
;
1441 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
1443 if (dump_enabled_p ())
1444 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1445 "Non-constant misalignment for access: %T\n", ref
);
1449 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
1451 if (dump_enabled_p ())
1452 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1453 "misalign = %d bytes of ref %T\n",
1454 const_misalignment
, ref
);
/* Return whether DR_INFO, which is related to DR_PEEL_INFO in
   that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
   is made aligned via peeling.  */
1464 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info
*dr_info
,
1465 dr_vec_info
*dr_peel_info
)
1467 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info
),
1468 DR_TARGET_ALIGNMENT (dr_info
)))
1470 poly_offset_int diff
1471 = (wi::to_poly_offset (DR_INIT (dr_peel_info
->dr
))
1472 - wi::to_poly_offset (DR_INIT (dr_info
->dr
)));
1473 if (known_eq (diff
, 0)
1474 || multiple_p (diff
, DR_TARGET_ALIGNMENT (dr_info
)))
/* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
   aligned via peeling.  */
1484 vect_dr_aligned_if_peeled_dr_is (dr_vec_info
*dr_info
,
1485 dr_vec_info
*dr_peel_info
)
1487 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info
->dr
),
1488 DR_BASE_ADDRESS (dr_peel_info
->dr
), 0)
1489 || !operand_equal_p (DR_OFFSET (dr_info
->dr
),
1490 DR_OFFSET (dr_peel_info
->dr
), 0)
1491 || !operand_equal_p (DR_STEP (dr_info
->dr
),
1492 DR_STEP (dr_peel_info
->dr
), 0))
1495 return vect_dr_aligned_if_related_peeled_dr_is (dr_info
, dr_peel_info
);
/* Compute the value for dr_info->misalign so that the access appears
   aligned.  This is used by peeling to compensate for dr_misalignment
   applying the offset for negative step.  */
1503 vect_dr_misalign_for_aligned_access (dr_vec_info
*dr_info
)
1505 if (tree_int_cst_sgn (DR_STEP (dr_info
->dr
)) >= 0)
1508 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1509 poly_int64 misalignment
1510 = ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1511 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1513 unsigned HOST_WIDE_INT target_alignment_c
;
1515 if (!dr_info
->target_alignment
.is_constant (&target_alignment_c
)
1516 || !known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1517 return DR_MISALIGNMENT_UNKNOWN
;
/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */
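/* Worked example (illustrative, not from the original sources): with a
   16-byte target alignment, a 4-byte scalar step and a known misalignment
   of 8 bytes, peeling NPEEL = 2 iterations gives

     (8 + 2 * 4) & (16 - 1) == 0

   i.e. the access becomes aligned, which is the update done below for the
   known-alignment case.  */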
1534 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1535 dr_vec_info
*dr_peel_info
, int npeel
)
  /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
1538 if (vect_dr_aligned_if_peeled_dr_is (dr_info
, dr_peel_info
))
1540 SET_DR_MISALIGNMENT (dr_info
,
1541 vect_dr_misalign_for_aligned_access (dr_peel_info
));
1545 unsigned HOST_WIDE_INT alignment
;
1546 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1547 && known_alignment_for_access_p (dr_info
,
1548 STMT_VINFO_VECTYPE (dr_info
->stmt
))
1549 && known_alignment_for_access_p (dr_peel_info
,
1550 STMT_VINFO_VECTYPE (dr_peel_info
->stmt
)))
1552 int misal
= dr_info
->misalignment
;
1553 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1554 misal
&= alignment
- 1;
1555 set_dr_misalignment (dr_info
, misal
);
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1561 "to unknown (-1).\n");
1562 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
/* Return true if alignment is relevant for DR_INFO.  */
1568 vect_relevant_for_alignment_p (dr_vec_info
*dr_info
)
1570 stmt_vec_info stmt_info
= dr_info
->stmt
;
1572 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1575 /* For interleaving, only the alignment of the first access matters. */
1576 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1577 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1580 /* Scatter-gather and invariant accesses continue to address individual
1581 scalars, so vector-level alignment is irrelevant. */
1582 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1583 || integer_zerop (DR_STEP (dr_info
->dr
)))
1586 /* Strided accesses perform only component accesses, alignment is
1587 irrelevant for them. */
1588 if (STMT_VINFO_STRIDED_P (stmt_info
)
1589 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */
1599 not_size_aligned (tree exp
)
1601 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1604 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1605 > get_object_alignment (exp
));
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR_INFO is reachable by peeling
   a few loop iterations.  Return false otherwise.  */
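/* Illustrative sketch (not from the original sources): for a V4SI access
   with 16-byte target alignment and a known misalignment of 8 bytes,
   peeling (16 - 8) / 4 = 2 scalar iterations makes the remaining accesses
   aligned.  With a misalignment of 6 bytes the 4-byte element size does
   not divide the misalignment, so peeling can never reach alignment.  */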
1614 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
1616 stmt_vec_info stmt_info
= dr_info
->stmt
;
1617 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1619 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1621 /* For interleaved access we peel only if number of iterations in
1622 the prolog loop ({VF - misalignment}), is a multiple of the
1623 number of the interleaved accesses. */
1624 int elem_size
, mis_in_elements
;
1626 /* FORNOW: handle only known alignment. */
1627 if (!known_alignment_for_access_p (dr_info
, vectype
))
1630 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1631 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1632 elem_size
= vector_element_size (vector_size
, nelements
);
1633 mis_in_elements
= dr_misalignment (dr_info
, vectype
) / elem_size
;
1635 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1639 /* If misalignment is known at the compile time then allow peeling
1640 only if natural alignment is reachable through peeling. */
1641 if (known_alignment_for_access_p (dr_info
, vectype
)
1642 && !aligned_access_p (dr_info
, vectype
))
1644 HOST_WIDE_INT elmsize
=
1645 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1646 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE
, vect_location
,
1649 "data size = %wd. misalignment = %d.\n", elmsize
,
1650 dr_misalignment (dr_info
, vectype
));
1652 if (dr_misalignment (dr_info
, vectype
) % elmsize
)
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1656 "data size does not divide the misalignment.\n");
1661 if (!known_alignment_for_access_p (dr_info
, vectype
))
1663 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1664 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1667 "Unknown misalignment, %snaturally aligned\n",
1668 is_packed
? "not " : "");
1669 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1676 /* Calculate the cost of the memory access represented by DR_INFO. */
1679 vect_get_data_access_cost (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1680 dr_alignment_support alignment_support_scheme
,
1682 unsigned int *inside_cost
,
1683 unsigned int *outside_cost
,
1684 stmt_vector_for_cost
*body_cost_vec
,
1685 stmt_vector_for_cost
*prologue_cost_vec
)
1687 stmt_vec_info stmt_info
= dr_info
->stmt
;
1688 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1691 if (PURE_SLP_STMT (stmt_info
))
1694 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1696 if (DR_IS_READ (dr_info
->dr
))
1697 vect_get_load_cost (vinfo
, stmt_info
, ncopies
, alignment_support_scheme
,
1698 misalignment
, true, inside_cost
,
1699 outside_cost
, prologue_cost_vec
, body_cost_vec
, false);
1701 vect_get_store_cost (vinfo
,stmt_info
, ncopies
, alignment_support_scheme
,
1702 misalignment
, inside_cost
, body_cost_vec
);
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_NOTE
, vect_location
,
1706 "vect_get_data_access_cost: inside_cost = %d, "
1707 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1711 typedef struct _vect_peel_info
1713 dr_vec_info
*dr_info
;
1718 typedef struct _vect_peel_extended_info
1721 struct _vect_peel_info peel_info
;
1722 unsigned int inside_cost
;
1723 unsigned int outside_cost
;
1724 } *vect_peel_extended_info
;
1727 /* Peeling hashtable helpers. */
1729 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1731 static inline hashval_t
hash (const _vect_peel_info
*);
1732 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1736 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1738 return (hashval_t
) peel_info
->npeel
;
1742 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1744 return (a
->npeel
== b
->npeel
);
1748 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1751 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1752 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1753 int npeel
, bool supportable_if_not_aligned
)
1755 struct _vect_peel_info elem
, *slot
;
1756 _vect_peel_info
**new_slot
;
1759 slot
= peeling_htab
->find (&elem
);
1764 slot
= XNEW (struct _vect_peel_info
);
1765 slot
->npeel
= npeel
;
1766 slot
->dr_info
= dr_info
;
1768 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1772 /* If this DR is not supported with unknown misalignment then bias
1773 this slot when the cost model is disabled. */
1774 if (!supportable_if_not_aligned
1775 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1776 slot
->count
+= VECT_MAX_COST
;
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */
1784 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1785 _vect_peel_extended_info
*max
)
1787 vect_peel_info elem
= *slot
;
1789 if (elem
->count
> max
->peel_info
.count
1790 || (elem
->count
== max
->peel_info
.count
1791 && max
->peel_info
.npeel
> elem
->npeel
))
1793 max
->peel_info
.npeel
= elem
->npeel
;
1794 max
->peel_info
.count
= elem
->count
;
1795 max
->peel_info
.dr_info
= elem
->dr_info
;
/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
   npeel is computed at runtime but DR0_INFO's misalignment will be zero
   after peeling.  */
1807 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1808 dr_vec_info
*dr0_info
,
1809 unsigned int *inside_cost
,
1810 unsigned int *outside_cost
,
1811 stmt_vector_for_cost
*body_cost_vec
,
1812 stmt_vector_for_cost
*prologue_cost_vec
,
1815 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1817 bool dr0_alignment_known_p
1819 && known_alignment_for_access_p (dr0_info
,
1820 STMT_VINFO_VECTYPE (dr0_info
->stmt
)));
1822 for (data_reference
*dr
: datarefs
)
1824 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1825 if (!vect_relevant_for_alignment_p (dr_info
))
1828 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1829 dr_alignment_support alignment_support_scheme
;
1831 unsigned HOST_WIDE_INT alignment
;
1833 bool negative
= tree_int_cst_compare (DR_STEP (dr_info
->dr
),
1834 size_zero_node
) < 0;
1837 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1838 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
      if (npeel == 0)
	misalignment = dr_misalignment (dr_info, vectype, off);
      else if (dr_info == dr0_info
	       || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
	misalignment = 0;
      else if (!dr0_alignment_known_p
	       || !known_alignment_for_access_p (dr_info, vectype)
	       || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
	misalignment = DR_MISALIGNMENT_UNKNOWN;
      else
	{
	  misalignment = dr_misalignment (dr_info, vectype, off);
	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
	  misalignment &= alignment - 1;
	}
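      /* For example, a known byte misalignment of 8 with a 4-byte step and
	 a constant target alignment of 16: peeling npeel == 2 iterations
	 gives (8 + 2 * 4) & 15 == 0, i.e. the ref becomes aligned.  */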
      alignment_support_scheme
	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					 misalignment);

      vect_get_data_access_cost (loop_vinfo, dr_info,
				 alignment_support_scheme, misalignment,
				 inside_cost, outside_cost,
				 body_cost_vec, prologue_cost_vec);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (min->vinfo);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
				  &outside_cost, &body_cost_vec,
				  &prologue_cost_vec, elem->npeel);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr_info = elem->dr_info;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo)
{
   struct _vect_peel_extended_info res;

   res.peel_info.dr_info = NULL;
   res.vinfo = loop_vinfo;

   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       res.inside_cost = INT_MAX;
       res.outside_cost = INT_MAX;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_lowest_cost> (&res);
     }
   else
     {
       res.peel_info.count = 0;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_most_frequent> (&res);
       res.inside_cost = 0;
       res.outside_cost = 0;
     }

   return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
			  unsigned npeel)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;

  bool dr0_alignment_known_p
    = known_alignment_for_access_p (dr0_info,
				    STMT_VINFO_VECTYPE (dr0_info->stmt));

  /* Ensure that all data refs can be vectorized after the peel.  */
  for (data_reference *dr : datarefs)
    {
      if (dr == dr0_info->dr)
	continue;

      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info)
	  || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
	continue;

      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
      int misalignment;
      unsigned HOST_WIDE_INT alignment;
      if (!dr0_alignment_known_p
	  || !known_alignment_for_access_p (dr_info, vectype)
	  || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
	misalignment = DR_MISALIGNMENT_UNKNOWN;
      else
	{
	  misalignment = dr_misalignment (dr_info, vectype);
	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
	  misalignment &= alignment - 1;
	}

      supportable_dr_alignment
	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					 misalignment);
      if (supportable_dr_alignment == dr_unaligned_unsupported)
	return false;
    }

  return true;
}
/* Compare two data-references DRA and DRB to group them into chunks
   with related alignment.  */

static int
dr_align_group_sort_cmp (const void *dra_, const void *drb_)
{
  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
  int cmp;

  /* Stabilize sort.  */
  if (dra == drb)
    return 0;

  /* Ordering of DRs according to base.  */
  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
			       DR_BASE_ADDRESS (drb));
  if (cmp != 0)
    return cmp;

  /* And according to DR_OFFSET.  */
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
  if (cmp != 0)
    return cmp;

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
  if (cmp != 0)
    return cmp;

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- After vect_compute_data_refs_alignment:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
	for (i=0; i<N; i++){	# loop 1A
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i=0; i<N; i++){	# loop 1B
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     for (i = 3; i < N; i++){	# loop 2A
	x = q[i];			# DR_MISALIGNMENT(q) = 0
	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     if (p is aligned) {
	for (i = 3; i<N; i++){	# loop 3A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i = 3; i<N; i++){	# loop 3B
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */
opt_result
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  dr_vec_info *first_store = NULL;
  dr_vec_info *dr0_info = NULL;
  struct data_reference *dr;
  unsigned int i;
  bool do_peeling = false;
  bool do_versioning = false;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  dr_vec_info *unsupportable_dr_info = NULL;
  unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
    return opt_result::success ();

  /* Sort the vector of datarefs so DRs that have the same or dependent
     alignment are next to each other.  */
  auto_vec<data_reference_p> datarefs
    = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
  datarefs.qsort (dr_align_group_sort_cmp);

  /* Compute the number of DRs that become aligned when we peel
     a dataref so it becomes aligned.  */
  auto_vec<unsigned> n_same_align_refs (datarefs.length ());
  n_same_align_refs.quick_grow_cleared (datarefs.length ());
  unsigned int i0;
  for (i0 = 0; i0 < datarefs.length (); ++i0)
    if (DR_BASE_ADDRESS (datarefs[i0]))
      break;
  for (i = i0 + 1; i <= datarefs.length (); ++i)
    {
      if (i == datarefs.length ()
	  || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
			       DR_BASE_ADDRESS (datarefs[i]), 0)
	  || !operand_equal_p (DR_OFFSET (datarefs[i0]),
			       DR_OFFSET (datarefs[i]), 0)
	  || !operand_equal_p (DR_STEP (datarefs[i0]),
			       DR_STEP (datarefs[i]), 0))
	{
	  /* The subgroup [i0, i-1] now only differs in DR_INIT and
	     possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
	     will get known misalignment if we align one of the refs
	     with the largest DR_TARGET_ALIGNMENT.  */
	  for (unsigned j = i0; j < i; ++j)
	    {
	      dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
	      for (unsigned k = i0; k < i; ++k)
		{
		  if (k == j)
		    continue;
		  dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
		  if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
							       dr_infoj))
		    n_same_align_refs[j]++;
		}
	    }
	  i0 = i;
	}
    }

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info))
	continue;

      stmt_vec_info stmt_info = dr_info->stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      do_peeling = vector_alignment_reachable_p (dr_info);
      if (do_peeling)
	{
	  if (known_alignment_for_access_p (dr_info, vectype))
	    {
	      unsigned int npeel_tmp = 0;
	      bool negative = tree_int_cst_compare (DR_STEP (dr),
						    size_zero_node) < 0;

	      /* If known_alignment_for_access_p then we have set
		 DR_MISALIGNMENT which is only done if we know it at compiler
		 time, so it is safe to assume target alignment is constant.
	       */
	      unsigned int target_align =
		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
	      unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
	      poly_int64 off = 0;
	      if (negative)
		off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
	      unsigned int mis = dr_misalignment (dr_info, vectype, off);
	      mis = negative ? mis : -mis;
	      if (mis != 0)
		npeel_tmp = (mis & (target_align - 1)) / dr_size;
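	      /* For example, with a positive step, dr_size == 4,
		 target_align == 16 and a byte misalignment of 12:
		 mis becomes -12, (-12 & 15) == 4, so npeel_tmp == 1;
		 peeling one scalar iteration advances the address by
		 4 bytes and (12 + 4) & 15 == 0.  */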
	      /* For multiple types, it is possible that the bigger type access
		 will have more than one peeling option.  E.g., a loop with two
		 types: one of size (vector size / 4), and the other one of
		 size (vector size / 8).  Vectorization factor will 8.  If both
		 accesses are misaligned by 3, the first one needs one scalar
		 iteration to be aligned, and the second one needs 5.  But the
		 first one will be aligned also by peeling 5 scalar
		 iterations, and in that case both accesses will be aligned.
		 Hence, except for the immediate peeling amount, we also want
		 to try to add full vector size, while we don't exceed
		 vectorization factor.
		 We do this automatically for cost model, since we calculate
		 cost for every peeling option.  */
	      poly_uint64 nscalars = npeel_tmp;
	      if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
		{
		  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
		  nscalars = (STMT_SLP_TYPE (stmt_info)
			      ? vf * DR_GROUP_SIZE (stmt_info) : vf);
		}

	      /* Save info about DR in the hash table.  Also include peeling
		 amounts according to the explanation above.  Indicate
		 the alignment status when the ref is not aligned.
		 ??? Rather than using unknown alignment here we should
		 prune all entries from the peeling hashtable which cause
		 DRs to be not supported.  */
	      bool supportable_if_not_aligned
		= vect_supportable_dr_alignment
		    (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
	      while (known_le (npeel_tmp, nscalars))
		{
		  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
					    dr_info, npeel_tmp,
					    supportable_if_not_aligned);
		  npeel_tmp += MAX (1, target_align / dr_size);
		}

	      one_misalignment_known = true;
	    }
	  else
	    {
	      /* If we don't know any misalignment values, we prefer
		 peeling for data-ref that has the maximum number of data-refs
		 with the same alignment, unless the target prefers to align
		 stores over load.  */
	      unsigned same_align_drs = n_same_align_refs[i];
	      if (!dr0_info
		  || dr0_same_align_drs < same_align_drs)
		{
		  dr0_same_align_drs = same_align_drs;
		  dr0_info = dr_info;
		}
	      /* For data-refs with the same number of related
		 accesses prefer the one where the misalign
		 computation will be invariant in the outermost loop.  */
	      else if (dr0_same_align_drs == same_align_drs)
		{
		  class loop *ivloop0, *ivloop;
		  ivloop0 = outermost_invariant_loop_for_expr
		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
		  ivloop = outermost_invariant_loop_for_expr
		    (loop, DR_BASE_ADDRESS (dr));
		  if ((ivloop && !ivloop0)
		      || (ivloop && ivloop0
			  && flow_loop_nested_p (ivloop, ivloop0)))
		    dr0_info = dr_info;
		}

	      one_misalignment_unknown = true;

	      /* Check for data refs with unsupportable alignment that
		 can be peeled.  */
	      enum dr_alignment_support supportable_dr_alignment
		= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
						 DR_MISALIGNMENT_UNKNOWN);
	      if (supportable_dr_alignment == dr_unaligned_unsupported)
		{
		  one_dr_unsupportable = true;
		  unsupportable_dr_info = dr_info;
		}

	      if (!first_store && DR_IS_WRITE (dr))
		{
		  first_store = dr_info;
		  first_store_same_align_drs = same_align_drs;
		}
	    }
	}
      else
	{
	  if (!aligned_access_p (dr_info, vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "vector alignment may not be reachable\n");
	      break;
	    }
	}
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
				       LOOP_VINFO_IV_EXIT (loop_vinfo))
      || loop->inner)
    do_peeling = false;

  struct _vect_peel_extended_info peel_for_known_alignment;
  struct _vect_peel_extended_info peel_for_unknown_alignment;
  struct _vect_peel_extended_info best_peel;

  peel_for_unknown_alignment.inside_cost = INT_MAX;
  peel_for_unknown_alignment.outside_cost = INT_MAX;
  peel_for_unknown_alignment.peel_info.count = 0;

  if (do_peeling
      && one_misalignment_unknown)
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
	 misaligned stores are more expensive than misaligned loads (taking
	 drs with same alignment into account).  */
      unsigned int load_inside_cost = 0;
      unsigned int load_outside_cost = 0;
      unsigned int store_inside_cost = 0;
      unsigned int store_outside_cost = 0;
      unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
				      &load_inside_cost,
				      &load_outside_cost,
				      &dummy, &dummy, estimated_npeels);
      dummy.release ();

      if (first_store)
	{
	  dummy.create (2);
	  vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
					  &store_inside_cost,
					  &store_outside_cost,
					  &dummy, &dummy,
					  estimated_npeels);
	  dummy.release ();
	}
      else
	{
	  store_inside_cost = INT_MAX;
	  store_outside_cost = INT_MAX;
	}

      if (load_inside_cost > store_inside_cost
	  || (load_inside_cost == store_inside_cost
	      && load_outside_cost > store_outside_cost))
	{
	  dr0_info = first_store;
	  dr0_same_align_drs = first_store_same_align_drs;
	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
	}
      else
	{
	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
	}

      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, estimated_npeels, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
    }

  peel_for_unknown_alignment.peel_info.npeel = 0;
  peel_for_unknown_alignment.peel_info.dr_info = dr0_info;

  best_peel = peel_for_unknown_alignment;

  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr_info = NULL;

  if (do_peeling && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
	 unless aligned.  So we try to choose the best possible peeling from
	 the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
	(&peeling_htab, loop_vinfo);
    }

  /* Compare costs of peeling for known and unknown alignment. */
  if (peel_for_known_alignment.peel_info.dr_info != NULL
      && peel_for_unknown_alignment.inside_cost
	 >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
	 peeling at all except if there is an unsupportable dr that we can
	 align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
	do_peeling = false;
    }

  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0_info = unsupportable_dr_info;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
	 TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
				      &nopeel_outside_cost, &dummy, &dummy, 0);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
	 costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, 0, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0_info = best_peel.peel_info.dr_info;

      /* If no peeling is not more expensive than the best peeling we
	 have so far, don't perform any peeling.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
	do_peeling = false;
    }

  if (do_peeling)
    {
      stmt_vec_info stmt_info = dr0_info->stmt;
      if (known_alignment_for_access_p (dr0_info,
					STMT_VINFO_VECTYPE (stmt_info)))
	{
	  bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
						size_zero_node) < 0;
	  if (!npeel)
	    {
	      /* Since it's known at compile time, compute the number of
		 iterations in the peeled loop (the peeling factor) for use in
		 updating DR_MISALIGNMENT values.  The peeling factor is the
		 vectorization factor minus the misalignment as an element
		 count.  */
	      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	      poly_int64 off = 0;
	      if (negative)
		off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
	      unsigned int mis
		= dr_misalignment (dr0_info, vectype, off);
	      mis = negative ? mis : -mis;
	      /* If known_alignment_for_access_p then we have set
		 DR_MISALIGNMENT which is only done if we know it at compiler
		 time, so it is safe to assume target alignment is constant.
	       */
	      unsigned int target_align =
		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
	      npeel = ((mis & (target_align - 1))
		       / vect_get_scalar_dr_size (dr0_info));
	    }

	  /* For interleaved data access every iteration accesses all the
	     members of the group, therefore we divide the number of iterations
	     by the group size.  */
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    npeel /= DR_GROUP_SIZE (stmt_info);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Try peeling by %d\n", npeel);
	}

      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
	do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling
	  && npeel == 0
	  && known_alignment_for_access_p (dr0_info,
					   STMT_VINFO_VECTYPE (stmt_info)))
	return opt_result::success ();

      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
	{
	  unsigned max_allowed_peel
	    = param_vect_max_peeling_for_alignment;
	  if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
	    max_allowed_peel = 0;
	  if (max_allowed_peel != (unsigned)-1)
	    {
	      unsigned max_peel = npeel;
	      if (max_peel == 0)
		{
		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
		  unsigned HOST_WIDE_INT target_align_c;
		  if (target_align.is_constant (&target_align_c))
		    max_peel =
		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
		  else
		    {
		      do_peeling = false;
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
			  "Disable peeling, max peels set and vector"
			  " alignment unknown\n");
		    }
		}
	      if (max_peel > max_allowed_peel)
		{
		  do_peeling = false;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
			"Disable peeling, max peels reached: %d\n", max_peel);
		}
	    }
	}

      /* Cost model #2 - if peeling may result in a remaining loop not
	 iterating enough to be vectorized then do not peel.  Since this
	 is a cost heuristic rather than a correctness decision, use the
	 most likely runtime value for variable vectorization factors.  */
      if (do_peeling
	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
	{
	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
	      < assumed_vf + max_peel)
	    do_peeling = false;
	}

      if (do_peeling)
	{
	  /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
	     If the misalignment of DR_i is identical to that of dr0 then set
	     DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
	     dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
	     by the peeling factor times the element size of DR_i (MOD the
	     vectorization factor times the size).  Otherwise, the
	     misalignment of DR_i must be set to unknown.  */
	  FOR_EACH_VEC_ELT (datarefs, i, dr)
	    if (dr != dr0_info->dr)
	      {
		dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
		if (!vect_relevant_for_alignment_p (dr_info))
		  continue;

		vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
	      }

	  LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
	  if (npeel)
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
	  else
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
	  SET_DR_MISALIGNMENT (dr0_info,
			       vect_dr_misalign_for_aligned_access (dr0_info));
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Alignment of access forced using peeling.\n");
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Peeling for alignment will be applied.\n");
	    }

	  /* The inside-loop cost will be accounted for in vectorizable_load
	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cst_vec on the floor here.  */
	  return opt_result::success ();
	}
    }

  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed and the cost-model is not cheap
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning
    = (optimize_loop_nest_for_speed_p (loop)
       && !loop->inner /* FORNOW */
       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	{
	  dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
	  if (!vect_relevant_for_alignment_p (dr_info))
	    continue;

	  stmt_vec_info stmt_info = dr_info->stmt;
	  if (STMT_VINFO_STRIDED_P (stmt_info))
	    {
	      do_versioning = false;
	      break;
	    }

	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	  bool negative = tree_int_cst_compare (DR_STEP (dr),
						size_zero_node) < 0;
	  poly_int64 off = 0;
	  if (negative)
	    off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		   * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
	  int misalignment;
	  if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
	    continue;

	  enum dr_alignment_support supportable_dr_alignment
	    = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					     misalignment);
	  if (supportable_dr_alignment == dr_unaligned_unsupported)
	    {
	      if (misalignment != DR_MISALIGNMENT_UNKNOWN
		  || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
		      >= (unsigned) param_vect_max_version_for_alignment_checks))
		{
		  do_versioning = false;
		  break;
		}

	      /* At present we don't support versioning for alignment
		 with variable VF, since there's no guarantee that the
		 VF is a power of two.  We could relax this if we added
		 a way of enforcing a power-of-two size.  */
	      unsigned HOST_WIDE_INT size;
	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
		{
		  do_versioning = false;
		  break;
		}

	      /* Forcing alignment in the first iteration is no good if
		 we don't keep it across iterations.  For now, just disable
		 versioning in this case.
		 ?? We could actually unroll the loop to achieve the required
		 overall step alignment, and forcing the alignment could be
		 done by doing some iterations of the non-vectorized loop.  */
	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			       * DR_STEP_ALIGNMENT (dr),
			       DR_TARGET_ALIGNMENT (dr_info)))
		{
		  do_versioning = false;
		  break;
		}

	      /* The rightmost bits of an aligned address must be zeros.
		 Construct the mask needed for this test.  For example,
		 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
		 mask must be 15 = 0xf. */
	      int mask = size - 1;

	      /* FORNOW: use the same mask to test all potentially unaligned
		 references in the loop.  */
	      if (LOOP_VINFO_PTR_MASK (loop_vinfo)
		  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
		{
		  do_versioning = false;
		  break;
		}

	      LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
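	      /* E.g. with 16-byte vectors the mask is 0xf and the versioned
		 loop is guarded by a runtime test that the low four address
		 bits of every recorded reference are zero.  */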
	    }
	}

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
	do_versioning = false;
      else if (!do_versioning)
	LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      const vec<stmt_vec_info> &may_misalign_stmts
	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      stmt_vec_info stmt_info;

      /* It can now be assumed that the data references in the statements
	 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
	 of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
	{
	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
	  SET_DR_MISALIGNMENT (dr_info,
			       vect_dr_misalign_for_aligned_access (dr_info));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Alignment of access forced using versioning.\n");
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      return opt_result::success ();
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  return opt_result::success ();
}
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

opt_result
vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  unsigned int i;

  vect_record_base_alignments (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
	{
	  if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
	      && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
	    continue;
	  vect_compute_data_ref_alignment (loop_vinfo, dr_info,
					   STMT_VINFO_VECTYPE (dr_info->stmt));
	}
    }

  return opt_result::success ();
}
/* Analyze alignment of DRs of stmts in NODE.  */

static bool
vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
{
  /* Alignment is maintained in the first element of the group.  */
  stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
  tree vectype = SLP_TREE_VECTYPE (node);
  poly_uint64 vector_alignment
    = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
		 BITS_PER_UNIT);
  if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
    vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
  /* Re-analyze alignment when we're facing a vectorization with a bigger
     alignment requirement.  */
  else if (known_lt (dr_info->target_alignment, vector_alignment))
    {
      poly_uint64 old_target_alignment = dr_info->target_alignment;
      int old_misalignment = dr_info->misalignment;
      vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
      /* But keep knowledge about a smaller alignment.  */
      if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
	  && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
	{
	  dr_info->target_alignment = old_target_alignment;
	  dr_info->misalignment = old_misalignment;
	}
    }
  /* When we ever face unordered target alignments the first one wins in terms
     of analyzing and the other will become unknown in dr_misalignment.  */
  return true;
}
/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_slp_analyze_instance_alignment (vec_info *vinfo,
				     slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");

  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_node_alignment (vinfo, node))
      return false;

  if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
      && ! vect_slp_analyze_node_alignment
	     (vinfo, SLP_INSTANCE_TREE (instance)))
    return false;

  return true;
}
/* Analyze groups of accesses: check that DR_INFO belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.
   Worker for vect_analyze_group_access.  */

static bool
vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
{
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
    {
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of type size.  Otherwise there is
	 a non-element-sized gap at the end of the group which we
	 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
	 ??? As we can handle non-constant step fine here we should
	 simply remove uses of DR_GROUP_GAP between the last and first
	 element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
	 simply not include that gap.  */
      if ((dr_step % type_size) != 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Step %T is not a multiple of the element size"
			     " for %T\n",
			     step, DR_REF (dr));
	  return false;
	}
      groupsize = absu_hwi (dr_step) / type_size;
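      /* E.g. a constant step of 32 bytes over 8-byte elements yields
	 groupsize 4, whether or not all four slots in each period are
	 actually accessed.  */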
    }
  else
    groupsize = 0;

  /* Not consecutive access is possible only if it is a part of interleaving.  */
  if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Check if this DR is a part of interleaving, and is a single
	 element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the type
	 size.  */
      if (DR_IS_READ (dr)
	  && (dr_step % type_size) == 0
	  && groupsize > 0
	  /* This could be UINT_MAX but as we are generating code in a very
	     inefficient way we have to cap earlier.
	     See PR91403 for example.  */
	  && groupsize <= 4096)
	{
	  DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
	  DR_GROUP_SIZE (stmt_info) = groupsize;
	  DR_GROUP_GAP (stmt_info) = groupsize - 1;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Detected single element interleaving %T"
			     " step %T\n",
			     DR_REF (dr), step);

	  return true;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not consecutive access %G", stmt_info->stmt);

      if (bb_vinfo)
	{
	  /* Mark the statement as unvectorizable.  */
	  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	  return true;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
      STMT_VINFO_STRIDED_P (stmt_info) = true;
      return true;
    }

  if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
    {
      /* First stmt in the interleaving chain. Check the chain.  */
      stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      HOST_WIDE_INT diff, gaps = 0;

      /* By construction, all group members have INTEGER_CST DR_INITs.  */
      while (next)
	{
	  /* We never have the same DR multiple times.  */
	  gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
			DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);

	  data_ref = STMT_VINFO_DATA_REF (next);

	  /* All group members have the same STEP by construction.  */
	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

	  /* Check that the distance between two accesses is equal to the type
	     size. Otherwise, we have gaps.  */
	  diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
		  - TREE_INT_CST_LOW (prev_init)) / type_size;
	  if (diff < 1 || diff > UINT_MAX)
	    {
	      /* For artificial testcases with array accesses with large
		 constant indices we can run into overflow issues which
		 can end up fooling the groupsize constraint below so
		 check the individual gaps (which are represented as
		 unsigned int) as well.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "interleaved access with gap larger "
				 "than representable\n");
	      return false;
	    }
	  if (diff != 1)
	    {
	      /* FORNOW: SLP of accesses with gaps is not supported.  */
	      slp_impossible = true;
	      if (DR_IS_WRITE (data_ref))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "interleaved store with gaps\n");
		  return false;
		}

	      gaps += diff - 1;
	    }

	  last_accessed_element += diff;

	  /* Store the gap from the previous member of the group. If there is no
	     gap in the access, DR_GROUP_GAP is always 1.  */
	  DR_GROUP_GAP (next) = diff;

	  prev_init = DR_INIT (data_ref);
	  next = DR_GROUP_NEXT_ELEMENT (next);
	  /* Count the number of data-refs in the chain.  */
	  count++;
	}

      if (groupsize == 0)
	groupsize = count + gaps;

      /* This could be UINT_MAX but as we are generating code in a very
	 inefficient way we have to cap earlier.  See PR78699 for example.  */
      if (groupsize > 4096)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "group is too large\n");
	  return false;
	}

      /* Check that the size of the interleaving is equal to count for stores,
	 i.e., that there are no gaps.  */
      if (groupsize != count
	  && !DR_IS_READ (dr))
	{
	  groupsize = count;
	  STMT_VINFO_STRIDED_P (stmt_info) = true;
	}

      /* If there is a gap after the last load in the group it is the
	 difference between the groupsize and the last accessed
	 element.
	 When there is no gap, this difference should be 0.  */
      DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
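      /* For example, loads at byte offsets 0, 4 and 12 with 4-byte elements
	 and a 16-byte step give diffs of 1 and 2: the last member records
	 DR_GROUP_GAP == 2 (one skipped element before it), groupsize is 4,
	 last_accessed_element is 4 and the trailing gap here is 0.  */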
      DR_GROUP_SIZE (stmt_info) = groupsize;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Detected interleaving ");
	  if (DR_IS_READ (dr))
	    dump_printf (MSG_NOTE, "load ");
	  else if (STMT_VINFO_STRIDED_P (stmt_info))
	    dump_printf (MSG_NOTE, "strided store ");
	  else
	    dump_printf (MSG_NOTE, "store ");
	  dump_printf (MSG_NOTE, "of size %u\n",
		       (unsigned)groupsize);
	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
	  while (next)
	    {
	      if (DR_GROUP_GAP (next) != 1)
		dump_printf_loc (MSG_NOTE, vect_location,
				 "\t<gap of %d elements>\n",
				 DR_GROUP_GAP (next) - 1);
	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
	      next = DR_GROUP_NEXT_ELEMENT (next);
	    }
	  if (DR_GROUP_GAP (stmt_info) != 0)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "\t<gap of %d elements>\n",
			     DR_GROUP_GAP (stmt_info));
	}

      /* SLP: create an SLP data structure for every interleaving group of
	 stores for further analysis in vect_analyse_slp.  */
      if (DR_IS_WRITE (dr) && !slp_impossible)
	{
	  if (loop_vinfo)
	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
	  if (bb_vinfo)
	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
	}
    }

  return true;
}
/* Analyze groups of accesses: check that DR_INFO belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.  */

static bool
vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
{
  if (!vect_analyze_group_access_1 (vinfo, dr_info))
    {
      /* Dissolve the group if present.  */
      stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
      while (stmt_info)
	{
	  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
	  DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
	  stmt_info = next;
	}
      return false;
    }
  return true;
}
/* Analyze the access pattern of the data-reference DR_INFO.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.  */

static bool
vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
{
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return true;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data-ref access in loop\n");
      return false;
    }

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt_info))
	return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
	 with pragma omp simd only - it guarantees absence of
	 loop-carried dependencies between inner loop iterations.  */
      if (loop->safelen < 2)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in inner loop of nest\n");
	  return false;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt_info))
    {
      /* Interleaved accesses are not yet supported within outer-loop
	 vectorization for references in the inner-loop.  */
      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in outer loop.\n");
	  return DR_IS_READ (dr);
	}
    }

  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
	  || (dr_step < 0
	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
	{
	  /* Mark that it is not interleaving.  */
	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
	  return true;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "grouped access in outer loop.\n");
      return false;
    }

  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
		|| vect_analyze_group_access (vinfo, dr_info)));

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (vinfo, dr_info);
}
/* Compare two data-references DRA and DRB to group them into chunks
   suitable for grouping.  */

static int
dr_group_sort_cmp (const void *dra_, const void *drb_)
{
  dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
  dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
  data_reference_p dra = dra_info->dr;
  data_reference_p drb = drb_info->dr;
  int cmp;

  /* Stabilize sort.  */
  if (dra == drb)
    return 0;

  /* DRs with different group IDs never belong to the same group.  */
  if (dra_info->group != drb_info->group)
    return dra_info->group < drb_info->group ? -1 : 1;

  /* Ordering of DRs according to base.  */
  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
			       DR_BASE_ADDRESS (drb));
  if (cmp != 0)
    return cmp;

  /* And according to DR_OFFSET.  */
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
  if (cmp != 0)
    return cmp;

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort after access size.  */
  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
  if (cmp != 0)
    return cmp;

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
  if (cmp != 0)
    return cmp;

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}
/* If OP is the result of a conversion, return the unconverted value,
   otherwise return null.  */

static tree
strip_conversion (tree op)
{
  if (TREE_CODE (op) != SSA_NAME)
    return NULL_TREE;
  gimple *stmt = SSA_NAME_DEF_STMT (op);
  if (!is_gimple_assign (stmt)
      || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
    return NULL_TREE;
  return gimple_assign_rhs1 (stmt);
}
/* Return true if vectorizable_* routines can handle statements STMT1_INFO
   and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
   be grouped in SLP mode.  */

static bool
can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
		   bool allow_slp_p)
{
  if (gimple_assign_single_p (stmt1_info->stmt))
    return gimple_assign_single_p (stmt2_info->stmt);

  gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
  if (call1 && gimple_call_internal_p (call1))
    {
      /* Check for two masked loads or two masked stores.  */
      gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
      if (!call2 || !gimple_call_internal_p (call2))
	return false;
      internal_fn ifn = gimple_call_internal_fn (call1);
      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
	return false;
      if (ifn != gimple_call_internal_fn (call2))
	return false;

      /* Check that the masks are the same.  Cope with casts of masks,
	 like those created by build_mask_conversion.  */
      tree mask1 = gimple_call_arg (call1, 2);
      tree mask2 = gimple_call_arg (call2, 2);
      if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
	{
	  mask1 = strip_conversion (mask1);
	  if (!mask1)
	    return false;
	  mask2 = strip_conversion (mask2);
	  if (!mask2)
	    return false;
	  if (!operand_equal_p (mask1, mask2, 0))
	    return false;
	}
      return true;
    }

  return false;
}
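/* Note for the masked case above: two IFN_MASK_LOADs whose masks differ only
   by a conversion created by build_mask_conversion compare equal once both
   masks are stripped back to the underlying boolean SSA name.  */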
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
	   simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.  */

opt_result
vect_analyze_data_ref_accesses (vec_info *vinfo,
				vec<int> *dataref_groups)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");

  if (datarefs.is_empty ())
    return opt_result::success ();

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<dr_vec_info *> datarefs_copy;
  datarefs_copy.create (datarefs.length ());
  for (unsigned i = 0; i < datarefs.length (); i++)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
      /* If the caller computed DR grouping use that, otherwise group by
	 basic blocks.  */
      if (dataref_groups)
	dr_info->group = (*dataref_groups)[i];
      else
	dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
      datarefs_copy.quick_push (dr_info);
    }
  datarefs_copy.qsort (dr_group_sort_cmp);
  hash_set<stmt_vec_info> to_fixup;

  /* Build the interleaving chains.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      dr_vec_info *dr_info_a = datarefs_copy[i];
      data_reference_p dra = dr_info_a->dr;
      int dra_group_id = dr_info_a->group;
      stmt_vec_info stmtinfo_a = dr_info_a->stmt;
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
	{
	  ++i;
	  continue;
	}
      for (i = i + 1; i < datarefs_copy.length (); ++i)
	{
	  dr_vec_info *dr_info_b = datarefs_copy[i];
	  data_reference_p drb = dr_info_b->dr;
	  int drb_group_id = dr_info_b->group;
	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	    break;

	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group to be artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  The we can just skip ahead to the next DR here.  */

	  /* DRs in a different DR group should not be put into the same
	     interleaving group.  */
	  if (dra_group_id != drb_group_id)
	    break;

	  /* Check that the data-refs have same first location (except init)
	     and they are both either store or load (not load and store,
	     not masked loads or stores).  */
	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
					DR_BASE_ADDRESS (drb)) != 0
	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
	    break;

	  /* Check that the data-refs have the same constant size.  */
	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
	  if (!tree_fits_uhwi_p (sza)
	      || !tree_fits_uhwi_p (szb)
	      || !tree_int_cst_equal (sza, szb))
	    break;

	  /* Check that the data-refs have the same step.  */
	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
	    break;

	  /* Check the types are compatible.
	     ??? We don't distinguish this during sorting.  */
	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
				   TREE_TYPE (DR_REF (drb))))
	    break;

	  /* Check that the DR_INITs are compile-time constants.  */
	  if (!tree_fits_shwi_p (DR_INIT (dra))
	      || !tree_fits_shwi_p (DR_INIT (drb)))
	    break;

	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
	     just hold extra information.  */
	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
	    break;

	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
	  HOST_WIDE_INT init_prev
	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
	  gcc_assert (init_a <= init_b
		      && init_a <= init_prev
		      && init_prev <= init_b);
	  /* Do not place the same access in the interleaving chain twice.  */
	  if (init_b == init_prev)
	    {
	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
			  < gimple_uid (DR_STMT (drb)));
	      /* Simply link in duplicates and fix up the chain below.  */
	    }
	  else
	    {
	      /* If init_b == init_a + the size of the type * k, we have an
		 interleaving, and DRA is accessed before DRB.  */
	      unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
	      if (type_size_a == 0
		  || (((unsigned HOST_WIDE_INT)init_b - init_a)
		      % type_size_a != 0))
		break;

	      /* If we have a store, the accesses are adjacent.  This splits
		 groups into chunks we support (we don't support vectorization
		 of stores with gaps).  */
	      if (!DR_IS_READ (dra)
		  && (((unsigned HOST_WIDE_INT)init_b - init_prev)
		      != type_size_a))
		break;

	      /* If the step (if not zero or non-constant) is smaller than the
		 difference between data-refs' inits this splits groups into
		 suitable sizes.  */
	      if (tree_fits_shwi_p (DR_STEP (dra)))
		{
		  unsigned HOST_WIDE_INT step
		    = absu_hwi (tree_to_shwi (DR_STEP (dra)));
		  if (step != 0
		      && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
		    break;
		}
	    }

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     DR_IS_READ (dra)
			     ? "Detected interleaving load %T and %T\n"
			     : "Detected interleaving store %T and %T\n",
			     DR_REF (dra), DR_REF (drb));

	  /* Link the found element into the group list.  */
	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
	    {
	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
	      lastinfo = stmtinfo_a;
	    }
	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
	  lastinfo = stmtinfo_b;

	  STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
	    = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);

	  if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Load suitable for SLP vectorization only.\n");

	  if (init_b == init_prev
	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Queuing group with duplicate access for fixup\n");
	}
    }

  /* Fixup groups with duplicate entries by splitting it.  */
  while (1)
    {
      hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
      if (!(it != to_fixup.end ()))
	break;
      stmt_vec_info grp = *it;
      to_fixup.remove (grp);

      /* Find the earliest duplicate group member.  */
      unsigned first_duplicate = -1u;
      stmt_vec_info next, g = grp;
      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
	{
	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
	  g = next;
	}
      if (first_duplicate == -1U)
	continue;

      /* Then move all stmts after the first duplicate to a new group.
	 Note this is a heuristic but one with the property that *it
	 is fixed up completely.  */
      g = grp;
      stmt_vec_info newgroup = NULL, ng = grp;
      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
	{
	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
	    {
	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
	      if (!newgroup)
		newgroup = next;
	      else
		DR_GROUP_NEXT_ELEMENT (ng) = next;
	      ng = next;
	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
	    }
	  else
	    g = DR_GROUP_NEXT_ELEMENT (g);
	}
      DR_GROUP_NEXT_ELEMENT (ng) = NULL;

      /* Fixup the new group which still may contain duplicates.  */
      to_fixup.add (newgroup);
    }

  dr_vec_info *dr_info;
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
    {
      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
	  && !vect_analyze_data_ref_access (vinfo, dr_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: complicated access pattern.\n");

	  if (is_a <bb_vec_info> (vinfo))
	    {
	      /* Mark the statement as not vectorizable.  */
	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
	      continue;
	    }
	  else
	    {
	      datarefs_copy.release ();
	      return opt_result::failure_at (dr_info->stmt->stmt,
					     "not vectorized:"
					     " complicated access pattern.\n");
	    }
	}
    }

  datarefs_copy.release ();
  return opt_result::success ();
}
/* Function vect_vfa_segment_size.

   Input:
     DR_INFO: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return a value suitable for the dr_with_seg_len::seg_len field.
   This is the "distance travelled" by the pointer from the first
   iteration in the segment to the last.  Note that it does not include
   the size of the access; in effect it only describes the first byte.  */

static tree
vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
{
  length_factor = size_binop (MINUS_EXPR,
			      fold_convert (sizetype, length_factor),
			      size_one_node);
  return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
		     length_factor);
}
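/* For instance (purely illustrative numbers): with a DR_STEP of 4 bytes
   and a LENGTH_FACTOR of 8 iterations, the value computed above is

     seg_len = 4 * (8 - 1) = 28 bytes

   i.e. the distance from the first byte accessed in the first iteration
   of the segment to the first byte accessed in the last one.  */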
/* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
   gives the worst-case number of bytes covered by the segment.  */

static unsigned HOST_WIDE_INT
vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
{
  stmt_vec_info stmt_vinfo = dr_info->stmt;
  tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
  unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
  unsigned HOST_WIDE_INT access_size = ref_size;
  if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
    {
      gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
      access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
    }
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int misalignment;
  if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
      && ((misalignment = dr_misalignment (dr_info, vectype)), true)
      && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
	  == dr_explicit_realign_optimized))
    {
      /* We might access a full vector's worth.  */
      access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
    }
  return access_size;
}
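/* E.g. (illustrative only): for a group of 4 contiguous "int" accesses
   with no gap, REF_SIZE is 4 and ACCESS_SIZE becomes 4 * 4 = 16, so the
   segment covers at worst seg_len + 16 bytes; for an ungrouped access
   ACCESS_SIZE is simply the size of the scalar reference.  */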
/* Get the minimum alignment for all the scalar accesses that DR_INFO
   describes.  */

static unsigned int
vect_vfa_align (dr_vec_info *dr_info)
{
  return dr_alignment (dr_info->dr);
}
/* Function vect_compile_time_alias.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

static int
vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
			 tree segment_length_a, tree segment_length_b,
			 unsigned HOST_WIDE_INT access_size_a,
			 unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
			      offset_b, const_length_b))
    return 1;

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
			       offset_b, const_length_b))
    return 0;

  return -1;
}
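/* A small worked example (values invented for illustration): with

     A: offset 0,  segment length 16, access size 4  ->  range [0, 20)
     B: offset 32, segment length 16, access size 4  ->  range [32, 52)

   the ranges can never overlap, so the function returns 0 (no alias).
   If B instead started at offset 8, the ranges would be known to overlap
   and the function would return 1; with symbolic (poly_int) lengths that
   are neither provably overlapping nor provably disjoint it returns -1.  */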
/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
   in DDR is >= VF.  */

static bool
dependence_distance_ge_vf (data_dependence_relation *ddr,
			   unsigned int loop_depth, poly_uint64 vf)
{
  if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
      || DDR_NUM_DIST_VECTS (ddr) == 0)
    return false;

  /* If the dependence is exact, we should have limited the VF instead.  */
  gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));

  unsigned int i;
  lambda_vector dist_v;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      HOST_WIDE_INT dist = dist_v[loop_depth];
      if (dist != 0
	  && !(dist > 0 && DDR_REVERSED_P (ddr))
	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
	return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "dependence distance between %T and %T is >= VF\n",
		     DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));

  return true;
}
/* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */

static void
dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
{
  dump_printf (dump_kind, "%s (%T) >= ",
	       lower_bound.unsigned_p ? "unsigned" : "abs",
	       lower_bound.expr);
  dump_dec (dump_kind, lower_bound.min_value);
}
/* Record that the vectorized loop requires the vec_lower_bound described
   by EXPR, UNSIGNED_P and MIN_VALUE.  */

static void
vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
			poly_uint64 min_value)
{
  vec<vec_lower_bound> &lower_bounds
    = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    if (operand_equal_p (lower_bounds[i].expr, expr, 0))
      {
	unsigned_p &= lower_bounds[i].unsigned_p;
	min_value = upper_bound (lower_bounds[i].min_value, min_value);
	if (lower_bounds[i].unsigned_p != unsigned_p
	    || maybe_lt (lower_bounds[i].min_value, min_value))
	  {
	    lower_bounds[i].unsigned_p = unsigned_p;
	    lower_bounds[i].min_value = min_value;
	    if (dump_enabled_p ())
	      {
		dump_printf_loc (MSG_NOTE, vect_location,
				 "updating run-time check to ");
		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
		dump_printf (MSG_NOTE, "\n");
	      }
	  }
	return;
      }

  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
      dump_lower_bound (MSG_NOTE, lower_bound);
      dump_printf (MSG_NOTE, "\n");
    }
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
}
/* Return true if it's unlikely that the step of the vectorized form of DR_INFO
   will span fewer than GAP bytes.  */

static bool
vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
		  poly_int64 gap)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  HOST_WIDE_INT count
    = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  if (DR_GROUP_FIRST_ELEMENT (stmt_info))
    count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
  return (estimated_poly_value (gap)
	  <= count * vect_get_scalar_dr_size (dr_info));
}
/* Return true if we know that there is no alias between DR_INFO_A and
   DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
   When returning true, set *LOWER_BOUND_OUT to this N.  */

static bool
vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
				poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  */
  data_reference *dr_a = dr_info_a->dr;
  data_reference *dr_b = dr_info_b->dr;
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_info_a, dr_info_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
      && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
  return true;
}
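/* For example (illustrative values): if DR_A and DR_B each access 4 bytes
   at DR_INITs 0 and 4 from the same base, one scalar iteration spans the
   bytes [0, 8), so *LOWER_BOUND_OUT is set to 8 and the two references
   cannot alias across iterations whenever abs (DR_STEP) >= 8.  */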
/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
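/* The run-time check eventually emitted for one surviving pair is
   conceptually of the form (sketch only; the actual sequence is built
   later from the dr_with_seg_len_pair_t records pushed below):

     if (addr_a + seg_len_a + access_size_a <= addr_b
	 || addr_b + seg_len_b + access_size_b <= addr_a)
       ... vectorized loop ...
     else
       ... scalar fallback ...

   so every entry kept in COMP_ALIAS_DDRS costs a couple of compares and
   branches at run time, which is why the list is merged, pruned and
   capped here.  */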
3891 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3893 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3894 hash_set
<tree_pair_hash
> compared_objects
;
3896 const vec
<ddr_p
> &may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3897 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3898 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3899 const vec
<vec_object_pair
> &check_unequal_addrs
3900 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3901 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3902 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3908 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3910 /* Step values are irrelevant for aliasing if the number of vector
3911 iterations is equal to the number of scalar iterations (which can
3912 happen for fully-SLP loops). */
3913 bool vf_one_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3917 /* Convert the checks for nonzero steps into bound tests. */
3919 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3920 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3923 if (may_alias_ddrs
.is_empty ())
3924 return opt_result::success ();
3926 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3928 unsigned int loop_depth
3929 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3930 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3932 /* First, we collect all data ref pairs for aliasing checks. */
3933 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3935 poly_uint64 lower_bound
;
3936 tree segment_length_a
, segment_length_b
;
3937 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3938 unsigned int align_a
, align_b
;
3940 /* Ignore the alias if the VF we chose ended up being no greater
3941 than the dependence distance. */
3942 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3945 if (DDR_OBJECT_A (ddr
))
3947 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3948 if (!compared_objects
.add (new_pair
))
3950 if (dump_enabled_p ())
3951 dump_printf_loc (MSG_NOTE
, vect_location
,
3952 "checking that %T and %T"
3953 " have different addresses\n",
3954 new_pair
.first
, new_pair
.second
);
3955 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3960 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3961 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3963 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3964 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3966 bool preserves_scalar_order_p
3967 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3970 && (preserves_scalar_order_p
3971 || operand_equal_p (DR_STEP (dr_info_a
->dr
),
3972 DR_STEP (dr_info_b
->dr
))));
3974 /* Skip the pair if inter-iteration dependencies are irrelevant
3975 and intra-iteration dependencies are guaranteed to be honored. */
3977 && (preserves_scalar_order_p
3978 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3981 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_NOTE
, vect_location
,
3983 "no need for alias check between "
3984 "%T and %T when VF is 1\n",
3985 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3989 /* See whether we can handle the alias using a bounds check on
3990 the step, and whether that's likely to be the best approach.
3991 (It might not be, for example, if the minimum step is much larger
3992 than the number of bytes handled by one vector iteration.) */
3994 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3995 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3997 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3998 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
4000 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
4001 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
4004 "%T and %T when the step %T is outside ",
4005 DR_REF (dr_info_a
->dr
),
4006 DR_REF (dr_info_b
->dr
),
4007 DR_STEP (dr_info_a
->dr
));
4009 dump_printf (MSG_NOTE
, "[0");
4012 dump_printf (MSG_NOTE
, "(");
4013 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
4015 dump_printf (MSG_NOTE
, ", ");
4016 dump_dec (MSG_NOTE
, lower_bound
);
4017 dump_printf (MSG_NOTE
, ")\n");
4019 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
4020 unsigned_p
, lower_bound
);
4024 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
4025 if (dr_group_first_a
)
4027 stmt_info_a
= dr_group_first_a
;
4028 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
4031 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
4032 if (dr_group_first_b
)
4034 stmt_info_b
= dr_group_first_b
;
4035 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
4040 segment_length_a
= size_zero_node
;
4041 segment_length_b
= size_zero_node
;
4045 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
4046 DR_STEP (dr_info_b
->dr
), 0))
4047 length_factor
= scalar_loop_iters
;
4049 length_factor
= size_int (vect_factor
);
4050 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
4051 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
4053 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
4054 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
4055 align_a
= vect_vfa_align (dr_info_a
);
4056 align_b
= vect_vfa_align (dr_info_b
);
4058 /* See whether the alias is known at compilation time. */
4059 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
4060 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
4061 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
4062 DR_OFFSET (dr_info_b
->dr
), 0)
4063 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
4064 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
4065 && poly_int_tree_p (segment_length_a
)
4066 && poly_int_tree_p (segment_length_b
))
4068 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
4073 if (res
>= 0 && dump_enabled_p ())
4075 dump_printf_loc (MSG_NOTE
, vect_location
,
4076 "can tell at compile time that %T and %T",
4077 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
4079 dump_printf (MSG_NOTE
, " do not alias\n");
4081 dump_printf (MSG_NOTE
, " alias\n");
4088 return opt_result::failure_at (stmt_info_b
->stmt
,
4090 " compilation time alias: %G%G",
4095 dr_with_seg_len
dr_a (dr_info_a
->dr
, segment_length_a
,
4096 access_size_a
, align_a
);
4097 dr_with_seg_len
dr_b (dr_info_b
->dr
, segment_length_b
,
4098 access_size_b
, align_b
);
4099 /* Canonicalize the order to be the one that's needed for accurate
4100 RAW, WAR and WAW flags, in cases where the data references are
4101 well-ordered. The order doesn't really matter otherwise,
4102 but we might as well be consistent. */
4103 if (get_later_stmt (stmt_info_a
, stmt_info_b
) == stmt_info_a
)
4104 std::swap (dr_a
, dr_b
);
4106 dr_with_seg_len_pair_t dr_with_seg_len_pair
4107 (dr_a
, dr_b
, (preserves_scalar_order_p
4108 ? dr_with_seg_len_pair_t::WELL_ORDERED
4109 : dr_with_seg_len_pair_t::REORDERED
));
4111 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
4114 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
4116 unsigned int count
= (comp_alias_ddrs
.length ()
4117 + check_unequal_addrs
.length ());
4120 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
))
4121 == VECT_COST_MODEL_VERY_CHEAP
))
4122 return opt_result::failure_at
4123 (vect_location
, "would need a runtime alias check\n");
4125 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_NOTE
, vect_location
,
4127 "improved number of alias checks from %d to %d\n",
4128 may_alias_ddrs
.length (), count
);
4129 unsigned limit
= param_vect_max_version_for_alias_checks
;
4130 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)) == VECT_COST_MODEL_CHEAP
)
4131 limit
= param_vect_max_version_for_alias_checks
* 6 / 10;
4133 return opt_result::failure_at
4135 "number of versioning for alias run-time tests exceeds %d "
4136 "(--param vect-max-version-for-alias-checks)\n", limit
);
4138 return opt_result::success ();
4141 /* Check whether we can use an internal function for a gather load
4142 or scatter store. READ_P is true for loads and false for stores.
4143 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4144 the type of the memory elements being loaded or stored. OFFSET_TYPE
4145 is the type of the offset that is being applied to the invariant
4146 base address. SCALE is the amount by which the offset should
4147 be multiplied *after* it has been converted to address width.
4149 Return true if the function is supported, storing the function id in
4150 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
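/* As a sketch of what is being matched (simplified; argument details of
   the internal calls are elided): a scalar loop such as

     for (i = 0; i < n; ++i)
       x[i] = base[idx[i]];

   is vectorized, roughly, into a call

     vect_x = .GATHER_LOAD (base, vect_idx, scale, ...);

   where VECTYPE describes vect_x, OFFSET_TYPE the elements of vect_idx
   and SCALE the byte multiplier applied to each offset after conversion.
   This routine only checks that the target implements a suitable optab
   combination; it does not emit the call itself.  */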
4153 vect_gather_scatter_fn_p (vec_info
*vinfo
, bool read_p
, bool masked_p
,
4154 tree vectype
, tree memory_type
, tree offset_type
,
4155 int scale
, internal_fn
*ifn_out
,
4156 tree
*offset_vectype_out
)
4158 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
4159 unsigned int element_bits
= vector_element_bits (vectype
);
4160 if (element_bits
!= memory_bits
)
4161 /* For now the vector elements must be the same width as the
4165 /* Work out which function we need. */
4166 internal_fn ifn
, alt_ifn
, alt_ifn2
;
4169 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
4170 alt_ifn
= IFN_MASK_GATHER_LOAD
;
4171 /* When target supports MASK_LEN_GATHER_LOAD, we always
use MASK_LEN_GATHER_LOAD regardless of whether len and
4173 mask are valid or not. */
4174 alt_ifn2
= IFN_MASK_LEN_GATHER_LOAD
;
4178 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
4179 alt_ifn
= IFN_MASK_SCATTER_STORE
;
4180 /* When target supports MASK_LEN_SCATTER_STORE, we always
use MASK_LEN_SCATTER_STORE regardless of whether len and
4182 mask are valid or not. */
4183 alt_ifn2
= IFN_MASK_LEN_SCATTER_STORE
;
4188 tree offset_vectype
= get_vectype_for_scalar_type (vinfo
, offset_type
);
4189 if (!offset_vectype
)
4192 /* Test whether the target supports this combination. */
4193 if (internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
4194 offset_vectype
, scale
))
4197 *offset_vectype_out
= offset_vectype
;
4201 && internal_gather_scatter_fn_supported_p (alt_ifn
, vectype
,
4207 *offset_vectype_out
= offset_vectype
;
4210 else if (internal_gather_scatter_fn_supported_p (alt_ifn2
, vectype
,
4212 offset_vectype
, scale
))
4214 *ifn_out
= alt_ifn2
;
4215 *offset_vectype_out
= offset_vectype
;
4219 if (TYPE_PRECISION (offset_type
) >= POINTER_SIZE
4220 && TYPE_PRECISION (offset_type
) >= element_bits
)
4223 offset_type
= build_nonstandard_integer_type
4224 (TYPE_PRECISION (offset_type
) * 2, TYPE_UNSIGNED (offset_type
));
4228 /* STMT_INFO is a call to an internal gather load or scatter store function.
4229 Describe the operation in INFO. */
4232 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
4233 gather_scatter_info
*info
)
4235 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
4236 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4237 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4239 info
->ifn
= gimple_call_internal_fn (call
);
4240 info
->decl
= NULL_TREE
;
4241 info
->base
= gimple_call_arg (call
, 0);
4242 info
->offset
= gimple_call_arg (call
, 1);
4243 info
->offset_dt
= vect_unknown_def_type
;
4244 info
->offset_vectype
= NULL_TREE
;
4245 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
4246 info
->element_type
= TREE_TYPE (vectype
);
4247 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
4250 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4251 gather load or scatter store. Describe the operation in *INFO if so. */
4254 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
4255 gather_scatter_info
*info
)
4257 HOST_WIDE_INT scale
= 1;
4258 poly_int64 pbitpos
, pbitsize
;
4259 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4260 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4261 tree offtype
= NULL_TREE
;
4262 tree decl
= NULL_TREE
, base
, off
;
4263 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4264 tree memory_type
= TREE_TYPE (DR_REF (dr
));
4266 int punsignedp
, reversep
, pvolatilep
= 0;
4268 tree offset_vectype
;
4269 bool masked_p
= false;
4271 /* See whether this is already a call to a gather/scatter internal function.
4272 If not, see whether it's a masked load or store. */
4273 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
4274 if (call
&& gimple_call_internal_p (call
))
4276 ifn
= gimple_call_internal_fn (call
);
4277 if (internal_gather_scatter_fn_p (ifn
))
4279 vect_describe_gather_scatter_call (stmt_info
, info
);
4282 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
4285 /* True if we should aim to use internal functions rather than
4286 built-in functions. */
4287 bool use_ifn_p
= (DR_IS_READ (dr
)
4288 ? supports_vec_gather_load_p (TYPE_MODE (vectype
))
4289 : supports_vec_scatter_store_p (TYPE_MODE (vectype
)));
4292 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4293 see if we can use the def stmt of the address. */
4295 && TREE_CODE (base
) == MEM_REF
4296 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
4297 && integer_zerop (TREE_OPERAND (base
, 1))
4298 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
4300 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
4301 if (is_gimple_assign (def_stmt
)
4302 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
4303 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
4306 /* The gather and scatter builtins need address of the form
4307 loop_invariant + vector * {1, 2, 4, 8}
4309 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4310 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4311 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4312 multiplications and additions in it. To get a vector, we need
4313 a single SSA_NAME that will be defined in the loop and will
4314 contain everything that is not loop invariant and that can be
vectorized.  The following code attempts to find such a preexisting
4316 SSA_NAME OFF and put the loop invariants into a tree BASE
4317 that can be gimplified before the loop. */
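  /* For instance (illustrative): for a reference like

       p->data[i * 4 + j]

     with P and J loop-invariant, the loop-variant part of the address
     should end up as a single SSA_NAME OFF (here derived from i * 4),
     while the invariant remainder folds into BASE and is gimplified on
     the loop preheader edge.  If no such split can be found, the
     gather/scatter form is rejected further below.  */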
4318 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
4319 &punsignedp
, &reversep
, &pvolatilep
);
4323 /* PR 107346. Packed structs can have fields at offsets that are not
4324 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4325 if (!multiple_p (pbitpos
, BITS_PER_UNIT
))
4328 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
4330 if (TREE_CODE (base
) == MEM_REF
)
4332 if (!integer_zerop (TREE_OPERAND (base
, 1)))
4334 if (off
== NULL_TREE
)
4335 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
4337 off
= size_binop (PLUS_EXPR
, off
,
4338 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
4340 base
= TREE_OPERAND (base
, 0);
4343 base
= build_fold_addr_expr (base
);
4345 if (off
== NULL_TREE
)
4346 off
= size_zero_node
;
  /* If base is not loop invariant, then either off is 0 and we start with
     just the constant offset in the loop invariant BASE, continuing with
     base as OFF, or we give up.
     We could handle the latter case by gimplifying the addition of base + off
     into some SSA_NAME and using that as off, but for now punt.  */
4353 if (!expr_invariant_in_loop_p (loop
, base
))
4355 if (!integer_zerop (off
))
4358 base
= size_int (pbytepos
);
4360 /* Otherwise put base + constant offset into the loop invariant BASE
4361 and continue with OFF. */
4364 base
= fold_convert (sizetype
, base
);
4365 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
4368 /* OFF at this point may be either a SSA_NAME or some tree expression
4369 from get_inner_reference. Try to peel off loop invariants from it
4370 into BASE as long as possible. */
4372 while (offtype
== NULL_TREE
)
4374 enum tree_code code
;
4375 tree op0
, op1
, add
= NULL_TREE
;
4377 if (TREE_CODE (off
) == SSA_NAME
)
4379 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
4381 if (expr_invariant_in_loop_p (loop
, off
))
4384 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
4387 op0
= gimple_assign_rhs1 (def_stmt
);
4388 code
= gimple_assign_rhs_code (def_stmt
);
4389 op1
= gimple_assign_rhs2 (def_stmt
);
4393 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
4395 code
= TREE_CODE (off
);
4396 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
4400 case POINTER_PLUS_EXPR
:
4402 if (expr_invariant_in_loop_p (loop
, op0
))
4407 add
= fold_convert (sizetype
, add
);
4409 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
4410 base
= size_binop (PLUS_EXPR
, base
, add
);
4413 if (expr_invariant_in_loop_p (loop
, op1
))
4421 if (expr_invariant_in_loop_p (loop
, op1
))
4423 add
= fold_convert (sizetype
, op1
);
4424 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
4430 if (scale
== 1 && tree_fits_shwi_p (op1
))
4432 int new_scale
= tree_to_shwi (op1
);
4433 /* Only treat this as a scaling operation if the target
4434 supports it for at least some offset type. */
4436 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4437 masked_p
, vectype
, memory_type
,
4438 signed_char_type_node
,
4441 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4442 masked_p
, vectype
, memory_type
,
4443 unsigned_char_type_node
,
4456 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
4457 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
4460 /* Don't include the conversion if the target is happy with
4461 the current offset type. */
4463 && TREE_CODE (off
) == SSA_NAME
4464 && !POINTER_TYPE_P (TREE_TYPE (off
))
4465 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4466 masked_p
, vectype
, memory_type
,
4467 TREE_TYPE (off
), scale
, &ifn
,
4471 if (TYPE_PRECISION (TREE_TYPE (op0
))
4472 == TYPE_PRECISION (TREE_TYPE (off
)))
4478 /* Include the conversion if it is widening and we're using
4479 the IFN path or the target can handle the converted from
4480 offset or the current size is not already the same as the
4481 data vector element size. */
4482 if ((TYPE_PRECISION (TREE_TYPE (op0
))
4483 < TYPE_PRECISION (TREE_TYPE (off
)))
4486 ? (targetm
.vectorize
.builtin_gather
4487 && targetm
.vectorize
.builtin_gather (vectype
,
4490 : (targetm
.vectorize
.builtin_scatter
4491 && targetm
.vectorize
.builtin_scatter (vectype
,
4494 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off
)),
4495 TYPE_SIZE (TREE_TYPE (vectype
)), 0)))
4498 offtype
= TREE_TYPE (off
);
4509 /* If at the end OFF still isn't a SSA_NAME or isn't
4510 defined in the loop, punt. */
4511 if (TREE_CODE (off
) != SSA_NAME
4512 || expr_invariant_in_loop_p (loop
, off
))
4515 if (offtype
== NULL_TREE
)
4516 offtype
= TREE_TYPE (off
);
4520 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4521 vectype
, memory_type
, offtype
, scale
,
4522 &ifn
, &offset_vectype
))
4528 if (DR_IS_READ (dr
))
4530 if (targetm
.vectorize
.builtin_gather
)
4531 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4535 if (targetm
.vectorize
.builtin_scatter
)
4536 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4539 /* The offset vector type will be read from DECL when needed. */
4540 offset_vectype
= NULL_TREE
;
4547 info
->offset_dt
= vect_unknown_def_type
;
4548 info
->offset_vectype
= offset_vectype
;
4549 info
->scale
= scale
;
4550 info
->element_type
= TREE_TYPE (vectype
);
4551 info
->memory_type
= memory_type
;
4555 /* Find the data references in STMT, analyze them with respect to LOOP and
4556 append them to DATAREFS. Return false if datarefs in this stmt cannot
4560 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4561 vec
<data_reference_p
> *datarefs
,
4562 vec
<int> *dataref_groups
, int group_id
)
4564 /* We can ignore clobbers for dataref analysis - they are removed during
4565 loop vectorization and BB vectorization checks dependences with a
4567 if (gimple_clobber_p (stmt
))
4568 return opt_result::success ();
4570 if (gimple_has_volatile_ops (stmt
))
4571 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4574 if (stmt_can_throw_internal (cfun
, stmt
))
4575 return opt_result::failure_at (stmt
,
4577 " statement can throw an exception: %G",
4580 auto_vec
<data_reference_p
, 2> refs
;
4581 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4585 if (refs
.is_empty ())
4586 return opt_result::success ();
4588 if (refs
.length () > 1)
4590 while (!refs
.is_empty ())
4591 free_data_ref (refs
.pop ());
4592 return opt_result::failure_at (stmt
,
4593 "not vectorized: more than one "
4594 "data ref in stmt: %G", stmt
);
4597 data_reference_p dr
= refs
.pop ();
4598 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4599 if (!gimple_call_internal_p (call
)
4600 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4601 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4604 return opt_result::failure_at (stmt
,
4605 "not vectorized: dr in a call %G", stmt
);
4608 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4609 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4612 return opt_result::failure_at (stmt
,
4614 " statement is an unsupported"
4615 " bitfield access %G", stmt
);
4618 if (DR_BASE_ADDRESS (dr
)
4619 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4622 return opt_result::failure_at (stmt
,
4624 " base addr of dr is a constant\n");
4627 /* Check whether this may be a SIMD lane access and adjust the
4628 DR to make it easier for us to handle it. */
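  /* The shape being matched is, roughly (sketch of the GIMPLE involved):

       _5 = .GOMP_SIMD_LANE (simduid.0, ...);
       D.1234[_5] = x_3;

     i.e. an access whose analyzed DR has a zero step and whose offset is
     the SIMD lane number scaled by the element size.  The code below
     rewrites such a DR to use the element size as its step so that the
     rest of the vectorizer can treat it like an ordinary linear access.  */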
4631 && (!DR_BASE_ADDRESS (dr
)
4636 struct data_reference
*newdr
4637 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4638 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4639 if (DR_BASE_ADDRESS (newdr
)
4640 && DR_OFFSET (newdr
)
4643 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4644 && integer_zerop (DR_STEP (newdr
)))
4646 tree base_address
= DR_BASE_ADDRESS (newdr
);
4647 tree off
= DR_OFFSET (newdr
);
4648 tree step
= ssize_int (1);
4649 if (integer_zerop (off
)
4650 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4652 off
= TREE_OPERAND (base_address
, 1);
4653 base_address
= TREE_OPERAND (base_address
, 0);
4656 if (TREE_CODE (off
) == MULT_EXPR
4657 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4659 step
= TREE_OPERAND (off
, 1);
4660 off
= TREE_OPERAND (off
, 0);
4663 if (CONVERT_EXPR_P (off
)
4664 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4665 < TYPE_PRECISION (TREE_TYPE (off
))))
4666 off
= TREE_OPERAND (off
, 0);
4667 if (TREE_CODE (off
) == SSA_NAME
)
4669 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4670 /* Look through widening conversion. */
4671 if (is_gimple_assign (def
)
4672 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4674 tree rhs1
= gimple_assign_rhs1 (def
);
4675 if (TREE_CODE (rhs1
) == SSA_NAME
4676 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4677 && (TYPE_PRECISION (TREE_TYPE (off
))
4678 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4679 def
= SSA_NAME_DEF_STMT (rhs1
);
4681 if (is_gimple_call (def
)
4682 && gimple_call_internal_p (def
)
4683 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4685 tree arg
= gimple_call_arg (def
, 0);
4686 tree reft
= TREE_TYPE (DR_REF (newdr
));
4687 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4688 arg
= SSA_NAME_VAR (arg
);
4689 if (arg
== loop
->simduid
4691 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4693 DR_BASE_ADDRESS (newdr
) = base_address
;
4694 DR_OFFSET (newdr
) = ssize_int (0);
4695 DR_STEP (newdr
) = step
;
4696 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4697 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4698 /* Mark as simd-lane access. */
4699 tree arg2
= gimple_call_arg (def
, 1);
4700 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4702 datarefs
->safe_push (newdr
);
4704 dataref_groups
->safe_push (group_id
);
4705 return opt_result::success ();
4710 free_data_ref (newdr
);
4713 datarefs
->safe_push (dr
);
4715 dataref_groups
->safe_push (group_id
);
4716 return opt_result::success ();
/* Function vect_analyze_data_refs.

   Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.

*/
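/* As a concrete illustration of what one analyzed data reference carries
   (numbers assume a 4-byte int and are only an example): for the access
   a[i + 4] in a loop over i, dr_analyze_innermost records roughly

     DR_BASE_ADDRESS = &a      (loop-invariant base)
     DR_OFFSET       = 0
     DR_INIT         = 16      (constant byte offset, 4 * sizeof (int))
     DR_STEP         = 4       (bytes advanced per iteration)

   and the loop below mostly validates and refines these fields for every
   statement that touches memory.  */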
4735 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4737 class loop
*loop
= NULL
;
4739 struct data_reference
*dr
;
4742 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4744 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4745 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4747 /* Go through the data-refs, check that the analysis succeeded. Update
4748 pointer from stmt_vec_info struct to DR and vectype. */
4750 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4751 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4753 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4756 gcc_assert (DR_REF (dr
));
4757 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4758 gcc_assert (!stmt_info
->dr_aux
.dr
);
4759 stmt_info
->dr_aux
.dr
= dr
;
4760 stmt_info
->dr_aux
.stmt
= stmt_info
;
4762 /* Check that analysis of the data-ref succeeded. */
4763 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4768 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4771 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4773 /* If target supports vector gather loads or scatter stores,
see whether they can be used.  */
4775 if (is_a
<loop_vec_info
> (vinfo
)
4776 && !nested_in_vect_loop_p (loop
, stmt_info
))
4778 if (maybe_gather
|| maybe_scatter
)
4781 gatherscatter
= GATHER
;
4783 gatherscatter
= SCATTER
;
4787 if (gatherscatter
== SG_NONE
)
4789 if (dump_enabled_p ())
4790 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4791 "not vectorized: data ref analysis "
4792 "failed %G", stmt_info
->stmt
);
4793 if (is_a
<bb_vec_info
> (vinfo
))
4795 /* In BB vectorization the ref can still participate
4796 in dependence analysis, we just can't vectorize it. */
4797 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4800 return opt_result::failure_at (stmt_info
->stmt
,
4802 " data ref analysis failed: %G",
4807 /* See if this was detected as SIMD lane access. */
4808 if (dr
->aux
== (void *)-1
4809 || dr
->aux
== (void *)-2
4810 || dr
->aux
== (void *)-3
4811 || dr
->aux
== (void *)-4)
4813 if (nested_in_vect_loop_p (loop
, stmt_info
))
4814 return opt_result::failure_at (stmt_info
->stmt
,
4816 " data ref analysis failed: %G",
4818 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4819 = -(uintptr_t) dr
->aux
;
4822 tree base
= get_base_address (DR_REF (dr
));
4823 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4825 if (dump_enabled_p ())
4826 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4827 "not vectorized: base object not addressable "
4828 "for stmt: %G", stmt_info
->stmt
);
4829 if (is_a
<bb_vec_info
> (vinfo
))
4831 /* In BB vectorization the ref can still participate
4832 in dependence analysis, we just can't vectorize it. */
4833 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4836 return opt_result::failure_at (stmt_info
->stmt
,
4837 "not vectorized: base object not"
4838 " addressable for stmt: %G",
4842 if (is_a
<loop_vec_info
> (vinfo
)
4844 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4846 if (nested_in_vect_loop_p (loop
, stmt_info
))
4847 return opt_result::failure_at (stmt_info
->stmt
,
4849 "not suitable for strided load %G",
4851 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4854 /* Update DR field in stmt_vec_info struct. */
/* If the dataref is in an inner-loop of the loop that is considered
   for vectorization, we also want to analyze the access relative to
4858 the outer-loop (DR contains information only relative to the
4859 inner-most enclosing loop). We do that by building a reference to the
4860 first location accessed by the inner-loop, and analyze it relative to
4862 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4864 /* Build a reference to the first location accessed by the
4865 inner loop: *(BASE + INIT + OFFSET). By construction,
4866 this address must be invariant in the inner loop, so we
4867 can consider it as being used in the outer loop. */
4868 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4869 tree offset
= unshare_expr (DR_OFFSET (dr
));
4870 tree init
= unshare_expr (DR_INIT (dr
));
4871 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4873 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4874 tree init_ref
= build_fold_indirect_ref (init_addr
);
4876 if (dump_enabled_p ())
4877 dump_printf_loc (MSG_NOTE
, vect_location
,
4878 "analyze in outer loop: %T\n", init_ref
);
4881 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4882 init_ref
, loop
, stmt_info
->stmt
);
4884 /* dr_analyze_innermost already explained the failure. */
4887 if (dump_enabled_p ())
4888 dump_printf_loc (MSG_NOTE
, vect_location
,
4889 "\touter base_address: %T\n"
4890 "\touter offset from base address: %T\n"
4891 "\touter constant offset from base address: %T\n"
4892 "\touter step: %T\n"
4893 "\touter base alignment: %d\n\n"
4894 "\touter base misalignment: %d\n"
4895 "\touter offset alignment: %d\n"
4896 "\touter step alignment: %d\n",
4897 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4898 STMT_VINFO_DR_OFFSET (stmt_info
),
4899 STMT_VINFO_DR_INIT (stmt_info
),
4900 STMT_VINFO_DR_STEP (stmt_info
),
4901 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4902 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4903 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4904 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4907 /* Set vectype for STMT. */
4908 scalar_type
= TREE_TYPE (DR_REF (dr
));
4909 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4912 if (dump_enabled_p ())
4914 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4915 "not vectorized: no vectype for stmt: %G",
4917 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4918 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4920 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4923 if (is_a
<bb_vec_info
> (vinfo
))
4925 /* No vector type is fine, the ref can still participate
4926 in dependence analysis, we just can't vectorize it. */
4927 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4932 return opt_result::failure_at (stmt_info
->stmt
,
4934 " no vectype for stmt: %G"
4935 " scalar_type: %T\n",
4936 stmt_info
->stmt
, scalar_type
);
4940 if (dump_enabled_p ())
4941 dump_printf_loc (MSG_NOTE
, vect_location
,
4942 "got vectype for stmt: %G%T\n",
4943 stmt_info
->stmt
, vectype
);
4946 /* Adjust the minimal vectorization factor according to the
4948 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4949 *min_vf
= upper_bound (*min_vf
, vf
);
4951 /* Leave the BB vectorizer to pick the vector type later, based on
4952 the final dataref group size and SLP node size. */
4953 if (is_a
<loop_vec_info
> (vinfo
))
4954 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
4956 if (gatherscatter
!= SG_NONE
)
4958 gather_scatter_info gs_info
;
4959 if (!vect_check_gather_scatter (stmt_info
,
4960 as_a
<loop_vec_info
> (vinfo
),
4962 || !get_vectype_for_scalar_type (vinfo
,
4963 TREE_TYPE (gs_info
.offset
)))
4967 return opt_result::failure_at
4969 (gatherscatter
== GATHER
)
4970 ? "not vectorized: not suitable for gather load %G"
4971 : "not vectorized: not suitable for scatter store %G",
4974 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4978 /* We used to stop processing and prune the list here. Verify we no
4980 gcc_assert (i
== datarefs
.length ());
4982 return opt_result::success ();
4986 /* Function vect_get_new_vect_var.
4988 Returns a name for a new variable. The current naming scheme appends the
4989 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4990 the name of vectorizer generated variables, and appends that to NAME if
4994 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
5001 case vect_simple_var
:
5004 case vect_scalar_var
:
5010 case vect_pointer_var
:
5019 char* tmp
= concat (prefix
, "_", name
, NULL
);
5020 new_vect_var
= create_tmp_reg (type
, tmp
);
5024 new_vect_var
= create_tmp_reg (type
, prefix
);
5026 return new_vect_var
;
5029 /* Like vect_get_new_vect_var but return an SSA name. */
5032 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
5039 case vect_simple_var
:
5042 case vect_scalar_var
:
5045 case vect_pointer_var
:
5054 char* tmp
= concat (prefix
, "_", name
, NULL
);
5055 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
5059 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
5061 return new_vect_var
;
5064 /* Duplicate points-to info on NAME from DR_INFO. */
5067 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
5069 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
5070 /* DR_PTR_INFO is for a base SSA name, not including constant or
5071 variable offsets in the ref so its alignment info does not apply. */
5072 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
5075 /* Function vect_create_addr_base_for_vector_ref.
5077 Create an expression that computes the address of the first memory location
5078 that will be accessed for a data reference.
5081 STMT_INFO: The statement containing the data reference.
5082 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
OFFSET: Optional.  If supplied, it is added to the initial address.
5084 LOOP: Specify relative to which loop-nest should the address be computed.
5085 For example, when the dataref is in an inner-loop nested in an
5086 outer-loop that is now being vectorized, LOOP can be either the
5087 outer-loop, or the inner-loop. The first memory location accessed
5088 by the following dataref ('in' points to short):
5095 if LOOP=i_loop: &in (relative to i_loop)
5096 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5099 1. Return an SSA_NAME whose value is the address of the memory location of
5100 the first vector of the data reference.
5101 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5102 these statement(s) which define the returned SSA_NAME.
5104 FORNOW: We are only handling array accesses with step 1. */
5107 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5108 gimple_seq
*new_stmt_list
,
5111 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5112 struct data_reference
*dr
= dr_info
->dr
;
5113 const char *base_name
;
5116 gimple_seq seq
= NULL
;
5118 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5119 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
5121 tree data_ref_base
= unshare_expr (drb
->base_address
);
5122 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
5123 tree init
= unshare_expr (drb
->init
);
5126 base_name
= get_name (data_ref_base
);
5129 base_offset
= ssize_int (0);
5130 init
= ssize_int (0);
5131 base_name
= get_name (DR_REF (dr
));
5134 /* Create base_offset */
5135 base_offset
= size_binop (PLUS_EXPR
,
5136 fold_convert (sizetype
, base_offset
),
5137 fold_convert (sizetype
, init
));
5141 offset
= fold_convert (sizetype
, offset
);
5142 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
5143 base_offset
, offset
);
5146 /* base + base_offset */
5148 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
5150 addr_base
= build1 (ADDR_EXPR
,
5151 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
5152 /* Strip zero offset components since we don't need
5153 them and they can confuse late diagnostics if
5154 we CSE them wrongly. See PR106904 for example. */
5155 unshare_expr (strip_zero_offset_components
5158 vect_ptr_type
= build_pointer_type (TREE_TYPE (DR_REF (dr
)));
5159 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
5160 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
5161 gimple_seq_add_seq (new_stmt_list
, seq
);
5163 if (DR_PTR_INFO (dr
)
5164 && TREE_CODE (addr_base
) == SSA_NAME
5165 /* We should only duplicate pointer info to newly created SSA names. */
5166 && SSA_NAME_VAR (addr_base
) == dest
)
5168 gcc_assert (!SSA_NAME_PTR_INFO (addr_base
));
5169 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
5172 if (dump_enabled_p ())
5173 dump_printf_loc (MSG_NOTE
, vect_location
, "created %T\n", addr_base
);
5179 /* Function vect_create_data_ref_ptr.
5181 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5182 location accessed in the loop by STMT_INFO, along with the def-use update
5183 chain to appropriately advance the pointer through the loop iterations.
5184 Also set aliasing information for the pointer. This pointer is used by
5185 the callers to this function to create a memory reference expression for
5186 vector load/store access.
5189 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5190 GIMPLE_ASSIGN <name, data-ref> or
5191 GIMPLE_ASSIGN <data-ref, name>.
5192 2. AGGR_TYPE: the type of the reference, which should be either a vector
5194 3. AT_LOOP: the loop where the vector memref is to be created.
5195 4. OFFSET (optional): a byte offset to be added to the initial address
5196 accessed by the data-ref in STMT_INFO.
5197 5. BSI: location where the new stmts are to be placed if there is no loop
5198 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5199 pointing to the initial address.
5200 8. IV_STEP (optional, defaults to NULL): the amount that should be added
5201 to the IV during each iteration of the loop. NULL says to move
5202 by one copy of AGGR_TYPE up or down, depending on the step of the
5206 1. Declare a new ptr to vector_type, and have it point to the base of the
data reference (initial address accessed by the data reference).
5208 For example, for vector of type V8HI, the following code is generated:
5211 ap = (v8hi *)initial_address;
5213 if OFFSET is not supplied:
5214 initial_address = &a[init];
5215 if OFFSET is supplied:
5216 initial_address = &a[init] + OFFSET;
5217 if BYTE_OFFSET is supplied:
5218 initial_address = &a[init] + BYTE_OFFSET;
5220 Return the initial_address in INITIAL_ADDRESS.
5222 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5223 update the pointer in each iteration of the loop.
5225 Return the increment stmt that updates the pointer in PTR_INCR.
5227 3. Return the pointer. */
5230 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5231 tree aggr_type
, class loop
*at_loop
, tree offset
,
5232 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
5233 gimple
**ptr_incr
, bool only_init
,
5236 const char *base_name
;
5237 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5238 class loop
*loop
= NULL
;
5239 bool nested_in_vect_loop
= false;
5240 class loop
*containing_loop
= NULL
;
5244 gimple_seq new_stmt_list
= NULL
;
5248 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5249 struct data_reference
*dr
= dr_info
->dr
;
5251 gimple_stmt_iterator incr_gsi
;
5253 tree indx_before_incr
, indx_after_incr
;
5255 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
5257 gcc_assert (iv_step
!= NULL_TREE
5258 || TREE_CODE (aggr_type
) == ARRAY_TYPE
5259 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
5263 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5264 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5265 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5266 pe
= loop_preheader_edge (loop
);
5270 gcc_assert (bb_vinfo
);
5275 /* Create an expression for the first address accessed by this load
5277 base_name
= get_name (DR_BASE_ADDRESS (dr
));
5279 if (dump_enabled_p ())
5281 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
5282 dump_printf_loc (MSG_NOTE
, vect_location
,
5283 "create %s-pointer variable to type: %T",
5284 get_tree_code_name (TREE_CODE (aggr_type
)),
5286 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
5287 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
5288 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
5289 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
5290 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
5291 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
5293 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
5294 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
5297 /* (1) Create the new aggregate-pointer variable.
5298 Vector and array types inherit the alias set of their component
5299 type by default so we need to use a ref-all pointer if the data
5300 reference does not conflict with the created aggregated data
5301 reference because it is not addressable. */
5302 bool need_ref_all
= false;
5303 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5304 get_alias_set (DR_REF (dr
))))
5305 need_ref_all
= true;
5306 /* Likewise for any of the data references in the stmt group. */
5307 else if (DR_GROUP_SIZE (stmt_info
) > 1)
5309 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5312 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
5313 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5314 get_alias_set (DR_REF (sdr
))))
5316 need_ref_all
= true;
5319 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
5323 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
5325 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
5328 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5329 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5330 def-use update cycles for the pointer: one relative to the outer-loop
5331 (LOOP), which is what steps (3) and (4) below do. The other is relative
5332 to the inner-loop (which is the inner-most loop containing the dataref),
and this is done by step (5) below.
5335 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5336 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5337 redundant. Steps (3),(4) create the following:
5340 LOOP: vp1 = phi(vp0,vp2)
5346 If there is an inner-loop nested in loop, then step (5) will also be
5347 applied, and an additional update in the inner-loop will be created:
5350 LOOP: vp1 = phi(vp0,vp2)
5352 inner: vp3 = phi(vp1,vp4)
5353 vp4 = vp3 + inner_step
5359 /* (2) Calculate the initial address of the aggregate-pointer, and set
5360 the aggregate-pointer to point to it before the loop. */
5362 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5364 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
5365 stmt_info
, &new_stmt_list
,
5371 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
5372 gcc_assert (!new_bb
);
5375 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
5378 *initial_address
= new_temp
;
5379 aggr_ptr_init
= new_temp
;
5381 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5382 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5383 inner-loop nested in LOOP (during outer-loop vectorization). */
5385 /* No update in loop is required. */
5386 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
5387 aptr
= aggr_ptr_init
;
5390 /* Accesses to invariant addresses should be handled specially
5392 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
5393 gcc_assert (!integer_zerop (step
));
5395 if (iv_step
== NULL_TREE
)
5397 /* The step of the aggregate pointer is the type size,
5398 negated for downward accesses. */
5399 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
5400 if (tree_int_cst_sgn (step
) == -1)
5401 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
5404 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5406 create_iv (aggr_ptr_init
, PLUS_EXPR
,
5407 fold_convert (aggr_ptr_type
, iv_step
),
5408 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
5409 &indx_before_incr
, &indx_after_incr
);
5410 incr
= gsi_stmt (incr_gsi
);
5412 /* Copy the points-to information if it exists. */
5413 if (DR_PTR_INFO (dr
))
5415 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5416 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5421 aptr
= indx_before_incr
;
5424 if (!nested_in_vect_loop
|| only_init
)
5428 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5429 nested in LOOP, if exists. */
5431 gcc_assert (nested_in_vect_loop
);
5434 standard_iv_increment_position (containing_loop
, &incr_gsi
,
5436 create_iv (aptr
, PLUS_EXPR
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)),
5437 aggr_ptr
, containing_loop
, &incr_gsi
, insert_after
,
5438 &indx_before_incr
, &indx_after_incr
);
5439 incr
= gsi_stmt (incr_gsi
);
5441 /* Copy the points-to information if it exists. */
5442 if (DR_PTR_INFO (dr
))
5444 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5445 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5450 return indx_before_incr
;
5457 /* Function bump_vector_ptr
5459 Increment a pointer (to a vector type) by vector-size. If requested,
5460 i.e. if PTR-INCR is given, then also connect the new increment stmt
5461 to the existing def-use update-chain of the pointer, by modifying
5462 the PTR_INCR as illustrated below:
5464 The pointer def-use update-chain before this function:
5465 DATAREF_PTR = phi (p_0, p_2)
5467 PTR_INCR: p_2 = DATAREF_PTR + step
5469 The pointer def-use update-chain after this function:
5470 DATAREF_PTR = phi (p_0, p_2)
5472 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5474 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5477 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5479 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5480 the loop. The increment amount across iterations is expected
5482 BSI - location where the new update stmt is to be placed.
5483 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5484 BUMP - optional. The offset by which to bump the pointer. If not given,
5485 the offset is assumed to be vector_size.
5487 Output: Return NEW_DATAREF_PTR as illustrated above.
5492 bump_vector_ptr (vec_info
*vinfo
,
5493 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
5494 stmt_vec_info stmt_info
, tree bump
)
5496 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5497 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5498 tree update
= TYPE_SIZE_UNIT (vectype
);
5501 use_operand_p use_p
;
5502 tree new_dataref_ptr
;
5507 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5508 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5509 else if (is_gimple_min_invariant (dataref_ptr
))
/* When possible avoid emitting a separate increment stmt that would
   force the addressed object to become addressable.  */
5512 return build1 (ADDR_EXPR
, TREE_TYPE (dataref_ptr
),
5513 fold_build2 (MEM_REF
,
5514 TREE_TYPE (TREE_TYPE (dataref_ptr
)),
5516 fold_convert (ptr_type_node
, update
)));
5518 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
5519 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5520 dataref_ptr
, update
);
5521 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
/* Fold the increment, avoiding excessive use-def chains of
5523 those, leading to compile-time issues for passes until the next
5524 forwprop pass which would do this as well. */
5525 gimple_stmt_iterator fold_gsi
= gsi_for_stmt (incr_stmt
);
5526 if (fold_stmt (&fold_gsi
, follow_all_ssa_edges
))
5528 incr_stmt
= gsi_stmt (fold_gsi
);
5529 update_stmt (incr_stmt
);
5532 /* Copy the points-to information if it exists. */
5533 if (DR_PTR_INFO (dr
))
5535 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5536 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5540 return new_dataref_ptr
;
5542 /* Update the vector-pointer's cross-iteration increment. */
5543 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5545 tree use
= USE_FROM_PTR (use_p
);
5547 if (use
== dataref_ptr
)
5548 SET_USE (use_p
, new_dataref_ptr
);
5550 gcc_assert (operand_equal_p (use
, update
, 0));
5553 return new_dataref_ptr
;
/* Copy memory reference info such as base/clique from the SRC reference
   to the DEST MEM_REF.  */

void
vect_copy_ref_info (tree dest, tree src)
{
  if (TREE_CODE (dest) != MEM_REF)
    return;

  tree src_base = src;
  while (handled_component_p (src_base))
    src_base = TREE_OPERAND (src_base, 0);
  if (TREE_CODE (src_base) != MEM_REF
      && TREE_CODE (src_base) != TARGET_MEM_REF)
    return;

  MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
  MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
}
5578 /* Function vect_create_destination_var.
5580 Create a new temporary of type VECTYPE. */
5583 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5589 enum vect_var_kind kind
;
5592 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5596 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5598 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5600 name
= get_name (scalar_dest
);
5602 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5604 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5605 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5611 /* Function vect_grouped_store_supported.
5613 Returns TRUE if interleave high and interleave low permutations
5614 are supported, and FALSE otherwise. */
5617 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5619 machine_mode mode
= TYPE_MODE (vectype
);
5621 /* vect_permute_store_chain requires the group size to be equal to 3 or
5622 be a power of two. */
5623 if (count
!= 3 && exact_log2 (count
) == -1)
5625 if (dump_enabled_p ())
5626 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5627 "the size of the group of accesses"
5628 " is not a power of 2 or not eqaul to 3\n");
5632 /* Check that the permutation is supported. */
5633 if (VECTOR_MODE_P (mode
))
5638 unsigned int j0
= 0, j1
= 0, j2
= 0;
5642 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5644 if (dump_enabled_p ())
5645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5646 "cannot handle groups of 3 stores for"
5647 " variable-length vectors\n");
5651 vec_perm_builder
sel (nelt
, nelt
, 1);
5652 sel
.quick_grow (nelt
);
5653 vec_perm_indices indices
;
5654 for (j
= 0; j
< 3; j
++)
5656 int nelt0
= ((3 - j
) * nelt
) % 3;
5657 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5658 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5659 for (i
= 0; i
< nelt
; i
++)
5661 if (3 * i
+ nelt0
< nelt
)
5662 sel
[3 * i
+ nelt0
] = j0
++;
5663 if (3 * i
+ nelt1
< nelt
)
5664 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5665 if (3 * i
+ nelt2
< nelt
)
5666 sel
[3 * i
+ nelt2
] = 0;
5668 indices
.new_vector (sel
, 2, nelt
);
5669 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5671 if (dump_enabled_p ())
5672 dump_printf (MSG_MISSED_OPTIMIZATION
,
5673 "permutation op not supported by target.\n");
5677 for (i
= 0; i
< nelt
; i
++)
5679 if (3 * i
+ nelt0
< nelt
)
5680 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5681 if (3 * i
+ nelt1
< nelt
)
5682 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5683 if (3 * i
+ nelt2
< nelt
)
5684 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5686 indices
.new_vector (sel
, 2, nelt
);
5687 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5689 if (dump_enabled_p ())
5690 dump_printf (MSG_MISSED_OPTIMIZATION
,
5691 "permutation op not supported by target.\n");
5699 /* If length is not equal to 3 then only power of 2 is supported. */
5700 gcc_assert (pow2p_hwi (count
));
5701 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5703 /* The encoding has 2 interleaved stepped patterns. */
5704 if(!multiple_p (nelt
, 2))
5706 vec_perm_builder
sel (nelt
, 2, 3);
5708 for (i
= 0; i
< 3; i
++)
5711 sel
[i
* 2 + 1] = i
+ nelt
;
5713 vec_perm_indices
indices (sel
, 2, nelt
);
5714 if (can_vec_perm_const_p (mode
, mode
, indices
))
5716 for (i
= 0; i
< 6; i
++)
5717 sel
[i
] += exact_div (nelt
, 2);
5718 indices
.new_vector (sel
, 2, nelt
);
5719 if (can_vec_perm_const_p (mode
, mode
, indices
))
5725 if (dump_enabled_p ())
5726 dump_printf (MSG_MISSED_OPTIMIZATION
,
5727 "permutation op not supported by target.\n");
5731 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5732 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5735 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5738 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5739 vec_mask_len_store_lanes_optab
, vectype
,
5741 return IFN_MASK_LEN_STORE_LANES
;
5744 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5745 vec_mask_store_lanes_optab
, vectype
,
5747 return IFN_MASK_STORE_LANES
;
5751 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5752 vec_store_lanes_optab
, vectype
, count
))
5753 return IFN_STORE_LANES
;
5759 /* Function vect_permute_store_chain.
5761 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5762 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5763 the data correctly for the stores. Return the final references for stores
5766 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5767 The input is 4 vectors each containing 8 elements. We assign a number to
5768 each element, the input sequence is:
5770 1st vec: 0 1 2 3 4 5 6 7
5771 2nd vec: 8 9 10 11 12 13 14 15
5772 3rd vec: 16 17 18 19 20 21 22 23
5773 4th vec: 24 25 26 27 28 29 30 31
5775 The output sequence should be:
5777 1st vec: 0 8 16 24 1 9 17 25
5778 2nd vec: 2 10 18 26 3 11 19 27
5779 3rd vec: 4 12 20 28 5 13 21 29
5780 4th vec: 6 14 22 30 7 15 23 31
5782 i.e., we interleave the contents of the four vectors in their order.
5784 We use interleave_high/low instructions to create such output. The input of
5785 each interleave_high/low operation is two vectors:
5788 the even elements of the result vector are obtained left-to-right from the
5789 high/low elements of the first vector. The odd elements of the result are
5790 obtained left-to-right from the high/low elements of the second vector.
5791 The output of interleave_high will be: 0 4 1 5
5792 and of interleave_low: 2 6 3 7
5795 The permutation is done in log LENGTH stages. In each stage interleave_high
5796 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5797 where the first argument is taken from the first half of DR_CHAIN and the
5798 second argument from its second half.
5801 I1: interleave_high (1st vec, 3rd vec)
5802 I2: interleave_low (1st vec, 3rd vec)
5803 I3: interleave_high (2nd vec, 4th vec)
5804 I4: interleave_low (2nd vec, 4th vec)
5806 The output for the first stage is:
5808 I1: 0 16 1 17 2 18 3 19
5809 I2: 4 20 5 21 6 22 7 23
5810 I3: 8 24 9 25 10 26 11 27
5811 I4: 12 28 13 29 14 30 15 31
5813 The output of the second stage, i.e. the final result is:
5815 I1: 0 8 16 24 1 9 17 25
5816 I2: 2 10 18 26 3 11 19 27
5817 I3: 4 12 20 28 5 13 21 29
5818 I4: 6 14 22 30 7 15 23 31. */
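
/* Concrete mask sketch (assuming an 8-element vector, i.e. nelt == 8): the
   two-vector interleave masks built below are

       perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
       perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   so VEC_PERM_EXPR <v1, v2, perm_mask_high> pairs up the first halves of V1
   and V2 element by element, and perm_mask_low does the same for the second
   halves, which is exactly the interleave_high/low step described above.  */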
5821 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> &dr_chain
,
5822 unsigned int length
,
5823 stmt_vec_info stmt_info
,
5824 gimple_stmt_iterator
*gsi
,
5825 vec
<tree
> *result_chain
)
5827 tree vect1
, vect2
, high
, low
;
5829 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5830 tree perm_mask_low
, perm_mask_high
;
5832 tree perm3_mask_low
, perm3_mask_high
;
5833 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5835 result_chain
->quick_grow (length
);
5836 memcpy (result_chain
->address (), dr_chain
.address (),
5837 length
* sizeof (tree
));
5841 /* vect_grouped_store_supported ensures that this is constant. */
5842 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5843 unsigned int j0
= 0, j1
= 0, j2
= 0;
5845 vec_perm_builder
sel (nelt
, nelt
, 1);
5846 sel
.quick_grow (nelt
);
5847 vec_perm_indices indices
;
5848 for (j
= 0; j
< 3; j
++)
5850 int nelt0
= ((3 - j
) * nelt
) % 3;
5851 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5852 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5854 for (i
= 0; i
< nelt
; i
++)
5856 if (3 * i
+ nelt0
< nelt
)
5857 sel
[3 * i
+ nelt0
] = j0
++;
5858 if (3 * i
+ nelt1
< nelt
)
5859 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5860 if (3 * i
+ nelt2
< nelt
)
5861 sel
[3 * i
+ nelt2
] = 0;
5863 indices
.new_vector (sel
, 2, nelt
);
5864 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5866 for (i
= 0; i
< nelt
; i
++)
5868 if (3 * i
+ nelt0
< nelt
)
5869 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5870 if (3 * i
+ nelt1
< nelt
)
5871 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5872 if (3 * i
+ nelt2
< nelt
)
5873 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5875 indices
.new_vector (sel
, 2, nelt
);
5876 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5878 vect1
= dr_chain
[0];
5879 vect2
= dr_chain
[1];
5881 /* Create interleaving stmt:
5882 low = VEC_PERM_EXPR <vect1, vect2,
5883 {j, nelt, *, j + 1, nelt + j + 1, *,
5884 j + 2, nelt + j + 2, *, ...}> */
5885 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5886 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5887 vect2
, perm3_mask_low
);
5888 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5891 vect2
= dr_chain
[2];
5892 /* Create interleaving stmt:
5893 low = VEC_PERM_EXPR <vect1, vect2,
5894 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5895 6, 7, nelt + j + 2, ...}> */
5896 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5897 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5898 vect2
, perm3_mask_high
);
5899 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5900 (*result_chain
)[j
] = data_ref
;
5905 /* If length is not equal to 3 then only power of 2 is supported. */
5906 gcc_assert (pow2p_hwi (length
));
5908 /* The encoding has 2 interleaved stepped patterns. */
5909 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5910 vec_perm_builder
sel (nelt
, 2, 3);
5912 for (i
= 0; i
< 3; i
++)
5915 sel
[i
* 2 + 1] = i
+ nelt
;
5917 vec_perm_indices
indices (sel
, 2, nelt
);
5918 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5920 for (i
= 0; i
< 6; i
++)
5921 sel
[i
] += exact_div (nelt
, 2);
5922 indices
.new_vector (sel
, 2, nelt
);
5923 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5925 for (i
= 0, n
= log_length
; i
< n
; i
++)
5927 for (j
= 0; j
< length
/2; j
++)
5929 vect1
= dr_chain
[j
];
5930 vect2
= dr_chain
[j
+length
/2];
5932 /* Create interleaving stmt:
5933 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5935 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5936 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5937 vect2
, perm_mask_high
);
5938 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5939 (*result_chain
)[2*j
] = high
;
5941 /* Create interleaving stmt:
5942 low = VEC_PERM_EXPR <vect1, vect2,
5943 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5945 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5946 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5947 vect2
, perm_mask_low
);
5948 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5949 (*result_chain
)[2*j
+1] = low
;
5951 memcpy (dr_chain
.address (), result_chain
->address (),
5952 length
* sizeof (tree
));
5957 /* Function vect_setup_realignment
5959 This function is called when vectorizing an unaligned load using
5960 the dr_explicit_realign[_optimized] scheme.
5961 This function generates the following code at the loop prolog:
5964 x msq_init = *(floor(p)); # prolog load
5965 realignment_token = call target_builtin;
5967 x msq = phi (msq_init, ---)
5969 The stmts marked with x are generated only for the case of
5970 dr_explicit_realign_optimized.
5972 The code above sets up a new (vector) pointer, pointing to the first
5973 location accessed by STMT_INFO, and a "floor-aligned" load using that
5974 pointer. It also generates code to compute the "realignment-token"
5975 (if the relevant target hook was defined), and creates a phi-node at the
5976 loop-header bb whose arguments are the result of the prolog-load (created
5977 by this function) and the result of a load that takes place in the loop
5978 (to be created by the caller to this function).
5980 For the case of dr_explicit_realign_optimized:
5981 The caller to this function uses the phi-result (msq) to create the
5982 realignment code inside the loop, and sets up the missing phi argument,
5985 msq = phi (msq_init, lsq)
5986 lsq = *(floor(p')); # load in loop
5987 result = realign_load (msq, lsq, realignment_token);
5989 For the case of dr_explicit_realign:
5991 msq = *(floor(p)); # load in loop
5993 lsq = *(floor(p')); # load in loop
5994 result = realign_load (msq, lsq, realignment_token);
5997 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5998 a memory location that may be unaligned.
5999 BSI - place where new code is to be inserted.
6000 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6004 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6005 target hook, if defined.
6006 Return value - the result of the loop-header phi node. */
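
/* Worked example (a sketch of the usual target semantics, e.g. the Altivec
   lvsl/vperm style of realignment): suppose P is misaligned by 3 bytes within
   a 16-byte V16QI vector.  Then

       msq = *floor(p);          # bytes [p-3, p+12]
       lsq = *(floor(p) + 16);   # bytes [p+13, p+28]
       realign_load (msq, lsq, realignment_token)

   selects the 16 bytes starting at offset 3 of the MSQ/LSQ concatenation,
   i.e. exactly the bytes [p, p+15] that the original unaligned load wanted.  */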
6009 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6010 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
6011 enum dr_alignment_support alignment_support_scheme
,
6013 class loop
**at_loop
)
6015 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6016 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6017 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
6018 struct data_reference
*dr
= dr_info
->dr
;
6019 class loop
*loop
= NULL
;
6021 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
6027 tree msq_init
= NULL_TREE
;
6030 tree msq
= NULL_TREE
;
6031 gimple_seq stmts
= NULL
;
6032 bool compute_in_loop
= false;
6033 bool nested_in_vect_loop
= false;
6034 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
6035 class loop
*loop_for_initial_load
= NULL
;
6039 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6040 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
6043 gcc_assert (alignment_support_scheme
== dr_explicit_realign
6044 || alignment_support_scheme
== dr_explicit_realign_optimized
);
6046 /* We need to generate three things:
6047 1. the misalignment computation
6048 2. the extra vector load (for the optimized realignment scheme).
6049 3. the phi node for the two vectors from which the realignment is
6050 done (for the optimized realignment scheme). */
6052 /* 1. Determine where to generate the misalignment computation.
6054 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6055 calculation will be generated by this function, outside the loop (in the
6056 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6057 caller, inside the loop.
6059 Background: If the misalignment remains fixed throughout the iterations of
6060 the loop, then both realignment schemes are applicable, and also the
6061 misalignment computation can be done outside LOOP. This is because we are
6062 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6063 are a multiple of VS (the Vector Size), and therefore the misalignment in
6064 different vectorized LOOP iterations is always the same.
6065 The problem arises only if the memory access is in an inner-loop nested
6066 inside LOOP, which is now being vectorized using outer-loop vectorization.
6067 This is the only case when the misalignment of the memory access may not
6068 remain fixed throughout the iterations of the inner-loop (as explained in
6069 detail in vect_supportable_dr_alignment). In this case, not only is the
6070 optimized realignment scheme not applicable, but also the misalignment
6071 computation (and generation of the realignment token that is passed to
6072 REALIGN_LOAD) have to be done inside the loop.
6074 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6075 or not, which in turn determines if the misalignment is computed inside
6076 the inner-loop, or outside LOOP. */
6078 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
6080 compute_in_loop
= true;
6081 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
6085 /* 2. Determine where to generate the extra vector load.
6087 For the optimized realignment scheme, instead of generating two vector
6088 loads in each iteration, we generate a single extra vector load in the
6089 preheader of the loop, and in each iteration reuse the result of the
6090 vector load from the previous iteration. In case the memory access is in
6091 an inner-loop nested inside LOOP, which is now being vectorized using
6092 outer-loop vectorization, we need to determine whether this initial vector
6093 load should be generated at the preheader of the inner-loop, or can be
6094 generated at the preheader of LOOP. If the memory access has no evolution
6095 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6096 to be generated inside LOOP (in the preheader of the inner-loop). */
6098 if (nested_in_vect_loop
)
6100 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
6101 bool invariant_in_outerloop
=
6102 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
6103 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
6106 loop_for_initial_load
= loop
;
6108 *at_loop
= loop_for_initial_load
;
6110 tree vuse
= NULL_TREE
;
6111 if (loop_for_initial_load
)
6113 pe
= loop_preheader_edge (loop_for_initial_load
);
6114 if (gphi
*vphi
= get_virtual_phi (loop_for_initial_load
->header
))
6115 vuse
= PHI_ARG_DEF_FROM_EDGE (vphi
, pe
);
6118 vuse
= gimple_vuse (gsi_stmt (*gsi
));
6120 /* 3. For the case of the optimized realignment, create the first vector
6121 load at the loop preheader. */
6123 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
6125 /* Create msq_init = *(floor(p1)) in the loop preheader */
6128 gcc_assert (!compute_in_loop
);
6129 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
6130 ptr
= vect_create_data_ref_ptr (vinfo
, stmt_info
, vectype
,
6131 loop_for_initial_load
, NULL_TREE
,
6132 &init_addr
, NULL
, &inc
, true);
6133 if (TREE_CODE (ptr
) == SSA_NAME
)
6134 new_temp
= copy_ssa_name (ptr
);
6136 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
6137 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
6138 tree type
= TREE_TYPE (ptr
);
6139 new_stmt
= gimple_build_assign
6140 (new_temp
, BIT_AND_EXPR
, ptr
,
6141 fold_build2 (MINUS_EXPR
, type
,
6142 build_int_cst (type
, 0),
6143 build_int_cst (type
, align
)));
6144 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6145 gcc_assert (!new_bb
);
6147 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
6148 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
6149 vect_copy_ref_info (data_ref
, DR_REF (dr
));
6150 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
6151 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6152 gimple_assign_set_lhs (new_stmt
, new_temp
);
6153 gimple_set_vuse (new_stmt
, vuse
);
6156 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6157 gcc_assert (!new_bb
);
6160 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6162 msq_init
= gimple_assign_lhs (new_stmt
);
6165 /* 4. Create realignment token using a target builtin, if available.
6166 It is done either inside the containing loop, or before LOOP (as
6167 determined above). */
6169 if (targetm
.vectorize
.builtin_mask_for_load
)
6174 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
6177 /* Generate the INIT_ADDR computation outside LOOP. */
6178 init_addr
= vect_create_addr_base_for_vector_ref (vinfo
,
6183 pe
= loop_preheader_edge (loop
);
6184 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
6185 gcc_assert (!new_bb
);
6188 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
6191 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
6192 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
6194 vect_create_destination_var (scalar_dest
,
6195 gimple_call_return_type (new_stmt
));
6196 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6197 gimple_call_set_lhs (new_stmt
, new_temp
);
6199 if (compute_in_loop
)
6200 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6203 /* Generate the misalignment computation outside LOOP. */
6204 pe
= loop_preheader_edge (loop
);
6205 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6206 gcc_assert (!new_bb
);
6209 *realignment_token
= gimple_call_lhs (new_stmt
);
6211 /* The result of the CALL_EXPR to this builtin is determined from
6212 the value of the parameter and no global variables are touched
6213 which makes the builtin a "const" function. Requiring the
6214 builtin to have the "const" attribute makes it unnecessary
6215 to call mark_call_clobbered. */
6216 gcc_assert (TREE_READONLY (builtin_decl
));
6219 if (alignment_support_scheme
== dr_explicit_realign
)
6222 gcc_assert (!compute_in_loop
);
6223 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
6226 /* 5. Create msq = phi <msq_init, lsq> in loop */
6228 pe
= loop_preheader_edge (containing_loop
);
6229 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
6230 msq
= make_ssa_name (vec_dest
);
6231 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
6232 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
6238 /* Function vect_grouped_load_supported.
6240 COUNT is the size of the load group (the number of statements plus the
6241 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6242 only one statement, with a gap of COUNT - 1.
6244 Returns true if a suitable permute exists. */
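
/* Concrete mask sketch for the COUNT == 3 case (assuming nelt == 8 and the
   k == 0 iteration): the first permutation built below is

       { 0, 3, 6, 9, 12, 15, 0, 0 }     # trailing lanes are placeholders

   which gathers every third element from the first two input vectors, and the
   second permutation

       { 0, 1, 2, 3, 4, 5, 10, 13 }

   keeps those six lanes and fills the last two from the third input vector,
   yielding the fully de-interleaved vector { 0, 3, 6, 9, 12, 15, 18, 21 }.  */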
6247 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
6248 unsigned HOST_WIDE_INT count
)
6250 machine_mode mode
= TYPE_MODE (vectype
);
6252 /* If this is single-element interleaving with an element distance
6253 that leaves unused vector loads around, punt - we at least create
6254 very sub-optimal code in that case (and blow up memory,
6256 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
6258 if (dump_enabled_p ())
6259 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6260 "single-element interleaving not supported "
6261 "for not adjacent vector loads\n");
6265 /* vect_permute_load_chain requires the group size to be equal to 3 or
6266 be a power of two. */
6267 if (count
!= 3 && exact_log2 (count
) == -1)
6269 if (dump_enabled_p ())
6270 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6271 "the size of the group of accesses"
6272 " is not a power of 2 or not equal to 3\n");
6276 /* Check that the permutation is supported. */
6277 if (VECTOR_MODE_P (mode
))
6283 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
6285 if (dump_enabled_p ())
6286 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6287 "cannot handle groups of 3 loads for"
6288 " variable-length vectors\n");
6292 vec_perm_builder
sel (nelt
, nelt
, 1);
6293 sel
.quick_grow (nelt
);
6294 vec_perm_indices indices
;
6296 for (k
= 0; k
< 3; k
++)
6298 for (i
= 0; i
< nelt
; i
++)
6299 if (3 * i
+ k
< 2 * nelt
)
6303 indices
.new_vector (sel
, 2, nelt
);
6304 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6306 if (dump_enabled_p ())
6307 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6308 "shuffle of 3 loads is not supported by"
6312 for (i
= 0, j
= 0; i
< nelt
; i
++)
6313 if (3 * i
+ k
< 2 * nelt
)
6316 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6317 indices
.new_vector (sel
, 2, nelt
);
6318 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6320 if (dump_enabled_p ())
6321 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6322 "shuffle of 3 loads is not supported by"
6331 /* If length is not equal to 3 then only power of 2 is supported. */
6332 gcc_assert (pow2p_hwi (count
));
6333 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
6335 /* The encoding has a single stepped pattern. */
6336 vec_perm_builder
sel (nelt
, 1, 3);
6338 for (i
= 0; i
< 3; i
++)
6340 vec_perm_indices
indices (sel
, 2, nelt
);
6341 if (can_vec_perm_const_p (mode
, mode
, indices
))
6343 for (i
= 0; i
< 3; i
++)
6345 indices
.new_vector (sel
, 2, nelt
);
6346 if (can_vec_perm_const_p (mode
, mode
, indices
))
6352 if (dump_enabled_p ())
6353 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6354 "extract even/odd not supported by target\n");
6358 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6359 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6362 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
6365 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6366 vec_mask_len_load_lanes_optab
, vectype
,
6368 return IFN_MASK_LEN_LOAD_LANES
;
6371 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6372 vec_mask_load_lanes_optab
, vectype
,
6374 return IFN_MASK_LOAD_LANES
;
6378 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab
,
6380 return IFN_LOAD_LANES
;
6385 /* Function vect_permute_load_chain.
6387 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6388 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6389 the input data correctly. Return the final references for loads in
6392 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6393 The input is 4 vectors each containing 8 elements. We assign a number to each
6394 element, the input sequence is:
6396 1st vec: 0 1 2 3 4 5 6 7
6397 2nd vec: 8 9 10 11 12 13 14 15
6398 3rd vec: 16 17 18 19 20 21 22 23
6399 4th vec: 24 25 26 27 28 29 30 31
6401 The output sequence should be:
6403 1st vec: 0 4 8 12 16 20 24 28
6404 2nd vec: 1 5 9 13 17 21 25 29
6405 3rd vec: 2 6 10 14 18 22 26 30
6406 4th vec: 3 7 11 15 19 23 27 31
6408 i.e., the first output vector should contain the first elements of each
6409 interleaving group, etc.
6411 We use extract_even/odd instructions to create such output. The input of
6412 each extract_even/odd operation is two vectors
6416 and the output is the vector of extracted even/odd elements. The output of
6417 extract_even will be: 0 2 4 6
6418 and of extract_odd: 1 3 5 7
6421 The permutation is done in log LENGTH stages. In each stage extract_even
6422 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6423 their order. In our example,
6425 E1: extract_even (1st vec, 2nd vec)
6426 E2: extract_odd (1st vec, 2nd vec)
6427 E3: extract_even (3rd vec, 4th vec)
6428 E4: extract_odd (3rd vec, 4th vec)
6430 The output for the first stage will be:
6432 E1: 0 2 4 6 8 10 12 14
6433 E2: 1 3 5 7 9 11 13 15
6434 E3: 16 18 20 22 24 26 28 30
6435 E4: 17 19 21 23 25 27 29 31
6437 In order to proceed and create the correct sequence for the next stage (or
6438 for the correct output, if the second stage is the last one, as in our
6439 example), we first put the output of extract_even operation and then the
6440 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6441 The input for the second stage is:
6443 1st vec (E1): 0 2 4 6 8 10 12 14
6444 2nd vec (E3): 16 18 20 22 24 26 28 30
6445 3rd vec (E2): 1 3 5 7 9 11 13 15
6446 4th vec (E4): 17 19 21 23 25 27 29 31
6448 The output of the second stage:
6450 E1: 0 4 8 12 16 20 24 28
6451 E2: 2 6 10 14 18 22 26 30
6452 E3: 1 5 9 13 17 21 25 29
6453 E4: 3 7 11 15 19 23 27 31
6455 And RESULT_CHAIN after reordering:
6457 1st vec (E1): 0 4 8 12 16 20 24 28
6458 2nd vec (E3): 1 5 9 13 17 21 25 29
6459 3rd vec (E2): 2 6 10 14 18 22 26 30
6460 4th vec (E4): 3 7 11 15 19 23 27 31. */
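
/* Concrete mask sketch (assuming nelt == 8): the extract-even/odd masks built
   below are

       perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 }
       perm_mask_odd  = { 1, 3, 5, 7, 9, 11, 13, 15 }

   so each VEC_PERM_EXPR <v1, v2, mask> picks alternating elements from the
   concatenation of its two inputs, i.e. the extract_even/extract_odd step
   described above.  */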
6463 vect_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6464 unsigned int length
,
6465 stmt_vec_info stmt_info
,
6466 gimple_stmt_iterator
*gsi
,
6467 vec
<tree
> *result_chain
)
6469 tree data_ref
, first_vect
, second_vect
;
6470 tree perm_mask_even
, perm_mask_odd
;
6471 tree perm3_mask_low
, perm3_mask_high
;
6473 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6474 unsigned int i
, j
, log_length
= exact_log2 (length
);
6476 result_chain
->quick_grow (length
);
6477 memcpy (result_chain
->address (), dr_chain
.address (),
6478 length
* sizeof (tree
));
6482 /* vect_grouped_load_supported ensures that this is constant. */
6483 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6486 vec_perm_builder
sel (nelt
, nelt
, 1);
6487 sel
.quick_grow (nelt
);
6488 vec_perm_indices indices
;
6489 for (k
= 0; k
< 3; k
++)
6491 for (i
= 0; i
< nelt
; i
++)
6492 if (3 * i
+ k
< 2 * nelt
)
6496 indices
.new_vector (sel
, 2, nelt
);
6497 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
6499 for (i
= 0, j
= 0; i
< nelt
; i
++)
6500 if (3 * i
+ k
< 2 * nelt
)
6503 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6504 indices
.new_vector (sel
, 2, nelt
);
6505 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6507 first_vect
= dr_chain
[0];
6508 second_vect
= dr_chain
[1];
6510 /* Create interleaving stmt (low part of):
6511 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6513 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
6514 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6515 second_vect
, perm3_mask_low
);
6516 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6518 /* Create interleaving stmt (high part of):
6519 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6521 first_vect
= data_ref
;
6522 second_vect
= dr_chain
[2];
6523 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6524 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6525 second_vect
, perm3_mask_high
);
6526 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6527 (*result_chain
)[k
] = data_ref
;
6532 /* If length is not equal to 3 then only power of 2 is supported. */
6533 gcc_assert (pow2p_hwi (length
));
6535 /* The encoding has a single stepped pattern. */
6536 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6537 vec_perm_builder
sel (nelt
, 1, 3);
6539 for (i
= 0; i
< 3; ++i
)
6541 vec_perm_indices
indices (sel
, 2, nelt
);
6542 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
6544 for (i
= 0; i
< 3; ++i
)
6546 indices
.new_vector (sel
, 2, nelt
);
6547 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
6549 for (i
= 0; i
< log_length
; i
++)
6551 for (j
= 0; j
< length
; j
+= 2)
6553 first_vect
= dr_chain
[j
];
6554 second_vect
= dr_chain
[j
+1];
6556 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6557 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
6558 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6559 first_vect
, second_vect
,
6561 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6562 (*result_chain
)[j
/2] = data_ref
;
6564 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6565 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
6566 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6567 first_vect
, second_vect
,
6569 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6570 (*result_chain
)[j
/2+length
/2] = data_ref
;
6572 memcpy (dr_chain
.address (), result_chain
->address (),
6573 length
* sizeof (tree
));
6578 /* Function vect_shift_permute_load_chain.
6580 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6581 sequence of stmts to reorder the input data accordingly.
6582 Return the final references for loads in RESULT_CHAIN.
6583 Return true if successful, false otherwise.
6585 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6586 The input is 3 vectors each containing 8 elements. We assign a
6587 number to each element, the input sequence is:
6589 1st vec: 0 1 2 3 4 5 6 7
6590 2nd vec: 8 9 10 11 12 13 14 15
6591 3rd vec: 16 17 18 19 20 21 22 23
6593 The output sequence should be:
6595 1st vec: 0 3 6 9 12 15 18 21
6596 2nd vec: 1 4 7 10 13 16 19 22
6597 3rd vec: 2 5 8 11 14 17 20 23
6599 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6601 First we shuffle all 3 vectors to get correct elements order:
6603 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6604 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6605 3rd vec: (16 19 22) (17 20 23) (18 21)
6607 Next we unite and shift vector 3 times:
6610 shift right by 6 the concatenation of:
6611 "1st vec" and "2nd vec"
6612 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6613 "2nd vec" and "3rd vec"
6614 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6615 "3rd vec" and "1st vec"
6616 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6619 So that now new vectors are:
6621 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6622 2nd vec: (10 13) (16 19 22) (17 20 23)
6623 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6626 shift right by 5 the concatenation of:
6627 "1st vec" and "3rd vec"
6628 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6629 "2nd vec" and "1st vec"
6630 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6631 "3rd vec" and "2nd vec"
6632 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6635 So that now new vectors are:
6637 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6638 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6639 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6642 shift right by 5 the concatenation of:
6643 "1st vec" and "1st vec"
6644 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6645 shift right by 3 the concatenation of:
6646 "2nd vec" and "2nd vec"
6647 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6650 So that now all vectors are READY:
6651 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6652 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6653 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6655 This algorithm is faster than one in vect_permute_load_chain if:
6656 1. "shift of a concatination" is faster than general permutation.
6658 2. The TARGET machine can't execute vector instructions in parallel.
6659 This is because each step of the algorithm depends on the previous one.
6660 The algorithm in vect_permute_load_chain is much more parallel.
6662 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
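
/* Worked sketch of the simpler two-vector path below (assuming nelt == 8 and
   a chain of two loads V1, V2 holding interleaved pairs):

       vect[0] = VEC_PERM_EXPR <v1, v1, { 0, 2, 4, 6, 1, 3, 5, 7 }>  # evens|odds
       vect[1] = VEC_PERM_EXPR <v2, v2, { 1, 3, 5, 7, 0, 2, 4, 6 }>  # odds|evens
       odd     = VEC_PERM_EXPR <vect[0], vect[1], shift1_mask>   # {4 5 ... 11}
       even    = VEC_PERM_EXPR <vect[0], vect[1], select_mask>   # {0 1 2 3 12 13 14 15}

   ODD concatenates the odd halves of VECT[0] and VECT[1], EVEN the even
   halves, so the pair is de-interleaved with one "shift of a concatenation"
   and one cheap select per step.  */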
6666 vect_shift_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6667 unsigned int length
,
6668 stmt_vec_info stmt_info
,
6669 gimple_stmt_iterator
*gsi
,
6670 vec
<tree
> *result_chain
)
6672 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6673 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6674 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6677 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6678 machine_mode vmode
= TYPE_MODE (vectype
);
6680 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6682 unsigned HOST_WIDE_INT nelt
, vf
;
6683 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6684 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6685 /* Not supported for variable-length vectors. */
6688 vec_perm_builder
sel (nelt
, nelt
, 1);
6689 sel
.quick_grow (nelt
);
6691 result_chain
->quick_grow (length
);
6692 memcpy (result_chain
->address (), dr_chain
.address (),
6693 length
* sizeof (tree
));
6695 if (pow2p_hwi (length
) && vf
> 4)
6697 unsigned int j
, log_length
= exact_log2 (length
);
6698 for (i
= 0; i
< nelt
/ 2; ++i
)
6700 for (i
= 0; i
< nelt
/ 2; ++i
)
6701 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6702 vec_perm_indices
indices (sel
, 2, nelt
);
6703 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6705 if (dump_enabled_p ())
6706 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6707 "shuffle of 2 fields structure is not \
6708 supported by target\n");
6711 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6713 for (i
= 0; i
< nelt
/ 2; ++i
)
6715 for (i
= 0; i
< nelt
/ 2; ++i
)
6716 sel
[nelt
/ 2 + i
] = i
* 2;
6717 indices
.new_vector (sel
, 2, nelt
);
6718 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6720 if (dump_enabled_p ())
6721 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6722 "shuffle of 2 fields structure is not \
6723 supported by target\n");
6726 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6728 /* Generating permutation constant to shift all elements.
6729 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6730 for (i
= 0; i
< nelt
; i
++)
6731 sel
[i
] = nelt
/ 2 + i
;
6732 indices
.new_vector (sel
, 2, nelt
);
6733 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6735 if (dump_enabled_p ())
6736 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6737 "shift permutation is not supported by target\n");
6740 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6742 /* Generating permutation constant to select vector from 2.
6743 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6744 for (i
= 0; i
< nelt
/ 2; i
++)
6746 for (i
= nelt
/ 2; i
< nelt
; i
++)
6748 indices
.new_vector (sel
, 2, nelt
);
6749 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6751 if (dump_enabled_p ())
6752 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6753 "select is not supported by target\n");
6756 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6758 for (i
= 0; i
< log_length
; i
++)
6760 for (j
= 0; j
< length
; j
+= 2)
6762 first_vect
= dr_chain
[j
];
6763 second_vect
= dr_chain
[j
+ 1];
6765 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6766 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6767 first_vect
, first_vect
,
6769 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6772 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6773 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6774 second_vect
, second_vect
,
6776 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6779 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6780 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6781 vect
[0], vect
[1], shift1_mask
);
6782 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6783 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6785 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6786 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6787 vect
[0], vect
[1], select_mask
);
6788 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6789 (*result_chain
)[j
/2] = data_ref
;
6791 memcpy (dr_chain
.address (), result_chain
->address (),
6792 length
* sizeof (tree
));
6796 if (length
== 3 && vf
> 2)
6798 unsigned int k
= 0, l
= 0;
6800 /* Generating permutation constant to get all elements in right order.
6801 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6802 for (i
= 0; i
< nelt
; i
++)
6804 if (3 * k
+ (l
% 3) >= nelt
)
6807 l
+= (3 - (nelt
% 3));
6809 sel
[i
] = 3 * k
+ (l
% 3);
6812 vec_perm_indices
indices (sel
, 2, nelt
);
6813 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6815 if (dump_enabled_p ())
6816 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6817 "shuffle of 3 fields structure is not \
6818 supported by target\n");
6821 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6823 /* Generating permutation constant to shift all elements.
6824 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6825 for (i
= 0; i
< nelt
; i
++)
6826 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6827 indices
.new_vector (sel
, 2, nelt
);
6828 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6830 if (dump_enabled_p ())
6831 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6832 "shift permutation is not supported by target\n");
6835 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6837 /* Generating permutation constant to shift all elements.
6838 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6839 for (i
= 0; i
< nelt
; i
++)
6840 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6841 indices
.new_vector (sel
, 2, nelt
);
6842 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6844 if (dump_enabled_p ())
6845 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6846 "shift permutation is not supported by target\n");
6849 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6851 /* Generating permutation constant to shift all elements.
6852 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6853 for (i
= 0; i
< nelt
; i
++)
6854 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6855 indices
.new_vector (sel
, 2, nelt
);
6856 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6858 if (dump_enabled_p ())
6859 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6860 "shift permutation is not supported by target\n");
6863 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6865 /* Generating permutation constant to shift all elements.
6866 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6867 for (i
= 0; i
< nelt
; i
++)
6868 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6869 indices
.new_vector (sel
, 2, nelt
);
6870 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6872 if (dump_enabled_p ())
6873 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6874 "shift permutation is not supported by target\n");
6877 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6879 for (k
= 0; k
< 3; k
++)
6881 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6882 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6883 dr_chain
[k
], dr_chain
[k
],
6885 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6889 for (k
= 0; k
< 3; k
++)
6891 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6892 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6893 vect
[k
% 3], vect
[(k
+ 1) % 3],
6895 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6896 vect_shift
[k
] = data_ref
;
6899 for (k
= 0; k
< 3; k
++)
6901 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6902 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6903 vect_shift
[(4 - k
) % 3],
6904 vect_shift
[(3 - k
) % 3],
6906 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6910 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6912 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6913 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6914 vect
[0], shift3_mask
);
6915 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6916 (*result_chain
)[nelt
% 3] = data_ref
;
6918 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6919 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6920 vect
[1], shift4_mask
);
6921 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6922 (*result_chain
)[0] = data_ref
;
6928 /* Function vect_transform_grouped_load.
6930 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6931 to perform their permutation and ascribe the result vectorized statements to
6932 the scalar statements.
6936 vect_transform_grouped_load (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6938 int size
, gimple_stmt_iterator
*gsi
)
6941 vec
<tree
> result_chain
= vNULL
;
6943 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6944 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6945 vectors, that are ready for vector computation. */
6946 result_chain
.create (size
);
6948 /* If the reassociation width for the vector type is 2 or greater, the target
6949    machine can execute 2 or more vector instructions in parallel.  Otherwise
6950    try to get the chain for the load group using vect_shift_permute_load_chain.  */
6951 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
6952 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6954 || !vect_shift_permute_load_chain (vinfo
, dr_chain
, size
, stmt_info
,
6955 gsi
, &result_chain
))
6956 vect_permute_load_chain (vinfo
, dr_chain
,
6957 size
, stmt_info
, gsi
, &result_chain
);
6958 vect_record_grouped_load_vectors (vinfo
, stmt_info
, result_chain
);
6959 result_chain
.release ();
6962 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6963 generated as part of the vectorization of STMT_INFO. Assign the statement
6964 for each vector to the associated scalar statement. */
6967 vect_record_grouped_load_vectors (vec_info
*, stmt_vec_info stmt_info
,
6968 vec
<tree
> result_chain
)
6970 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6971 unsigned int i
, gap_count
;
6974 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6975 Since we scan the chain starting from its first node, their order
6976 corresponds to the order of data-refs in RESULT_CHAIN.
6977 stmt_vec_info next_stmt_info
= first_stmt_info
;
6979 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
6981 if (!next_stmt_info
)
6984 /* Skip the gaps. Loads created for the gaps will be removed by dead
6985 code elimination pass later. No need to check for the first stmt in
6986 the group, since it always exists.
6987 DR_GROUP_GAP is the number of steps in elements from the previous
6988 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6989 correspond to the gaps. */
6990 if (next_stmt_info
!= first_stmt_info
6991 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
6997 /* ??? The following needs cleanup after the removal of
6998 DR_GROUP_SAME_DR_STMT. */
7001 gimple
*new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
7002 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7003 copies, and we put the new vector statement last. */
7004 STMT_VINFO_VEC_STMTS (next_stmt_info
).safe_push (new_stmt
);
7006 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (known_le (alignment,
                      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
  else
    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
}
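
/* Example (illustrative): a file-scope "static int buf[64]" defined in this
   translation unit can normally be forced to, say, 256-bit alignment, since
   that is well below MAX_OFILE_ALIGNMENT on typical ELF targets, whereas an
   automatic variable is limited by MAX_STACK_ALIGNMENT instead.  */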
7034 /* Return whether the data reference DR_INFO is supported with respect to its
     alignment.
7036 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
7037 it is aligned, i.e., check if it is possible to vectorize it with a different
     alignment.  */
7040 enum dr_alignment_support
7041 vect_supportable_dr_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
7042 tree vectype
, int misalignment
)
7044 data_reference
*dr
= dr_info
->dr
;
7045 stmt_vec_info stmt_info
= dr_info
->stmt
;
7046 machine_mode mode
= TYPE_MODE (vectype
);
7047 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7048 class loop
*vect_loop
= NULL
;
7049 bool nested_in_vect_loop
= false;
7051 if (misalignment
== 0)
7054 /* For now assume all conditional loads/stores support unaligned
7055 access without any special code. */
7056 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
7057 if (gimple_call_internal_p (stmt
)
7058 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
7059 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
7060 return dr_unaligned_supported
;
7064 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7065 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
7068 /* Possibly unaligned access. */
7070 /* We can choose between using the implicit realignment scheme (generating
7071 a misaligned_move stmt) and the explicit realignment scheme (generating
7072 aligned loads with a REALIGN_LOAD). There are two variants to the
7073 explicit realignment scheme: optimized, and unoptimized.
7074 We can optimize the realignment only if the step between consecutive
7075 vector loads is equal to the vector size. Since the vector memory
7076 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7077 is guaranteed that the misalignment amount remains the same throughout the
7078 execution of the vectorized loop. Therefore, we can create the
7079 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7080 at the loop preheader.
7082 However, in the case of outer-loop vectorization, when vectorizing a
7083 memory access in the inner-loop nested within the LOOP that is now being
7084 vectorized, while it is guaranteed that the misalignment of the
7085 vectorized memory access will remain the same in different outer-loop
7086 iterations, it is *not* guaranteed that it will remain the same throughout
7087 the execution of the inner-loop. This is because the inner-loop advances
7088 with the original scalar step (and not in steps of VS). If the inner-loop
7089 step happens to be a multiple of VS, then the misalignment remains fixed
7090 and we can use the optimized realignment scheme. For example:
7096 When vectorizing the i-loop in the above example, the step between
7097 consecutive vector loads is 1, and so the misalignment does not remain
7098 fixed across the execution of the inner-loop, and the realignment cannot
7099 be optimized (as illustrated in the following pseudo vectorized loop):
7101 for (i=0; i<N; i+=4)
7102 for (j=0; j<M; j++){
7103 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7104 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7105 // (assuming that we start from an aligned address).
7108 We therefore have to use the unoptimized realignment scheme:
7110 for (i=0; i<N; i+=4)
7111 for (j=k; j<M; j+=4)
7112 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7113 // that the misalignment of the initial address is
7116 The loop can then be vectorized as follows:
7118 for (k=0; k<4; k++){
7119 rt = get_realignment_token (&vp[k]);
7120 for (i=0; i<N; i+=4){
7122 for (j=k; j<M; j+=4){
7124 va = REALIGN_LOAD <v1,v2,rt>;
7131 if (DR_IS_READ (dr
))
7133 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
7134 && (!targetm
.vectorize
.builtin_mask_for_load
7135 || targetm
.vectorize
.builtin_mask_for_load ()))
7137 /* If we are doing SLP then the accesses need not have the
7138 same alignment, instead it depends on the SLP group size. */
7140 && STMT_SLP_TYPE (stmt_info
)
7141 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
7142 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7144 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
7145 TYPE_VECTOR_SUBPARTS (vectype
))))
7147 else if (!loop_vinfo
7148 || (nested_in_vect_loop
7149 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
7150 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
7151 return dr_explicit_realign
;
7153 return dr_explicit_realign_optimized
;
7157 bool is_packed
= false;
7158 tree type
= TREE_TYPE (DR_REF (dr
));
7159 if (misalignment
== DR_MISALIGNMENT_UNKNOWN
)
7160 is_packed
= not_size_aligned (DR_REF (dr
));
7161 if (targetm
.vectorize
.support_vector_misalignment (mode
, type
, misalignment
,
7163 return dr_unaligned_supported
;
7166 return dr_unaligned_unsupported
;