1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
62 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
63 tree vectype
, unsigned HOST_WIDE_INT count
)
65 machine_mode mode
, array_mode
;
68 mode
= TYPE_MODE (vectype
);
69 if (!targetm
.array_mode (mode
, count
).exists (&array_mode
))
71 poly_uint64 bits
= count
* GET_MODE_BITSIZE (mode
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 if (!int_mode_for_size (bits
, limit_p
).exists (&array_mode
))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode
), count
);
83 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
87 "cannot use %s<%s><%s>\n", name
,
88 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE
, vect_location
,
94 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
95 GET_MODE_NAME (mode
));
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info
, tree scalar_type
)
121 HOST_WIDE_INT lhs
, rhs
;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
128 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
130 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
133 scalar_type
= TREE_TYPE (gimple_assign_lhs (assign
));
134 if (gimple_assign_cast_p (assign
)
135 || gimple_assign_rhs_code (assign
) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign
) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign
) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign
) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign
) == FLOAT_EXPR
)
141 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
143 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
145 scalar_type
= rhs_type
;
148 else if (gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
151 if (gimple_call_internal_p (call
))
153 internal_fn ifn
= gimple_call_internal_fn (call
);
154 if (internal_load_fn_p (ifn
))
155 /* For loads the LHS type does the trick. */
157 else if (internal_store_fn_p (ifn
))
159 /* For stores use the tyep of the stored value. */
160 i
= internal_fn_stored_value_index (ifn
);
161 scalar_type
= TREE_TYPE (gimple_call_arg (call
, i
));
164 else if (internal_fn_mask_index (ifn
) == 0)
167 if (i
< gimple_call_num_args (call
))
169 tree rhs_type
= TREE_TYPE (gimple_call_arg (call
, i
));
170 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type
)))
172 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
174 scalar_type
= rhs_type
;
183 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
184 tested at run-time. Return TRUE if DDR was successfully inserted.
185 Return false if versioning is not supported. */
188 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
190 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
192 if ((unsigned) param_vect_max_version_for_alias_checks
== 0)
193 return opt_result::failure_at (vect_location
,
194 "will not create alias checks, as"
195 " --param vect-max-version-for-alias-checks"
199 = runtime_alias_check_p (ddr
, loop
,
200 optimize_loop_nest_for_speed_p (loop
));
204 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
205 return opt_result::success ();
208 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
211 vect_check_nonzero_value (loop_vec_info loop_vinfo
, tree value
)
213 const vec
<tree
> &checks
= LOOP_VINFO_CHECK_NONZERO (loop_vinfo
);
214 for (unsigned int i
= 0; i
< checks
.length(); ++i
)
215 if (checks
[i
] == value
)
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE
, vect_location
,
220 "need run-time check that %T is nonzero\n",
222 LOOP_VINFO_CHECK_NONZERO (loop_vinfo
).safe_push (value
);
225 /* Return true if we know that the order of vectorized DR_INFO_A and
226 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
227 DR_INFO_B. At least one of the accesses is a write. */
230 vect_preserves_scalar_order_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
)
232 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
233 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
235 /* Single statements are always kept in their original order. */
236 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
237 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
240 /* STMT_A and STMT_B belong to overlapping groups. All loads are
241 emitted at the position of the first scalar load.
242 Stores in a group are emitted at the position of the last scalar store.
243 Compute that position and check whether the resulting order matches
245 stmt_vec_info il_a
= DR_GROUP_FIRST_ELEMENT (stmtinfo_a
);
248 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a
)))
249 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
250 s
= DR_GROUP_NEXT_ELEMENT (s
))
251 il_a
= get_later_stmt (il_a
, s
);
252 else /* DR_IS_READ */
253 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
254 s
= DR_GROUP_NEXT_ELEMENT (s
))
255 if (get_later_stmt (il_a
, s
) == il_a
)
260 stmt_vec_info il_b
= DR_GROUP_FIRST_ELEMENT (stmtinfo_b
);
263 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b
)))
264 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
265 s
= DR_GROUP_NEXT_ELEMENT (s
))
266 il_b
= get_later_stmt (il_b
, s
);
267 else /* DR_IS_READ */
268 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
269 s
= DR_GROUP_NEXT_ELEMENT (s
))
270 if (get_later_stmt (il_b
, s
) == il_b
)
275 bool a_after_b
= (get_later_stmt (stmtinfo_a
, stmtinfo_b
) == stmtinfo_a
);
276 return (get_later_stmt (il_a
, il_b
) == il_a
) == a_after_b
;
279 /* A subroutine of vect_analyze_data_ref_dependence. Handle
280 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
281 distances. These distances are conservatively correct but they don't
282 reflect a guaranteed dependence.
284 Return true if this function does all the work necessary to avoid
285 an alias or false if the caller should use the dependence distances
286 to limit the vectorization factor in the usual way. LOOP_DEPTH is
287 the depth of the loop described by LOOP_VINFO and the other arguments
288 are as for vect_analyze_data_ref_dependence. */
291 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
292 loop_vec_info loop_vinfo
,
293 int loop_depth
, unsigned int *max_vf
)
295 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
296 for (lambda_vector
&dist_v
: DDR_DIST_VECTS (ddr
))
298 int dist
= dist_v
[loop_depth
];
299 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
301 /* If the user asserted safelen >= DIST consecutive iterations
302 can be executed concurrently, assume independence.
304 ??? An alternative would be to add the alias check even
305 in this case, and vectorize the fallback loop with the
306 maximum VF set to safelen. However, if the user has
307 explicitly given a length, it's less likely that that
309 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
311 if ((unsigned int) loop
->safelen
< *max_vf
)
312 *max_vf
= loop
->safelen
;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
317 /* For dependence distances of 2 or more, we have the option
318 of limiting VF or checking for an alias at runtime.
319 Prefer to check at runtime if we can, to avoid limiting
320 the VF unnecessarily when the bases are in fact independent.
322 Note that the alias checks will be removed if the VF ends up
323 being small enough. */
324 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
325 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
326 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a
->stmt
)
327 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b
->stmt
)
328 && vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
));
335 /* Function vect_analyze_data_ref_dependence.
337 FIXME: I needed to change the sense of the returned flag.
339 Return FALSE if there (might) exist a dependence between a memory-reference
340 DRA and a memory-reference DRB. When versioning for alias may check a
341 dependence at run-time, return TRUE. Adjust *MAX_VF according to
342 the data dependence. */
345 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
346 loop_vec_info loop_vinfo
,
347 unsigned int *max_vf
)
350 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
351 struct data_reference
*dra
= DDR_A (ddr
);
352 struct data_reference
*drb
= DDR_B (ddr
);
353 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (dra
);
354 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (drb
);
355 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
356 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
357 lambda_vector dist_v
;
358 unsigned int loop_depth
;
360 /* If user asserted safelen consecutive iterations can be
361 executed concurrently, assume independence. */
362 auto apply_safelen
= [&]()
364 if (loop
->safelen
>= 2)
366 if ((unsigned int) loop
->safelen
< *max_vf
)
367 *max_vf
= loop
->safelen
;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
374 /* In loop analysis all data references should be vectorizable. */
375 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
376 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
379 /* Independent data accesses. */
380 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
381 return opt_result::success ();
384 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
385 return opt_result::success ();
387 /* We do not have to consider dependences between accesses that belong
388 to the same group, unless the stride could be smaller than the
390 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
391 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
392 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
393 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
394 return opt_result::success ();
396 /* Even if we have an anti-dependence then, as the vectorized loop covers at
397 least two scalar iterations, there is always also a true dependence.
398 As the vectorizer does not re-order loads and stores we can ignore
399 the anti-dependence if TBAA can disambiguate both DRs similar to the
400 case with known negative distance anti-dependences (positive
401 distance anti-dependences would violate TBAA constraints). */
402 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
403 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
404 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
405 get_alias_set (DR_REF (drb
))))
406 return opt_result::success ();
408 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
409 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
411 if (apply_safelen ())
412 return opt_result::success ();
414 return opt_result::failure_at
416 "possible alias involving gather/scatter between %T and %T\n",
417 DR_REF (dra
), DR_REF (drb
));
420 /* Unknown data dependence. */
421 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
423 if (apply_safelen ())
424 return opt_result::success ();
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
428 "versioning for alias required: "
429 "can't determine dependence between %T and %T\n",
430 DR_REF (dra
), DR_REF (drb
));
432 /* Add to list of ddrs that need to be tested at run-time. */
433 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
436 /* Known data dependence. */
437 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
439 if (apply_safelen ())
440 return opt_result::success ();
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
444 "versioning for alias required: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra
), DR_REF (drb
));
447 /* Add to list of ddrs that need to be tested at run-time. */
448 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
451 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
453 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
454 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
456 return opt_result::success ();
458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
460 int dist
= dist_v
[loop_depth
];
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE
, vect_location
,
464 "dependence distance = %d.\n", dist
);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE
, vect_location
,
470 "dependence distance == 0 between %T and %T\n",
471 DR_REF (dra
), DR_REF (drb
));
473 /* When we perform grouped accesses and perform implicit CSE
474 by detecting equal accesses and doing disambiguation with
475 runtime alias tests like for
483 where we will end up loading { a[i], a[i+1] } once, make
484 sure that inserting group loads before the first load and
485 stores after the last store will do the right thing.
486 Similar for groups like
490 where loads from the group interleave with the store. */
491 if (!vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
492 return opt_result::failure_at (stmtinfo_a
->stmt
,
493 "READ_WRITE dependence"
494 " in interleaving.\n");
496 if (loop
->safelen
< 2)
498 tree indicator
= dr_zero_step_indicator (dra
);
499 if (!indicator
|| integer_zerop (indicator
))
500 return opt_result::failure_at (stmtinfo_a
->stmt
,
501 "access also has a zero step\n");
502 else if (TREE_CODE (indicator
) != INTEGER_CST
)
503 vect_check_nonzero_value (loop_vinfo
, indicator
);
508 if (dist
> 0 && DDR_REVERSED_P (ddr
))
510 /* If DDR_REVERSED_P the order of the data-refs in DDR was
511 reversed (to make distance vector positive), and the actual
512 distance is negative. */
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE
, vect_location
,
515 "dependence distance negative.\n");
516 /* When doing outer loop vectorization, we need to check if there is
517 a backward dependence at the inner loop level if the dependence
518 at the outer loop is reversed. See PR81740. */
519 if (nested_in_vect_loop_p (loop
, stmtinfo_a
)
520 || nested_in_vect_loop_p (loop
, stmtinfo_b
))
522 unsigned inner_depth
= index_in_loop_nest (loop
->inner
->num
,
523 DDR_LOOP_NEST (ddr
));
524 if (dist_v
[inner_depth
] < 0)
525 return opt_result::failure_at (stmtinfo_a
->stmt
,
526 "not vectorized, dependence "
527 "between data-refs %T and %T\n",
528 DR_REF (dra
), DR_REF (drb
));
530 /* Record a negative dependence distance to later limit the
531 amount of stmt copying / unrolling we can perform.
532 Only need to handle read-after-write dependence. */
534 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
535 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
536 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
540 unsigned int abs_dist
= abs (dist
);
541 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
543 /* The dependence distance requires reduction of the maximal
544 vectorization factor. */
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE
, vect_location
,
548 "adjusting maximal vectorization factor to %i\n",
552 if (abs_dist
>= *max_vf
)
554 /* Dependence distance does not create dependence, as far as
555 vectorization is concerned, in this case. */
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE
, vect_location
,
558 "dependence distance >= VF.\n");
562 return opt_result::failure_at (stmtinfo_a
->stmt
,
563 "not vectorized, possible dependence "
564 "between data-refs %T and %T\n",
565 DR_REF (dra
), DR_REF (drb
));
568 return opt_result::success ();
571 /* Function vect_analyze_data_ref_dependences.
573 Examine all the data references in the loop, and make sure there do not
574 exist any data dependences between them. Set *MAX_VF according to
575 the maximum vectorization factor the data dependences allow. */
578 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
579 unsigned int *max_vf
)
582 struct data_dependence_relation
*ddr
;
584 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
586 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
588 LOOP_VINFO_DDRS (loop_vinfo
)
589 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
590 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
591 /* We do not need read-read dependences. */
592 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
593 &LOOP_VINFO_DDRS (loop_vinfo
),
594 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
599 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
601 /* For epilogues we either have no aliases or alias versioning
602 was applied to original loop. Therefore we may just get max_vf
603 using VF of original loop. */
604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
605 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
607 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
610 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
615 return opt_result::success ();
619 /* Function vect_slp_analyze_data_ref_dependence.
621 Return TRUE if there (might) exist a dependence between a memory-reference
622 DRA and a memory-reference DRB for VINFO. When versioning for alias
623 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
624 according to the data dependence. */
627 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
628 struct data_dependence_relation
*ddr
)
630 struct data_reference
*dra
= DDR_A (ddr
);
631 struct data_reference
*drb
= DDR_B (ddr
);
632 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
633 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
635 /* We need to check dependences of statements marked as unvectorizable
636 as well, they still can prohibit vectorization. */
638 /* Independent data accesses. */
639 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
645 /* Read-read is OK. */
646 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
649 /* If dra and drb are part of the same interleaving chain consider
651 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
652 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
653 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
656 /* Unknown data dependence. */
657 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
661 "can't determine dependence between %T and %T\n",
662 DR_REF (dra
), DR_REF (drb
));
664 else if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE
, vect_location
,
666 "determined dependence between %T and %T\n",
667 DR_REF (dra
), DR_REF (drb
));
673 /* Analyze dependences involved in the transform of a store SLP NODE. */
676 vect_slp_analyze_store_dependences (vec_info
*vinfo
, slp_tree node
)
678 /* This walks over all stmts involved in the SLP store done
679 in NODE verifying we can sink them up to the last stmt in the
681 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
682 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info
)));
684 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
686 stmt_vec_info access_info
687 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
688 if (access_info
== last_access_info
)
690 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
692 bool ref_initialized_p
= false;
693 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
694 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
696 gimple
*stmt
= gsi_stmt (gsi
);
697 if (! gimple_vuse (stmt
))
700 /* If we couldn't record a (single) data reference for this
701 stmt we have to resort to the alias oracle. */
702 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
703 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
706 /* We are moving a store - this means
707 we cannot use TBAA for disambiguation. */
708 if (!ref_initialized_p
)
709 ao_ref_init (&ref
, DR_REF (dr_a
));
710 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
711 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
716 gcc_assert (!gimple_visited_p (stmt
));
718 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
720 bool dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
721 free_dependence_relation (ddr
);
729 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
730 contain the vector of scalar stores of this instance if we are
731 disambiguating the loads. */
734 vect_slp_analyze_load_dependences (vec_info
*vinfo
, slp_tree node
,
735 vec
<stmt_vec_info
> stores
,
736 stmt_vec_info last_store_info
)
738 /* This walks over all stmts involved in the SLP load done
739 in NODE verifying we can hoist them up to the first stmt in the
741 stmt_vec_info first_access_info
= vect_find_first_scalar_stmt_in_slp (node
);
742 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info
)));
744 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
746 stmt_vec_info access_info
747 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
748 if (access_info
== first_access_info
)
750 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
752 bool ref_initialized_p
= false;
753 hash_set
<stmt_vec_info
> grp_visited
;
754 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
755 gsi_stmt (gsi
) != first_access_info
->stmt
; gsi_prev (&gsi
))
757 gimple
*stmt
= gsi_stmt (gsi
);
758 if (! gimple_vdef (stmt
))
761 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
763 /* If we run into a store of this same instance (we've just
764 marked those) then delay dependence checking until we run
765 into the last store because this is where it will have
766 been sunk to (and we verified that we can do that already). */
767 if (gimple_visited_p (stmt
))
769 if (stmt_info
!= last_store_info
)
772 for (stmt_vec_info
&store_info
: stores
)
774 data_reference
*store_dr
= STMT_VINFO_DATA_REF (store_info
);
775 ddr_p ddr
= initialize_data_dependence_relation
776 (dr_a
, store_dr
, vNULL
);
778 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
779 free_dependence_relation (ddr
);
786 auto check_hoist
= [&] (stmt_vec_info stmt_info
) -> bool
788 /* We are hoisting a load - this means we can use TBAA for
790 if (!ref_initialized_p
)
791 ao_ref_init (&ref
, DR_REF (dr_a
));
792 if (stmt_may_clobber_ref_p_1 (stmt_info
->stmt
, &ref
, true))
794 /* If we couldn't record a (single) data reference for this
795 stmt we have to give up now. */
796 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
799 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
802 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
803 free_dependence_relation (ddr
);
810 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
812 /* When we run into a store group we have to honor
813 that earlier stores might be moved here. We don't
814 know exactly which and where to since we lack a
815 back-mapping from DR to SLP node, so assume all
816 earlier stores are sunk here. It's enough to
817 consider the last stmt of a group for this.
818 ??? Both this and the fact that we disregard that
819 the conflicting instance might be removed later
820 is overly conservative. */
821 if (!grp_visited
.add (DR_GROUP_FIRST_ELEMENT (stmt_info
)))
822 for (auto store_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
824 store_info
= DR_GROUP_NEXT_ELEMENT (store_info
))
825 if ((store_info
== stmt_info
826 || get_later_stmt (store_info
, stmt_info
) == stmt_info
)
827 && !check_hoist (store_info
))
832 if (!check_hoist (stmt_info
))
841 /* Function vect_analyze_data_ref_dependences.
843 Examine all the data references in the basic-block, and make sure there
844 do not exist any data dependences between them. Set *MAX_VF according to
845 the maximum vectorization factor the data dependences allow. */
848 vect_slp_analyze_instance_dependence (vec_info
*vinfo
, slp_instance instance
)
850 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
852 /* The stores of this instance are at the root of the SLP tree. */
853 slp_tree store
= NULL
;
854 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
)
855 store
= SLP_INSTANCE_TREE (instance
);
857 /* Verify we can sink stores to the vectorized stmt insert location. */
858 stmt_vec_info last_store_info
= NULL
;
861 if (! vect_slp_analyze_store_dependences (vinfo
, store
))
864 /* Mark stores in this instance and remember the last one. */
865 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
866 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
867 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
872 /* Verify we can sink loads to the vectorized stmt insert location,
873 special-casing stores of this instance. */
874 for (slp_tree
&load
: SLP_INSTANCE_LOADS (instance
))
875 if (! vect_slp_analyze_load_dependences (vinfo
, load
,
877 ? SLP_TREE_SCALAR_STMTS (store
)
878 : vNULL
, last_store_info
))
884 /* Unset the visited flag. */
886 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
887 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
892 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
896 dr_misalignment (dr_vec_info
*dr_info
, tree vectype
, poly_int64 offset
)
898 HOST_WIDE_INT diff
= 0;
899 /* Alignment is only analyzed for the first element of a DR group,
900 use that but adjust misalignment by the offset of the access. */
901 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
))
903 dr_vec_info
*first_dr
904 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
));
905 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
906 INTEGER_CSTs and the first element in the group has the lowest
908 diff
= (TREE_INT_CST_LOW (DR_INIT (dr_info
->dr
))
909 - TREE_INT_CST_LOW (DR_INIT (first_dr
->dr
)));
910 gcc_assert (diff
>= 0);
914 int misalign
= dr_info
->misalignment
;
915 gcc_assert (misalign
!= DR_MISALIGNMENT_UNINITIALIZED
);
916 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
919 /* If the access is only aligned for a vector type with smaller alignment
920 requirement the access has unknown misalignment. */
921 if (maybe_lt (dr_info
->target_alignment
* BITS_PER_UNIT
,
922 targetm
.vectorize
.preferred_vector_alignment (vectype
)))
923 return DR_MISALIGNMENT_UNKNOWN
;
925 /* Apply the offset from the DR group start and the externally supplied
926 offset which can for example result from a negative stride access. */
927 poly_int64 misalignment
= misalign
+ diff
+ offset
;
929 /* vect_compute_data_ref_alignment will have ensured that target_alignment
930 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
931 unsigned HOST_WIDE_INT target_alignment_c
932 = dr_info
->target_alignment
.to_constant ();
933 if (!known_misalignment (misalignment
, target_alignment_c
, &misalign
))
934 return DR_MISALIGNMENT_UNKNOWN
;
938 /* Record the base alignment guarantee given by DRB, which occurs
942 vect_record_base_alignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
943 innermost_loop_behavior
*drb
)
946 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> &entry
947 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
948 if (!existed
|| entry
.second
->base_alignment
< drb
->base_alignment
)
950 entry
= std::make_pair (stmt_info
, drb
);
951 if (dump_enabled_p ())
952 dump_printf_loc (MSG_NOTE
, vect_location
,
953 "recording new base alignment for %T\n"
955 " misalignment: %d\n"
959 drb
->base_misalignment
,
964 /* If the region we're going to vectorize is reached, all unconditional
965 data references occur at least once. We can therefore pool the base
966 alignment guarantees from each unconditional reference. Do this by
967 going through all the data references in VINFO and checking whether
968 the containing statement makes the reference unconditionally. If so,
969 record the alignment of the base address in VINFO so that it can be
970 used for all other references with the same base. */
973 vect_record_base_alignments (vec_info
*vinfo
)
975 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
976 class loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
977 for (data_reference
*dr
: vinfo
->shared
->datarefs
)
979 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
980 stmt_vec_info stmt_info
= dr_info
->stmt
;
981 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
982 && STMT_VINFO_VECTORIZABLE (stmt_info
)
983 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
985 vect_record_base_alignment (vinfo
, stmt_info
, &DR_INNERMOST (dr
));
987 /* If DR is nested in the loop that is being vectorized, we can also
988 record the alignment of the base wrt the outer loop. */
989 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
990 vect_record_base_alignment
991 (vinfo
, stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
996 /* Function vect_compute_data_ref_alignment
998 Compute the misalignment of the data reference DR_INFO when vectorizing
1002 1. initialized misalignment info for DR_INFO
1004 FOR NOW: No analysis is actually performed. Misalignment is calculated
1005 only for trivial cases. TODO. */
1008 vect_compute_data_ref_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1011 stmt_vec_info stmt_info
= dr_info
->stmt
;
1012 vec_base_alignments
*base_alignments
= &vinfo
->base_alignments
;
1013 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1014 class loop
*loop
= NULL
;
1015 tree ref
= DR_REF (dr_info
->dr
);
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_NOTE
, vect_location
,
1019 "vect_compute_data_ref_alignment:\n");
1022 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1024 /* Initialize misalignment to unknown. */
1025 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1027 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1030 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
1031 bool step_preserves_misalignment_p
;
1033 poly_uint64 vector_alignment
1034 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
1036 SET_DR_TARGET_ALIGNMENT (dr_info
, vector_alignment
);
1038 /* If the main loop has peeled for alignment we have no way of knowing
1039 whether the data accesses in the epilogues are aligned. We can't at
1040 compile time answer the question whether we have entered the main loop or
1041 not. Fixes PR 92351. */
1044 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1046 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
) != 0)
1050 unsigned HOST_WIDE_INT vect_align_c
;
1051 if (!vector_alignment
.is_constant (&vect_align_c
))
1054 /* No step for BB vectorization. */
1057 gcc_assert (integer_zerop (drb
->step
));
1058 step_preserves_misalignment_p
= true;
1061 /* In case the dataref is in an inner-loop of the loop that is being
1062 vectorized (LOOP), we use the base and misalignment information
1063 relative to the outer-loop (LOOP). This is ok only if the misalignment
1064 stays the same throughout the execution of the inner-loop, which is why
1065 we have to check that the stride of the dataref in the inner-loop evenly
1066 divides by the vector alignment. */
1067 else if (nested_in_vect_loop_p (loop
, stmt_info
))
1069 step_preserves_misalignment_p
1070 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
1072 if (dump_enabled_p ())
1074 if (step_preserves_misalignment_p
)
1075 dump_printf_loc (MSG_NOTE
, vect_location
,
1076 "inner step divides the vector alignment.\n");
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1079 "inner step doesn't divide the vector"
1084 /* Similarly we can only use base and misalignment information relative to
1085 an innermost loop if the misalignment stays the same throughout the
1086 execution of the loop. As above, this is the case if the stride of
1087 the dataref evenly divides by the alignment. */
1090 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1091 step_preserves_misalignment_p
1092 = multiple_p (DR_STEP_ALIGNMENT (dr_info
->dr
) * vf
, vect_align_c
);
1094 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1096 "step doesn't divide the vector alignment.\n");
1099 unsigned int base_alignment
= drb
->base_alignment
;
1100 unsigned int base_misalignment
= drb
->base_misalignment
;
1102 /* Calculate the maximum of the pooled base address alignment and the
1103 alignment that we can compute for DR itself. */
1104 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> *entry
1105 = base_alignments
->get (drb
->base_address
);
1107 && base_alignment
< (*entry
).second
->base_alignment
1109 || (dominated_by_p (CDI_DOMINATORS
, gimple_bb (stmt_info
->stmt
),
1110 gimple_bb (entry
->first
->stmt
))
1111 && (gimple_bb (stmt_info
->stmt
) != gimple_bb (entry
->first
->stmt
)
1112 || (entry
->first
->dr_aux
.group
<= dr_info
->group
)))))
1114 base_alignment
= entry
->second
->base_alignment
;
1115 base_misalignment
= entry
->second
->base_misalignment
;
1118 if (drb
->offset_alignment
< vect_align_c
1119 || !step_preserves_misalignment_p
1120 /* We need to know whether the step wrt the vectorized loop is
1121 negative when computing the starting misalignment below. */
1122 || TREE_CODE (drb
->step
) != INTEGER_CST
)
1124 if (dump_enabled_p ())
1125 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1126 "Unknown alignment for access: %T\n", ref
);
1130 if (base_alignment
< vect_align_c
)
1132 unsigned int max_alignment
;
1133 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
1134 if (max_alignment
< vect_align_c
1135 || !vect_can_force_dr_alignment_p (base
,
1136 vect_align_c
* BITS_PER_UNIT
))
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_NOTE
, vect_location
,
1140 "can't force alignment of ref: %T\n", ref
);
1144 /* Force the alignment of the decl.
1145 NOTE: This is the only change to the code we make during
1146 the analysis phase, before deciding to vectorize the loop. */
1147 if (dump_enabled_p ())
1148 dump_printf_loc (MSG_NOTE
, vect_location
,
1149 "force alignment of %T\n", ref
);
1151 dr_info
->base_decl
= base
;
1152 dr_info
->base_misaligned
= true;
1153 base_misalignment
= 0;
1155 poly_int64 misalignment
1156 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
1158 unsigned int const_misalignment
;
1159 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1163 "Non-constant misalignment for access: %T\n", ref
);
1167 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
1169 if (dump_enabled_p ())
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1171 "misalign = %d bytes of ref %T\n",
1172 const_misalignment
, ref
);
1177 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1178 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1179 is made aligned via peeling. */
1182 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info
*dr_info
,
1183 dr_vec_info
*dr_peel_info
)
1185 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info
),
1186 DR_TARGET_ALIGNMENT (dr_info
)))
1188 poly_offset_int diff
1189 = (wi::to_poly_offset (DR_INIT (dr_peel_info
->dr
))
1190 - wi::to_poly_offset (DR_INIT (dr_info
->dr
)));
1191 if (known_eq (diff
, 0)
1192 || multiple_p (diff
, DR_TARGET_ALIGNMENT (dr_info
)))
1198 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1199 aligned via peeling. */
1202 vect_dr_aligned_if_peeled_dr_is (dr_vec_info
*dr_info
,
1203 dr_vec_info
*dr_peel_info
)
1205 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info
->dr
),
1206 DR_BASE_ADDRESS (dr_peel_info
->dr
), 0)
1207 || !operand_equal_p (DR_OFFSET (dr_info
->dr
),
1208 DR_OFFSET (dr_peel_info
->dr
), 0)
1209 || !operand_equal_p (DR_STEP (dr_info
->dr
),
1210 DR_STEP (dr_peel_info
->dr
), 0))
1213 return vect_dr_aligned_if_related_peeled_dr_is (dr_info
, dr_peel_info
);
1216 /* Compute the value for dr_info->misalign so that the access appears
1217 aligned. This is used by peeling to compensate for dr_misalignment
1218 applying the offset for negative step. */
1221 vect_dr_misalign_for_aligned_access (dr_vec_info
*dr_info
)
1223 if (tree_int_cst_sgn (DR_STEP (dr_info
->dr
)) >= 0)
1226 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1227 poly_int64 misalignment
1228 = ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1229 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1231 unsigned HOST_WIDE_INT target_alignment_c
;
1233 if (!dr_info
->target_alignment
.is_constant (&target_alignment_c
)
1234 || !known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1235 return DR_MISALIGNMENT_UNKNOWN
;
1239 /* Function vect_update_misalignment_for_peel.
1240 Sets DR_INFO's misalignment
1241 - to 0 if it has the same alignment as DR_PEEL_INFO,
1242 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1243 - to -1 (unknown) otherwise.
1245 DR_INFO - the data reference whose misalignment is to be adjusted.
1246 DR_PEEL_INFO - the data reference whose misalignment is being made
1247 zero in the vector loop by the peel.
1248 NPEEL - the number of iterations in the peel loop if the misalignment
1249 of DR_PEEL_INFO is known at compile time. */
1252 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1253 dr_vec_info
*dr_peel_info
, int npeel
)
1255 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1256 if (vect_dr_aligned_if_peeled_dr_is (dr_info
, dr_peel_info
))
1258 SET_DR_MISALIGNMENT (dr_info
,
1259 vect_dr_misalign_for_aligned_access (dr_peel_info
));
1263 unsigned HOST_WIDE_INT alignment
;
1264 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1265 && known_alignment_for_access_p (dr_info
,
1266 STMT_VINFO_VECTYPE (dr_info
->stmt
))
1267 && known_alignment_for_access_p (dr_peel_info
,
1268 STMT_VINFO_VECTYPE (dr_peel_info
->stmt
)))
1270 int misal
= dr_info
->misalignment
;
1271 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1272 misal
&= alignment
- 1;
1273 set_dr_misalignment (dr_info
, misal
);
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1279 "to unknown (-1).\n");
1280 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1283 /* Return true if alignment is relevant for DR_INFO. */
1286 vect_relevant_for_alignment_p (dr_vec_info
*dr_info
)
1288 stmt_vec_info stmt_info
= dr_info
->stmt
;
1290 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1293 /* For interleaving, only the alignment of the first access matters. */
1294 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1295 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1298 /* Scatter-gather and invariant accesses continue to address individual
1299 scalars, so vector-level alignment is irrelevant. */
1300 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1301 || integer_zerop (DR_STEP (dr_info
->dr
)))
1304 /* Strided accesses perform only component accesses, alignment is
1305 irrelevant for them. */
1306 if (STMT_VINFO_STRIDED_P (stmt_info
)
1307 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1313 /* Given an memory reference EXP return whether its alignment is less
1317 not_size_aligned (tree exp
)
1319 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1322 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1323 > get_object_alignment (exp
));
1326 /* Function vector_alignment_reachable_p
1328 Return true if vector alignment for DR_INFO is reachable by peeling
1329 a few loop iterations. Return false otherwise. */
1332 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
1334 stmt_vec_info stmt_info
= dr_info
->stmt
;
1335 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1337 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1339 /* For interleaved access we peel only if number of iterations in
1340 the prolog loop ({VF - misalignment}), is a multiple of the
1341 number of the interleaved accesses. */
1342 int elem_size
, mis_in_elements
;
1344 /* FORNOW: handle only known alignment. */
1345 if (!known_alignment_for_access_p (dr_info
, vectype
))
1348 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1349 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1350 elem_size
= vector_element_size (vector_size
, nelements
);
1351 mis_in_elements
= dr_misalignment (dr_info
, vectype
) / elem_size
;
1353 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1357 /* If misalignment is known at the compile time then allow peeling
1358 only if natural alignment is reachable through peeling. */
1359 if (known_alignment_for_access_p (dr_info
, vectype
)
1360 && !aligned_access_p (dr_info
, vectype
))
1362 HOST_WIDE_INT elmsize
=
1363 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1364 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_NOTE
, vect_location
,
1367 "data size = %wd. misalignment = %d.\n", elmsize
,
1368 dr_misalignment (dr_info
, vectype
));
1370 if (dr_misalignment (dr_info
, vectype
) % elmsize
)
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1374 "data size does not divide the misalignment.\n");
1379 if (!known_alignment_for_access_p (dr_info
, vectype
))
1381 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1382 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1385 "Unknown misalignment, %snaturally aligned\n",
1386 is_packed
? "not " : "");
1387 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1394 /* Calculate the cost of the memory access represented by DR_INFO. */
1397 vect_get_data_access_cost (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1398 dr_alignment_support alignment_support_scheme
,
1400 unsigned int *inside_cost
,
1401 unsigned int *outside_cost
,
1402 stmt_vector_for_cost
*body_cost_vec
,
1403 stmt_vector_for_cost
*prologue_cost_vec
)
1405 stmt_vec_info stmt_info
= dr_info
->stmt
;
1406 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1409 if (PURE_SLP_STMT (stmt_info
))
1412 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1414 if (DR_IS_READ (dr_info
->dr
))
1415 vect_get_load_cost (vinfo
, stmt_info
, ncopies
, alignment_support_scheme
,
1416 misalignment
, true, inside_cost
,
1417 outside_cost
, prologue_cost_vec
, body_cost_vec
, false);
1419 vect_get_store_cost (vinfo
,stmt_info
, ncopies
, alignment_support_scheme
,
1420 misalignment
, inside_cost
, body_cost_vec
);
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE
, vect_location
,
1424 "vect_get_data_access_cost: inside_cost = %d, "
1425 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1429 typedef struct _vect_peel_info
1431 dr_vec_info
*dr_info
;
1436 typedef struct _vect_peel_extended_info
1439 struct _vect_peel_info peel_info
;
1440 unsigned int inside_cost
;
1441 unsigned int outside_cost
;
1442 } *vect_peel_extended_info
;
1445 /* Peeling hashtable helpers. */
1447 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1449 static inline hashval_t
hash (const _vect_peel_info
*);
1450 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1454 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1456 return (hashval_t
) peel_info
->npeel
;
1460 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1462 return (a
->npeel
== b
->npeel
);
1466 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1469 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1470 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1471 int npeel
, bool supportable_if_not_aligned
)
1473 struct _vect_peel_info elem
, *slot
;
1474 _vect_peel_info
**new_slot
;
1477 slot
= peeling_htab
->find (&elem
);
1482 slot
= XNEW (struct _vect_peel_info
);
1483 slot
->npeel
= npeel
;
1484 slot
->dr_info
= dr_info
;
1486 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1490 /* If this DR is not supported with unknown misalignment then bias
1491 this slot when the cost model is disabled. */
1492 if (!supportable_if_not_aligned
1493 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1494 slot
->count
+= VECT_MAX_COST
;
1498 /* Traverse peeling hash table to find peeling option that aligns maximum
1499 number of data accesses. */
1502 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1503 _vect_peel_extended_info
*max
)
1505 vect_peel_info elem
= *slot
;
1507 if (elem
->count
> max
->peel_info
.count
1508 || (elem
->count
== max
->peel_info
.count
1509 && max
->peel_info
.npeel
> elem
->npeel
))
1511 max
->peel_info
.npeel
= elem
->npeel
;
1512 max
->peel_info
.count
= elem
->count
;
1513 max
->peel_info
.dr_info
= elem
->dr_info
;
1519 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1520 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1521 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1525 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1526 dr_vec_info
*dr0_info
,
1527 unsigned int *inside_cost
,
1528 unsigned int *outside_cost
,
1529 stmt_vector_for_cost
*body_cost_vec
,
1530 stmt_vector_for_cost
*prologue_cost_vec
,
1533 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1535 bool dr0_alignment_known_p
1537 && known_alignment_for_access_p (dr0_info
,
1538 STMT_VINFO_VECTYPE (dr0_info
->stmt
)));
1540 for (data_reference
*dr
: datarefs
)
1542 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1543 if (!vect_relevant_for_alignment_p (dr_info
))
1546 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1547 dr_alignment_support alignment_support_scheme
;
1549 unsigned HOST_WIDE_INT alignment
;
1551 bool negative
= tree_int_cst_compare (DR_STEP (dr_info
->dr
),
1552 size_zero_node
) < 0;
1555 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1556 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1559 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1560 else if (dr_info
== dr0_info
1561 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
1563 else if (!dr0_alignment_known_p
1564 || !known_alignment_for_access_p (dr_info
, vectype
)
1565 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
1566 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
1569 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1570 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1571 misalignment
&= alignment
- 1;
1573 alignment_support_scheme
1574 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
1577 vect_get_data_access_cost (loop_vinfo
, dr_info
,
1578 alignment_support_scheme
, misalignment
,
1579 inside_cost
, outside_cost
,
1580 body_cost_vec
, prologue_cost_vec
);
1584 /* Traverse peeling hash table and calculate cost for each peeling option.
1585 Find the one with the lowest cost. */
1588 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1589 _vect_peel_extended_info
*min
)
1591 vect_peel_info elem
= *slot
;
1593 unsigned int inside_cost
= 0, outside_cost
= 0;
1594 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (min
->vinfo
);
1595 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1598 prologue_cost_vec
.create (2);
1599 body_cost_vec
.create (2);
1600 epilogue_cost_vec
.create (2);
1602 vect_get_peeling_costs_all_drs (loop_vinfo
, elem
->dr_info
, &inside_cost
,
1603 &outside_cost
, &body_cost_vec
,
1604 &prologue_cost_vec
, elem
->npeel
);
1606 body_cost_vec
.release ();
1608 outside_cost
+= vect_get_known_peeling_cost
1609 (loop_vinfo
, elem
->npeel
, &dummy
,
1610 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1611 &prologue_cost_vec
, &epilogue_cost_vec
);
1613 /* Prologue and epilogue costs are added to the target model later.
1614 These costs depend only on the scalar iteration cost, the
1615 number of peeling iterations finally chosen, and the number of
1616 misaligned statements. So discard the information found here. */
1617 prologue_cost_vec
.release ();
1618 epilogue_cost_vec
.release ();
1620 if (inside_cost
< min
->inside_cost
1621 || (inside_cost
== min
->inside_cost
1622 && outside_cost
< min
->outside_cost
))
1624 min
->inside_cost
= inside_cost
;
1625 min
->outside_cost
= outside_cost
;
1626 min
->peel_info
.dr_info
= elem
->dr_info
;
1627 min
->peel_info
.npeel
= elem
->npeel
;
1628 min
->peel_info
.count
= elem
->count
;
1635 /* Choose best peeling option by traversing peeling hash table and either
1636 choosing an option with the lowest cost (if cost model is enabled) or the
1637 option that aligns as many accesses as possible. */
1639 static struct _vect_peel_extended_info
1640 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1641 loop_vec_info loop_vinfo
)
1643 struct _vect_peel_extended_info res
;
1645 res
.peel_info
.dr_info
= NULL
;
1646 res
.vinfo
= loop_vinfo
;
1648 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1650 res
.inside_cost
= INT_MAX
;
1651 res
.outside_cost
= INT_MAX
;
1652 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1653 vect_peeling_hash_get_lowest_cost
> (&res
);
1657 res
.peel_info
.count
= 0;
1658 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1659 vect_peeling_hash_get_most_frequent
> (&res
);
1660 res
.inside_cost
= 0;
1661 res
.outside_cost
= 0;
1667 /* Return true if the new peeling NPEEL is supported. */
1670 vect_peeling_supportable (loop_vec_info loop_vinfo
, dr_vec_info
*dr0_info
,
1673 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1674 enum dr_alignment_support supportable_dr_alignment
;
1676 bool dr0_alignment_known_p
1677 = known_alignment_for_access_p (dr0_info
,
1678 STMT_VINFO_VECTYPE (dr0_info
->stmt
));
1680 /* Ensure that all data refs can be vectorized after the peel. */
1681 for (data_reference
*dr
: datarefs
)
1683 if (dr
== dr0_info
->dr
)
1686 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1687 if (!vect_relevant_for_alignment_p (dr_info
)
1688 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
1691 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1693 unsigned HOST_WIDE_INT alignment
;
1694 if (!dr0_alignment_known_p
1695 || !known_alignment_for_access_p (dr_info
, vectype
)
1696 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
1697 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
1700 misalignment
= dr_misalignment (dr_info
, vectype
);
1701 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1702 misalignment
&= alignment
- 1;
1704 supportable_dr_alignment
1705 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
1707 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
1714 /* Compare two data-references DRA and DRB to group them into chunks
1715 with related alignment. */
1718 dr_align_group_sort_cmp (const void *dra_
, const void *drb_
)
1720 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
1721 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
1724 /* Stabilize sort. */
1728 /* Ordering of DRs according to base. */
1729 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
1730 DR_BASE_ADDRESS (drb
));
1734 /* And according to DR_OFFSET. */
1735 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
1739 /* And after step. */
1740 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
1744 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1745 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
1747 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
1751 /* Function vect_enhance_data_refs_alignment
1753 This pass will use loop versioning and loop peeling in order to enhance
1754 the alignment of data references in the loop.
1756 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1757 original loop is to be vectorized. Any other loops that are created by
1758 the transformations performed in this pass - are not supposed to be
1759 vectorized. This restriction will be relaxed.
1761 This pass will require a cost model to guide it whether to apply peeling
1762 or versioning or a combination of the two. For example, the scheme that
1763 intel uses when given a loop with several memory accesses, is as follows:
1764 choose one memory access ('p') which alignment you want to force by doing
1765 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1766 other accesses are not necessarily aligned, or (2) use loop versioning to
1767 generate one loop in which all accesses are aligned, and another loop in
1768 which only 'p' is necessarily aligned.
1770 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1771 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1772 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1774 Devising a cost model is the most critical aspect of this work. It will
1775 guide us on which access to peel for, whether to use loop versioning, how
1776 many versions to create, etc. The cost model will probably consist of
1777 generic considerations as well as target specific considerations (on
1778 powerpc for example, misaligned stores are more painful than misaligned
1781 Here are the general steps involved in alignment enhancements:
1783 -- original loop, before alignment analysis:
1784 for (i=0; i<N; i++){
1785 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1786 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1789 -- After vect_compute_data_refs_alignment:
1790 for (i=0; i<N; i++){
1791 x = q[i]; # DR_MISALIGNMENT(q) = 3
1792 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1795 -- Possibility 1: we do loop versioning:
1797 for (i=0; i<N; i++){ # loop 1A
1798 x = q[i]; # DR_MISALIGNMENT(q) = 3
1799 p[i] = y; # DR_MISALIGNMENT(p) = 0
1803 for (i=0; i<N; i++){ # loop 1B
1804 x = q[i]; # DR_MISALIGNMENT(q) = 3
1805 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1809 -- Possibility 2: we do loop peeling:
1810 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1814 for (i = 3; i < N; i++){ # loop 2A
1815 x = q[i]; # DR_MISALIGNMENT(q) = 0
1816 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1819 -- Possibility 3: combination of loop peeling and versioning:
1820 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1825 for (i = 3; i<N; i++){ # loop 3A
1826 x = q[i]; # DR_MISALIGNMENT(q) = 0
1827 p[i] = y; # DR_MISALIGNMENT(p) = 0
1831 for (i = 3; i<N; i++){ # loop 3B
1832 x = q[i]; # DR_MISALIGNMENT(q) = 0
1833 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1837 These loops are later passed to loop_transform to be vectorized. The
1838 vectorizer will use the alignment information to guide the transformation
1839 (whether to generate regular loads/stores, or with special handling for
1843 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1845 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1846 dr_vec_info
*first_store
= NULL
;
1847 dr_vec_info
*dr0_info
= NULL
;
1848 struct data_reference
*dr
;
1850 bool do_peeling
= false;
1851 bool do_versioning
= false;
1852 unsigned int npeel
= 0;
1853 bool one_misalignment_known
= false;
1854 bool one_misalignment_unknown
= false;
1855 bool one_dr_unsupportable
= false;
1856 dr_vec_info
*unsupportable_dr_info
= NULL
;
1857 unsigned int dr0_same_align_drs
= 0, first_store_same_align_drs
= 0;
1858 hash_table
<peel_info_hasher
> peeling_htab (1);
1860 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1862 /* Reset data so we can safely be called multiple times. */
1863 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1864 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1866 if (LOOP_VINFO_DATAREFS (loop_vinfo
).is_empty ())
1867 return opt_result::success ();
1869 /* Sort the vector of datarefs so DRs that have the same or dependent
1870 alignment are next to each other. */
1871 auto_vec
<data_reference_p
> datarefs
1872 = LOOP_VINFO_DATAREFS (loop_vinfo
).copy ();
1873 datarefs
.qsort (dr_align_group_sort_cmp
);
1875 /* Compute the number of DRs that become aligned when we peel
1876 a dataref so it becomes aligned. */
1877 auto_vec
<unsigned> n_same_align_refs (datarefs
.length ());
1878 n_same_align_refs
.quick_grow_cleared (datarefs
.length ());
1880 for (i0
= 0; i0
< datarefs
.length (); ++i0
)
1881 if (DR_BASE_ADDRESS (datarefs
[i0
]))
1883 for (i
= i0
+ 1; i
<= datarefs
.length (); ++i
)
1885 if (i
== datarefs
.length ()
1886 || !operand_equal_p (DR_BASE_ADDRESS (datarefs
[i0
]),
1887 DR_BASE_ADDRESS (datarefs
[i
]), 0)
1888 || !operand_equal_p (DR_OFFSET (datarefs
[i0
]),
1889 DR_OFFSET (datarefs
[i
]), 0)
1890 || !operand_equal_p (DR_STEP (datarefs
[i0
]),
1891 DR_STEP (datarefs
[i
]), 0))
1893 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1894 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1895 will get known misalignment if we align one of the refs
1896 with the largest DR_TARGET_ALIGNMENT. */
1897 for (unsigned j
= i0
; j
< i
; ++j
)
1899 dr_vec_info
*dr_infoj
= loop_vinfo
->lookup_dr (datarefs
[j
]);
1900 for (unsigned k
= i0
; k
< i
; ++k
)
1904 dr_vec_info
*dr_infok
= loop_vinfo
->lookup_dr (datarefs
[k
]);
1905 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok
,
1907 n_same_align_refs
[j
]++;
1914 /* While cost model enhancements are expected in the future, the high level
1915 view of the code at this time is as follows:
1917 A) If there is a misaligned access then see if peeling to align
1918 this access can make all data references satisfy
1919 vect_supportable_dr_alignment. If so, update data structures
1920 as needed and return true.
1922 B) If peeling wasn't possible and there is a data reference with an
1923 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1924 then see if loop versioning checks can be used to make all data
1925 references satisfy vect_supportable_dr_alignment. If so, update
1926 data structures as needed and return true.
1928 C) If neither peeling nor versioning were successful then return false if
1929 any data reference does not satisfy vect_supportable_dr_alignment.
1931 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1933 Note, Possibility 3 above (which is peeling and versioning together) is not
1934 being done at this time. */
1936 /* (1) Peeling to force alignment. */
1938 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1940 + How many accesses will become aligned due to the peeling
1941 - How many accesses will become unaligned due to the peeling,
1942 and the cost of misaligned accesses.
1943 - The cost of peeling (the extra runtime checks, the increase
1946 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1948 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1949 if (!vect_relevant_for_alignment_p (dr_info
))
1952 stmt_vec_info stmt_info
= dr_info
->stmt
;
1953 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1954 do_peeling
= vector_alignment_reachable_p (dr_info
);
1957 if (known_alignment_for_access_p (dr_info
, vectype
))
1959 unsigned int npeel_tmp
= 0;
1960 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1961 size_zero_node
) < 0;
1963 /* If known_alignment_for_access_p then we have set
1964 DR_MISALIGNMENT which is only done if we know it at compiler
1965 time, so it is safe to assume target alignment is constant.
1967 unsigned int target_align
=
1968 DR_TARGET_ALIGNMENT (dr_info
).to_constant ();
1969 unsigned HOST_WIDE_INT dr_size
= vect_get_scalar_dr_size (dr_info
);
1972 off
= (TYPE_VECTOR_SUBPARTS (vectype
) - 1) * -dr_size
;
1973 unsigned int mis
= dr_misalignment (dr_info
, vectype
, off
);
1974 mis
= negative
? mis
: -mis
;
1976 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1978 /* For multiple types, it is possible that the bigger type access
1979 will have more than one peeling option. E.g., a loop with two
1980 types: one of size (vector size / 4), and the other one of
1981 size (vector size / 8). Vectorization factor will 8. If both
1982 accesses are misaligned by 3, the first one needs one scalar
1983 iteration to be aligned, and the second one needs 5. But the
1984 first one will be aligned also by peeling 5 scalar
1985 iterations, and in that case both accesses will be aligned.
1986 Hence, except for the immediate peeling amount, we also want
1987 to try to add full vector size, while we don't exceed
1988 vectorization factor.
1989 We do this automatically for cost model, since we calculate
1990 cost for every peeling option. */
1991 poly_uint64 nscalars
= npeel_tmp
;
1992 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1994 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1995 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1996 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1999 /* Save info about DR in the hash table. Also include peeling
2000 amounts according to the explanation above. Indicate
2001 the alignment status when the ref is not aligned.
2002 ??? Rather than using unknown alignment here we should
2003 prune all entries from the peeling hashtable which cause
2004 DRs to be not supported. */
2005 bool supportable_if_not_aligned
2006 = vect_supportable_dr_alignment
2007 (loop_vinfo
, dr_info
, vectype
, DR_MISALIGNMENT_UNKNOWN
);
2008 while (known_le (npeel_tmp
, nscalars
))
2010 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
2012 supportable_if_not_aligned
);
2013 npeel_tmp
+= MAX (1, target_align
/ dr_size
);
2016 one_misalignment_known
= true;
2020 /* If we don't know any misalignment values, we prefer
2021 peeling for data-ref that has the maximum number of data-refs
2022 with the same alignment, unless the target prefers to align
2023 stores over load. */
2024 unsigned same_align_drs
= n_same_align_refs
[i
];
2026 || dr0_same_align_drs
< same_align_drs
)
2028 dr0_same_align_drs
= same_align_drs
;
2031 /* For data-refs with the same number of related
2032 accesses prefer the one where the misalign
2033 computation will be invariant in the outermost loop. */
2034 else if (dr0_same_align_drs
== same_align_drs
)
2036 class loop
*ivloop0
, *ivloop
;
2037 ivloop0
= outermost_invariant_loop_for_expr
2038 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
2039 ivloop
= outermost_invariant_loop_for_expr
2040 (loop
, DR_BASE_ADDRESS (dr
));
2041 if ((ivloop
&& !ivloop0
)
2042 || (ivloop
&& ivloop0
2043 && flow_loop_nested_p (ivloop
, ivloop0
)))
2047 one_misalignment_unknown
= true;
2049 /* Check for data refs with unsupportable alignment that
2051 enum dr_alignment_support supportable_dr_alignment
2052 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2053 DR_MISALIGNMENT_UNKNOWN
);
2054 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2056 one_dr_unsupportable
= true;
2057 unsupportable_dr_info
= dr_info
;
2060 if (!first_store
&& DR_IS_WRITE (dr
))
2062 first_store
= dr_info
;
2063 first_store_same_align_drs
= same_align_drs
;
2069 if (!aligned_access_p (dr_info
, vectype
))
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2073 "vector alignment may not be reachable\n");
2079 /* Check if we can possibly peel the loop. */
2080 if (!vect_can_advance_ivs_p (loop_vinfo
)
2081 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
2085 struct _vect_peel_extended_info peel_for_known_alignment
;
2086 struct _vect_peel_extended_info peel_for_unknown_alignment
;
2087 struct _vect_peel_extended_info best_peel
;
2089 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
2090 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
2091 peel_for_unknown_alignment
.peel_info
.count
= 0;
2094 && one_misalignment_unknown
)
2096 /* Check if the target requires to prefer stores over loads, i.e., if
2097 misaligned stores are more expensive than misaligned loads (taking
2098 drs with same alignment into account). */
2099 unsigned int load_inside_cost
= 0;
2100 unsigned int load_outside_cost
= 0;
2101 unsigned int store_inside_cost
= 0;
2102 unsigned int store_outside_cost
= 0;
2103 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
2105 stmt_vector_for_cost dummy
;
2107 vect_get_peeling_costs_all_drs (loop_vinfo
, dr0_info
,
2110 &dummy
, &dummy
, estimated_npeels
);
2116 vect_get_peeling_costs_all_drs (loop_vinfo
, first_store
,
2118 &store_outside_cost
,
2125 store_inside_cost
= INT_MAX
;
2126 store_outside_cost
= INT_MAX
;
2129 if (load_inside_cost
> store_inside_cost
2130 || (load_inside_cost
== store_inside_cost
2131 && load_outside_cost
> store_outside_cost
))
2133 dr0_info
= first_store
;
2134 dr0_same_align_drs
= first_store_same_align_drs
;
2135 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
2136 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
2140 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
2141 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
2144 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2145 prologue_cost_vec
.create (2);
2146 epilogue_cost_vec
.create (2);
2149 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
2150 (loop_vinfo
, estimated_npeels
, &dummy2
,
2151 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2152 &prologue_cost_vec
, &epilogue_cost_vec
);
2154 prologue_cost_vec
.release ();
2155 epilogue_cost_vec
.release ();
2157 peel_for_unknown_alignment
.peel_info
.count
= dr0_same_align_drs
+ 1;
2160 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
2161 peel_for_unknown_alignment
.peel_info
.dr_info
= dr0_info
;
2163 best_peel
= peel_for_unknown_alignment
;
2165 peel_for_known_alignment
.inside_cost
= INT_MAX
;
2166 peel_for_known_alignment
.outside_cost
= INT_MAX
;
2167 peel_for_known_alignment
.peel_info
.count
= 0;
2168 peel_for_known_alignment
.peel_info
.dr_info
= NULL
;
2170 if (do_peeling
&& one_misalignment_known
)
2172 /* Peeling is possible, but there is no data access that is not supported
2173 unless aligned. So we try to choose the best possible peeling from
2175 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
2176 (&peeling_htab
, loop_vinfo
);
2179 /* Compare costs of peeling for known and unknown alignment. */
2180 if (peel_for_known_alignment
.peel_info
.dr_info
!= NULL
2181 && peel_for_unknown_alignment
.inside_cost
2182 >= peel_for_known_alignment
.inside_cost
)
2184 best_peel
= peel_for_known_alignment
;
2186 /* If the best peeling for known alignment has NPEEL == 0, perform no
2187 peeling at all except if there is an unsupportable dr that we can
2189 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
2193 /* If there is an unsupportable data ref, prefer this over all choices so far
2194 since we'd have to discard a chosen peeling except when it accidentally
2195 aligned the unsupportable data ref. */
2196 if (one_dr_unsupportable
)
2197 dr0_info
= unsupportable_dr_info
;
2198 else if (do_peeling
)
2200 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2201 TODO: Use nopeel_outside_cost or get rid of it? */
2202 unsigned nopeel_inside_cost
= 0;
2203 unsigned nopeel_outside_cost
= 0;
2205 stmt_vector_for_cost dummy
;
2207 vect_get_peeling_costs_all_drs (loop_vinfo
, NULL
, &nopeel_inside_cost
,
2208 &nopeel_outside_cost
, &dummy
, &dummy
, 0);
2211 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2212 costs will be recorded. */
2213 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2214 prologue_cost_vec
.create (2);
2215 epilogue_cost_vec
.create (2);
2218 nopeel_outside_cost
+= vect_get_known_peeling_cost
2219 (loop_vinfo
, 0, &dummy2
,
2220 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2221 &prologue_cost_vec
, &epilogue_cost_vec
);
2223 prologue_cost_vec
.release ();
2224 epilogue_cost_vec
.release ();
2226 npeel
= best_peel
.peel_info
.npeel
;
2227 dr0_info
= best_peel
.peel_info
.dr_info
;
2229 /* If no peeling is not more expensive than the best peeling we
2230 have so far, don't perform any peeling. */
2231 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
2237 stmt_vec_info stmt_info
= dr0_info
->stmt
;
2238 if (known_alignment_for_access_p (dr0_info
,
2239 STMT_VINFO_VECTYPE (stmt_info
)))
2241 bool negative
= tree_int_cst_compare (DR_STEP (dr0_info
->dr
),
2242 size_zero_node
) < 0;
2245 /* Since it's known at compile time, compute the number of
2246 iterations in the peeled loop (the peeling factor) for use in
2247 updating DR_MISALIGNMENT values. The peeling factor is the
2248 vectorization factor minus the misalignment as an element
2250 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2253 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2254 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2256 = dr_misalignment (dr0_info
, vectype
, off
);
2257 mis
= negative
? mis
: -mis
;
2258 /* If known_alignment_for_access_p then we have set
2259 DR_MISALIGNMENT which is only done if we know it at compiler
2260 time, so it is safe to assume target alignment is constant.
2262 unsigned int target_align
=
2263 DR_TARGET_ALIGNMENT (dr0_info
).to_constant ();
2264 npeel
= ((mis
& (target_align
- 1))
2265 / vect_get_scalar_dr_size (dr0_info
));
2268 /* For interleaved data access every iteration accesses all the
2269 members of the group, therefore we divide the number of iterations
2270 by the group size. */
2271 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2272 npeel
/= DR_GROUP_SIZE (stmt_info
);
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE
, vect_location
,
2276 "Try peeling by %d\n", npeel
);
2279 /* Ensure that all datarefs can be vectorized after the peel. */
2280 if (!vect_peeling_supportable (loop_vinfo
, dr0_info
, npeel
))
2283 /* Check if all datarefs are supportable and log. */
2286 && known_alignment_for_access_p (dr0_info
,
2287 STMT_VINFO_VECTYPE (stmt_info
)))
2288 return opt_result::success ();
2290 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2293 unsigned max_allowed_peel
2294 = param_vect_max_peeling_for_alignment
;
2295 if (loop_cost_model (loop
) <= VECT_COST_MODEL_CHEAP
)
2296 max_allowed_peel
= 0;
2297 if (max_allowed_peel
!= (unsigned)-1)
2299 unsigned max_peel
= npeel
;
2302 poly_uint64 target_align
= DR_TARGET_ALIGNMENT (dr0_info
);
2303 unsigned HOST_WIDE_INT target_align_c
;
2304 if (target_align
.is_constant (&target_align_c
))
2306 target_align_c
/ vect_get_scalar_dr_size (dr0_info
) - 1;
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE
, vect_location
,
2312 "Disable peeling, max peels set and vector"
2313 " alignment unknown\n");
2316 if (max_peel
> max_allowed_peel
)
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE
, vect_location
,
2321 "Disable peeling, max peels reached: %d\n", max_peel
);
2326 /* Cost model #2 - if peeling may result in a remaining loop not
2327 iterating enough to be vectorized then do not peel. Since this
2328 is a cost heuristic rather than a correctness decision, use the
2329 most likely runtime value for variable vectorization factors. */
2331 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2333 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2334 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2335 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2336 < assumed_vf
+ max_peel
)
2342 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2343 If the misalignment of DR_i is identical to that of dr0 then set
2344 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2345 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2346 by the peeling factor times the element size of DR_i (MOD the
2347 vectorization factor times the size). Otherwise, the
2348 misalignment of DR_i must be set to unknown. */
2349 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2350 if (dr
!= dr0_info
->dr
)
2352 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2353 if (!vect_relevant_for_alignment_p (dr_info
))
2356 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
2359 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0_info
;
2361 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2363 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = -1;
2364 SET_DR_MISALIGNMENT (dr0_info
,
2365 vect_dr_misalign_for_aligned_access (dr0_info
));
2366 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE
, vect_location
,
2369 "Alignment of access forced using peeling.\n");
2370 dump_printf_loc (MSG_NOTE
, vect_location
,
2371 "Peeling for alignment will be applied.\n");
2374 /* The inside-loop cost will be accounted for in vectorizable_load
2375 and vectorizable_store correctly with adjusted alignments.
2376 Drop the body_cst_vec on the floor here. */
2377 return opt_result::success ();
2381 /* (2) Versioning to force alignment. */
2383 /* Try versioning if:
2384 1) optimize loop for speed and the cost-model is not cheap
2385 2) there is at least one unsupported misaligned data ref with an unknown
2387 3) all misaligned data refs with a known misalignment are supported, and
2388 4) the number of runtime alignment checks is within reason. */
2391 = (optimize_loop_nest_for_speed_p (loop
)
2392 && !loop
->inner
/* FORNOW */
2393 && loop_cost_model (loop
) > VECT_COST_MODEL_CHEAP
);
2397 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2399 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2400 if (!vect_relevant_for_alignment_p (dr_info
))
2403 stmt_vec_info stmt_info
= dr_info
->stmt
;
2404 if (STMT_VINFO_STRIDED_P (stmt_info
))
2406 do_versioning
= false;
2410 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2411 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
2412 size_zero_node
) < 0;
2415 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2416 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2418 if ((misalignment
= dr_misalignment (dr_info
, vectype
, off
)) == 0)
2421 enum dr_alignment_support supportable_dr_alignment
2422 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2424 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2426 if (misalignment
!= DR_MISALIGNMENT_UNKNOWN
2427 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2428 >= (unsigned) param_vect_max_version_for_alignment_checks
))
2430 do_versioning
= false;
2434 /* At present we don't support versioning for alignment
2435 with variable VF, since there's no guarantee that the
2436 VF is a power of two. We could relax this if we added
2437 a way of enforcing a power-of-two size. */
2438 unsigned HOST_WIDE_INT size
;
2439 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2441 do_versioning
= false;
2445 /* Forcing alignment in the first iteration is no good if
2446 we don't keep it across iterations. For now, just disable
2447 versioning in this case.
2448 ?? We could actually unroll the loop to achieve the required
2449 overall step alignment, and forcing the alignment could be
2450 done by doing some iterations of the non-vectorized loop. */
2451 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
2452 * DR_STEP_ALIGNMENT (dr
),
2453 DR_TARGET_ALIGNMENT (dr_info
)))
2455 do_versioning
= false;
2459 /* The rightmost bits of an aligned address must be zeros.
2460 Construct the mask needed for this test. For example,
2461 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2462 mask must be 15 = 0xf. */
2463 int mask
= size
- 1;
2465 /* FORNOW: use the same mask to test all potentially unaligned
2466 references in the loop. */
2467 if (LOOP_VINFO_PTR_MASK (loop_vinfo
)
2468 && LOOP_VINFO_PTR_MASK (loop_vinfo
) != mask
)
2470 do_versioning
= false;
2474 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2475 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (stmt_info
);
2479 /* Versioning requires at least one misaligned data reference. */
2480 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2481 do_versioning
= false;
2482 else if (!do_versioning
)
2483 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2488 const vec
<stmt_vec_info
> &may_misalign_stmts
2489 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2490 stmt_vec_info stmt_info
;
2492 /* It can now be assumed that the data references in the statements
2493 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2494 of the loop being vectorized. */
2495 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt_info
)
2497 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
2498 SET_DR_MISALIGNMENT (dr_info
,
2499 vect_dr_misalign_for_aligned_access (dr_info
));
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_NOTE
, vect_location
,
2502 "Alignment of access forced using versioning.\n");
2505 if (dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE
, vect_location
,
2507 "Versioning for alignment will be applied.\n");
2509 /* Peeling and versioning can't be done together at this time. */
2510 gcc_assert (! (do_peeling
&& do_versioning
));
2512 return opt_result::success ();
2515 /* This point is reached if neither peeling nor versioning is being done. */
2516 gcc_assert (! (do_peeling
|| do_versioning
));
2518 return opt_result::success ();
2522 /* Function vect_analyze_data_refs_alignment
2524 Analyze the alignment of the data-references in the loop.
2525 Return FALSE if a data reference is found that cannot be vectorized. */
/* NOTE(review): this block is an extraction-mangled rendering of the original
   source; all tokens (including the stray original-line-number prefixes) are
   preserved verbatim, continuation fragments were only rejoined for
   readability, and comments were added.  Brace/blank lines of the original
   were dropped by the extraction.  */
2528 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo )
2530 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
/* All data references collected for this loop; per-DR alignment state lives
   in the dr_vec_info looked up from LOOP_VINFO below.  */
2532 vec <data_reference_p > datarefs = LOOP_VINFO_DATAREFS (loop_vinfo );
2533 struct data_reference *dr ;
/* Record base-object alignment information first, then compute the
   alignment of each relevant DR against its statement's vector type.  */
2536 vect_record_base_alignments (loop_vinfo );
2537 FOR_EACH_VEC_ELT (datarefs , i , dr )
2539 dr_vec_info *dr_info = loop_vinfo ->lookup_dr (dr );
2540 if (STMT_VINFO_VECTORIZABLE (dr_info ->stmt ))
/* For grouped accesses only the group leader is analyzed here; non-leading
   group members are apparently skipped (the skip statement itself was
   dropped by the extraction) -- TODO confirm against the unmangled source.  */
2542 if (STMT_VINFO_GROUPED_ACCESS (dr_info ->stmt )
2543 && DR_GROUP_FIRST_ELEMENT (dr_info ->stmt ) != dr_info ->stmt )
2545 vect_compute_data_ref_alignment (loop_vinfo , dr_info ,
2546 STMT_VINFO_VECTYPE (dr_info ->stmt ));
2550 return opt_result::success ();
2554 /* Analyze alignment of DRs of stmts in NODE. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  */
2557 vect_slp_analyze_node_alignment (vec_info *vinfo , slp_tree node )
2559 /* Alignment is maintained in the first element of the group. */
2560 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node )[0];
2561 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info );
2562 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info );
2563 tree vectype = SLP_TREE_VECTYPE (node );
/* The target's preferred alignment for VECTYPE, converted via exact_div to
   the units dr_info->target_alignment uses (the divisor line was dropped by
   the extraction -- presumably BITS_PER_UNIT; TODO confirm).  */
2564 poly_uint64 vector_alignment
2565 = exact_div (targetm .vectorize .preferred_vector_alignment (vectype ),
/* First time this DR is seen: compute its alignment from scratch.  */
2567 if (dr_info ->misalignment == DR_MISALIGNMENT_UNINITIALIZED )
2568 vect_compute_data_ref_alignment (vinfo , dr_info , SLP_TREE_VECTYPE (node ));
2569 /* Re-analyze alignment when we're facing a vectorization with a bigger
2570 alignment requirement. */
2571 else if (known_lt (dr_info ->target_alignment , vector_alignment ))
/* Save the previous (smaller-requirement) result so it can be restored if
   re-analysis against the larger requirement only loses information.  */
2573 poly_uint64 old_target_alignment = dr_info ->target_alignment ;
2574 int old_misalignment = dr_info ->misalignment ;
2575 vect_compute_data_ref_alignment (vinfo , dr_info , SLP_TREE_VECTYPE (node ));
2576 /* But keep knowledge about a smaller alignment. */
2577 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2578 && dr_info ->misalignment == DR_MISALIGNMENT_UNKNOWN )
2580 dr_info ->target_alignment = old_target_alignment ;
2581 dr_info ->misalignment = old_misalignment ;
2584 /* When we ever face unordered target alignments the first one wins in terms
2585 of analyzing and the other will become unknown in dr_misalignment. */
2589 /* Function vect_slp_analyze_instance_alignment
2591 Analyze the alignment of the data-references in the SLP instance.
2592 Return FALSE if a data reference is found that cannot be vectorized. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  The return statements of the original
   were dropped by the extraction.  */
2595 vect_slp_analyze_instance_alignment (vec_info *vinfo ,
2596 slp_instance instance )
2598 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
/* Analyze every load node of the instance...  */
2602 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance ), i , node )
2603 if (! vect_slp_analyze_node_alignment (vinfo , node ))
/* ...and, for store instances, the store node at the instance root.  */
2606 if (SLP_INSTANCE_KIND (instance ) == slp_inst_kind_store
2607 && ! vect_slp_analyze_node_alignment
2608 (vinfo , SLP_INSTANCE_TREE (instance )))
2615 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2616 accesses of legal size, step, etc. Detect gaps, single element
2617 interleaving, and other special cases. Set grouped access info.
2618 Collect groups of strided stores for further use in SLP analysis.
2619 Worker for vect_analyze_group_access. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  Brace/blank/short lines of the
   original were dropped by the extraction, so some control flow between
   the statements below is not visible here.  */
2622 vect_analyze_group_access_1 (vec_info *vinfo , dr_vec_info *dr_info )
2624 data_reference *dr = dr_info ->dr ;
2625 tree step = DR_STEP (dr );
2626 tree scalar_type = TREE_TYPE (DR_REF (dr ));
2627 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type ));
2628 stmt_vec_info stmt_info = dr_info ->stmt ;
2629 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info > (vinfo );
2630 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info > (vinfo );
2631 HOST_WIDE_INT dr_step = -1;
2632 HOST_WIDE_INT groupsize , last_accessed_element = 1;
/* Set when a gap is found inside the group; gaps rule out SLP of stores.  */
2633 bool slp_impossible = false;
2635 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2636 size of the interleaving group (including gaps). */
2637 if (tree_fits_shwi_p (step ))
2639 dr_step = tree_to_shwi (step );
2640 /* Check that STEP is a multiple of type size. Otherwise there is
2641 a non-element-sized gap at the end of the group which we
2642 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2643 ??? As we can handle non-constant step fine here we should
2644 simply remove uses of DR_GROUP_GAP between the last and first
2645 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2646 simply not include that gap. */
2647 if ((dr_step % type_size ) != 0)
2649 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE , vect_location ,
2651 "Step %T is not a multiple of the element size"
2656 groupsize = absu_hwi (dr_step ) / type_size ;
2661 /* Not consecutive access is possible only if it is a part of interleaving. */
2662 if (!DR_GROUP_FIRST_ELEMENT (stmt_info ))
2664 /* Check if it this DR is a part of interleaving, and is a single
2665 element of the group that is accessed in the loop. */
2667 /* Gaps are supported only for loads. STEP must be a multiple of the type
2670 && (dr_step % type_size ) == 0
2672 /* This could be UINT_MAX but as we are generating code in a very
2673 inefficient way we have to cap earlier.
2674 See PR91403 for example. */
2675 && groupsize <= 4096)
/* Single-element interleaving: the stmt forms its own group of size
   GROUPSIZE, with all other GROUPSIZE-1 elements being gap.  */
2677 DR_GROUP_FIRST_ELEMENT (stmt_info ) = stmt_info ;
2678 DR_GROUP_SIZE (stmt_info ) = groupsize ;
2679 DR_GROUP_GAP (stmt_info ) = groupsize - 1;
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE , vect_location ,
2682 "Detected single element interleaving %T"
2689 if (dump_enabled_p ())
2690 dump_printf_loc (MSG_MISSED_OPTIMIZATION , vect_location ,
2691 "not consecutive access %G", stmt_info ->stmt );
2695 /* Mark the statement as unvectorizable. */
2696 STMT_VINFO_VECTORIZABLE (stmt_info ) = false;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE , vect_location , "using strided accesses\n");
2702 STMT_VINFO_STRIDED_P (stmt_info ) = true;
/* STMT_INFO is the group leader: walk the DR_GROUP_NEXT_ELEMENT chain to
   validate the group, record per-member gaps and compute the group size.  */
2706 if (DR_GROUP_FIRST_ELEMENT (stmt_info ) == stmt_info )
2708 /* First stmt in the interleaving chain. Check the chain. */
2709 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info );
2710 struct data_reference *data_ref = dr ;
2711 unsigned int count = 1;
2712 tree prev_init = DR_INIT (data_ref );
2713 HOST_WIDE_INT diff , gaps = 0;
2715 /* By construction, all group members have INTEGER_CST DR_INITs. */
2718 /* We never have the same DR multiple times. */
2719 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref ),
2720 DR_INIT (STMT_VINFO_DATA_REF (next ))) != 0);
2722 data_ref = STMT_VINFO_DATA_REF (next );
2724 /* All group members have the same STEP by construction. */
2725 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref ), step , 0));
2727 /* Check that the distance between two accesses is equal to the type
2728 size. Otherwise, we have gaps. */
2729 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref ))
2730 - TREE_INT_CST_LOW (prev_init )) / type_size ;
2731 if (diff < 1 || diff > UINT_MAX )
2733 /* For artificial testcases with array accesses with large
2734 constant indices we can run into overflow issues which
2735 can end up fooling the groupsize constraint below so
2736 check the individual gaps (which are represented as
2737 unsigned int) as well. */
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_MISSED_OPTIMIZATION , vect_location ,
2740 "interleaved access with gap larger "
2741 "than representable\n");
2746 /* FORNOW: SLP of accesses with gaps is not supported. */
2747 slp_impossible = true;
2748 if (DR_IS_WRITE (data_ref ))
2750 if (dump_enabled_p ())
2751 dump_printf_loc (MSG_MISSED_OPTIMIZATION , vect_location ,
2752 "interleaved store with gaps\n");
2759 last_accessed_element += diff ;
2761 /* Store the gap from the previous member of the group. If there is no
2762 gap in the access, DR_GROUP_GAP is always 1. */
2763 DR_GROUP_GAP (next ) = diff ;
2765 prev_init = DR_INIT (data_ref );
2766 next = DR_GROUP_NEXT_ELEMENT (next );
2767 /* Count the number of data-refs in the chain. */
/* Group size is the member count plus the accumulated inner gaps.  */
2772 groupsize = count + gaps ;
2774 /* This could be UINT_MAX but as we are generating code in a very
2775 inefficient way we have to cap earlier. See PR78699 for example. */
2776 if (groupsize > 4096)
2778 if (dump_enabled_p ())
2779 dump_printf_loc (MSG_MISSED_OPTIMIZATION , vect_location ,
2780 "group is too large\n");
2784 /* Check that the size of the interleaving is equal to count for stores,
2785 i.e., that there are no gaps. */
2786 if (groupsize != count
2787 && !DR_IS_READ (dr ))
2790 STMT_VINFO_STRIDED_P (stmt_info ) = true;
2793 /* If there is a gap after the last load in the group it is the
2794 difference between the groupsize and the last accessed
2796 When there is no gap, this difference should be 0. */
2797 DR_GROUP_GAP (stmt_info ) = groupsize - last_accessed_element ;
2799 DR_GROUP_SIZE (stmt_info ) = groupsize ;
/* Dump the detected group, including inner and trailing gaps.  */
2800 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_NOTE , vect_location ,
2803 "Detected interleaving ");
2804 if (DR_IS_READ (dr ))
2805 dump_printf (MSG_NOTE , "load ");
2806 else if (STMT_VINFO_STRIDED_P (stmt_info ))
2807 dump_printf (MSG_NOTE , "strided store ");
2809 dump_printf (MSG_NOTE , "store ");
2810 dump_printf (MSG_NOTE , "of size %u\n",
2811 (unsigned)groupsize );
2812 dump_printf_loc (MSG_NOTE , vect_location , "\t%G", stmt_info ->stmt );
2813 next = DR_GROUP_NEXT_ELEMENT (stmt_info );
2816 if (DR_GROUP_GAP (next ) != 1)
2817 dump_printf_loc (MSG_NOTE , vect_location ,
2818 "\t<gap of %d elements>\n",
2819 DR_GROUP_GAP (next ) - 1);
2820 dump_printf_loc (MSG_NOTE , vect_location , "\t%G", next ->stmt );
2821 next = DR_GROUP_NEXT_ELEMENT (next );
2823 if (DR_GROUP_GAP (stmt_info ) != 0)
2824 dump_printf_loc (MSG_NOTE , vect_location ,
2825 "\t<gap of %d elements>\n",
2826 DR_GROUP_GAP (stmt_info ));
2829 /* SLP: create an SLP data structure for every interleaving group of
2830 stores for further analysis in vect_analyse_slp. */
2831 if (DR_IS_WRITE (dr ) && !slp_impossible )
2834 LOOP_VINFO_GROUPED_STORES (loop_vinfo ).safe_push (stmt_info );
2836 BB_VINFO_GROUPED_STORES (bb_vinfo ).safe_push (stmt_info );
2843 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2844 accesses of legal size, step, etc. Detect gaps, single element
2845 interleaving, and other special cases. Set grouped access info.
2846 Collect groups of strided stores for further use in SLP analysis. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  Wrapper around the _1 worker that,
   on failure, dissolves the group by clearing the group links.  */
2849 vect_analyze_group_access (vec_info *vinfo , dr_vec_info *dr_info )
2851 if (!vect_analyze_group_access_1 (vinfo , dr_info ))
2853 /* Dissolve the group if present. */
2854 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info ->stmt );
/* Walk the chain, unlinking each member (the loop header line was dropped
   by the extraction).  */
2857 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info );
2858 DR_GROUP_FIRST_ELEMENT (stmt_info ) = NULL ;
2859 DR_GROUP_NEXT_ELEMENT (stmt_info ) = NULL ;
2867 /* Analyze the access pattern of the data-reference DR_INFO.
2868 In case of non-consecutive accesses call vect_analyze_group_access() to
2869 analyze groups of accesses. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  */
2872 vect_analyze_data_ref_access (vec_info *vinfo , dr_vec_info *dr_info )
2874 data_reference *dr = dr_info ->dr ;
2875 tree step = DR_STEP (dr );
2876 tree scalar_type = TREE_TYPE (DR_REF (dr ));
2877 stmt_vec_info stmt_info = dr_info ->stmt ;
2878 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info > (vinfo );
2879 class loop *loop = NULL ;
/* Gather/scatter accesses are handled elsewhere (the early return was
   dropped by the extraction).  */
2881 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info ))
2885 loop = LOOP_VINFO_LOOP (loop_vinfo );
/* A loop DR without a step cannot be analyzed.  */
2887 if (loop_vinfo && !step )
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION , vect_location ,
2891 "bad data-ref access in loop\n");
2895 /* Allow loads with zero step in inner-loop vectorization. */
2896 if (loop_vinfo && integer_zerop (step ))
2898 DR_GROUP_FIRST_ELEMENT (stmt_info ) = NULL ;
2899 if (!nested_in_vect_loop_p (loop , stmt_info ))
2900 return DR_IS_READ (dr );
2901 /* Allow references with zero step for outer loops marked
2902 with pragma omp simd only - it guarantees absence of
2903 loop-carried dependencies between inner loop iterations. */
2904 if (loop ->safelen < 2)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_NOTE , vect_location ,
2908 "zero step in inner loop of nest\n");
2913 if (loop && nested_in_vect_loop_p (loop , stmt_info ))
2915 /* Interleaved accesses are not yet supported within outer-loop
2916 vectorization for references in the inner-loop. */
2917 DR_GROUP_FIRST_ELEMENT (stmt_info ) = NULL ;
2919 /* For the rest of the analysis we use the outer-loop step. */
2920 step = STMT_VINFO_DR_STEP (stmt_info );
2921 if (integer_zerop (step ))
2923 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_NOTE , vect_location ,
2925 "zero step in outer loop.\n");
2926 return DR_IS_READ (dr );
/* Consecutive access: step equals the element size (or its negation).  */
2931 if (TREE_CODE (step ) == INTEGER_CST )
2933 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step );
2934 if (!tree_int_cst_compare (step , TYPE_SIZE_UNIT (scalar_type ))
2936 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type ), -dr_step )))
2938 /* Mark that it is not interleaving. */
2939 DR_GROUP_FIRST_ELEMENT (stmt_info ) = NULL ;
2944 if (loop && nested_in_vect_loop_p (loop , stmt_info ))
2946 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_NOTE , vect_location ,
2948 "grouped access in outer loop.\n");
2953 /* Assume this is a DR handled by non-constant strided load case. */
2954 if (TREE_CODE (step ) != INTEGER_CST )
2955 return (STMT_VINFO_STRIDED_P (stmt_info )
2956 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info )
2957 || vect_analyze_group_access (vinfo , dr_info )));
2959 /* Not consecutive access - check if it's a part of interleaving group. */
2960 return vect_analyze_group_access (vinfo , dr_info );
2963 /* Compare two data-references DRA and DRB to group them into chunks
2964 suitable for grouping. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  qsort comparator: returns -1/0/1.
   The early-return-on-nonzero-CMP lines after each comparison were dropped
   by the extraction.  */
2967 dr_group_sort_cmp (const void *dra_ , const void *drb_ )
2969 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_ );
2970 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_ );
2971 data_reference_p dra = dra_info ->dr ;
2972 data_reference_p drb = drb_info ->dr ;
2975 /* Stabilize sort. */
2979 /* Different group IDs lead never belong to the same group. */
2980 if (dra_info ->group != drb_info ->group )
2981 return dra_info ->group < drb_info ->group ? -1 : 1;
2983 /* Ordering of DRs according to base. */
2984 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra ),
2985 DR_BASE_ADDRESS (drb ));
2989 /* And according to DR_OFFSET. */
2990 cmp = data_ref_compare_tree (DR_OFFSET (dra ), DR_OFFSET (drb ));
2994 /* Put reads before writes. */
2995 if (DR_IS_READ (dra ) != DR_IS_READ (drb ))
2996 return DR_IS_READ (dra ) ? -1 : 1;
2998 /* Then sort after access size. */
2999 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra ))),
3000 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb ))));
3004 /* And after step. */
3005 cmp = data_ref_compare_tree (DR_STEP (dra ), DR_STEP (drb ));
3009 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3010 cmp = data_ref_compare_tree (DR_INIT (dra ), DR_INIT (drb ));
/* Final tie-break: statement UID keeps the sort deterministic.  */
3012 return gimple_uid (DR_STMT (dra )) < gimple_uid (DR_STMT (drb )) ? -1 : 1;
3016 /* If OP is the result of a conversion, return the unconverted value,
3017 otherwise return null. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  The null-returning statements after
   the two guards were dropped by the extraction.  */
3020 strip_conversion (tree op )
/* Only SSA names can have a defining conversion statement.  */
3022 if (TREE_CODE (op ) != SSA_NAME )
3024 gimple *stmt = SSA_NAME_DEF_STMT (op );
/* Must be an assignment whose code is a conversion (NOP/CONVERT).  */
3025 if (!is_gimple_assign (stmt )
3026 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt )))
3028 return gimple_assign_rhs1 (stmt );
3031 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3032 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3033 be grouped in SLP mode. */
/* NOTE(review): extraction-mangled text; tokens preserved verbatim, line
   fragments rejoined, comments added.  The return-false lines after several
   guards were dropped by the extraction.  */
3036 can_group_stmts_p (stmt_vec_info stmt1_info , stmt_vec_info stmt2_info ,
/* Two plain (non-call) assignments can always be grouped together.  */
3039 if (gimple_assign_single_p (stmt1_info ->stmt ))
3040 return gimple_assign_single_p (stmt2_info ->stmt );
3042 gcall *call1 = dyn_cast <gcall *> (stmt1_info ->stmt );
3043 if (call1 && gimple_call_internal_p (call1 ))
3045 /* Check for two masked loads or two masked stores. */
3046 gcall *call2 = dyn_cast <gcall *> (stmt2_info ->stmt );
3047 if (!call2 || !gimple_call_internal_p (call2 ))
3049 internal_fn ifn = gimple_call_internal_fn (call1 );
3050 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE )
3052 if (ifn != gimple_call_internal_fn (call2 ))
3055 /* Check that the masks are the same. Cope with casts of masks,
3056 like those created by build_mask_conversion. */
3057 tree mask1 = gimple_call_arg (call1 , 2);
3058 tree mask2 = gimple_call_arg (call2 , 2);
/* Outside SLP mode the masks must match (possibly after stripping a
   conversion, see strip_conversion above).  */
3059 if (!operand_equal_p (mask1 , mask2 , 0) && !allow_slp_p )
3061 mask1 = strip_conversion (mask1 );
3064 mask2 = strip_conversion (mask2 );
3067 if (!operand_equal_p (mask1 , mask2 , 0))
3076 /* Function vect_analyze_data_ref_accesses.
3078 Analyze the access pattern of all the data references in the loop.
3080 FORNOW: the only access pattern that is considered vectorizable is a
3081 simple step 1 (consecutive) access.
3083 FORNOW: handle only arrays and pointer accesses. */
3086 vect_analyze_data_ref_accesses (vec_info
*vinfo
,
3087 vec
<int> *dataref_groups
)
3090 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
3092 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3094 if (datarefs
.is_empty ())
3095 return opt_result::success ();
3097 /* Sort the array of datarefs to make building the interleaving chains
3098 linear. Don't modify the original vector's order, it is needed for
3099 determining what dependencies are reversed. */
3100 vec
<dr_vec_info
*> datarefs_copy
;
3101 datarefs_copy
.create (datarefs
.length ());
3102 for (unsigned i
= 0; i
< datarefs
.length (); i
++)
3104 dr_vec_info
*dr_info
= vinfo
->lookup_dr (datarefs
[i
]);
3105 /* If the caller computed DR grouping use that, otherwise group by
3108 dr_info
->group
= (*dataref_groups
)[i
];
3110 dr_info
->group
= gimple_bb (DR_STMT (datarefs
[i
]))->index
;
3111 datarefs_copy
.quick_push (dr_info
);
3113 datarefs_copy
.qsort (dr_group_sort_cmp
);
3114 hash_set
<stmt_vec_info
> to_fixup
;
3116 /* Build the interleaving chains. */
3117 for (i
= 0; i
< datarefs_copy
.length () - 1;)
3119 dr_vec_info
*dr_info_a
= datarefs_copy
[i
];
3120 data_reference_p dra
= dr_info_a
->dr
;
3121 int dra_group_id
= dr_info_a
->group
;
3122 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
3123 stmt_vec_info lastinfo
= NULL
;
3124 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
3125 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
3130 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
3132 dr_vec_info
*dr_info_b
= datarefs_copy
[i
];
3133 data_reference_p drb
= dr_info_b
->dr
;
3134 int drb_group_id
= dr_info_b
->group
;
3135 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
3136 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
3137 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
3140 /* ??? Imperfect sorting (non-compatible types, non-modulo
3141 accesses, same accesses) can lead to a group to be artificially
3142 split here as we don't just skip over those. If it really
3143 matters we can push those to a worklist and re-iterate
3144 over them. The we can just skip ahead to the next DR here. */
3146 /* DRs in a different DR group should not be put into the same
3147 interleaving group. */
3148 if (dra_group_id
!= drb_group_id
)
3151 /* Check that the data-refs have same first location (except init)
3152 and they are both either store or load (not load and store,
3153 not masked loads or stores). */
3154 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
3155 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
3156 DR_BASE_ADDRESS (drb
)) != 0
3157 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
3158 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3161 /* Check that the data-refs have the same constant size. */
3162 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3163 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3164 if (!tree_fits_uhwi_p (sza
)
3165 || !tree_fits_uhwi_p (szb
)
3166 || !tree_int_cst_equal (sza
, szb
))
3169 /* Check that the data-refs have the same step. */
3170 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3173 /* Check the types are compatible.
3174 ??? We don't distinguish this during sorting. */
3175 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3176 TREE_TYPE (DR_REF (drb
))))
3179 /* Check that the DR_INITs are compile-time constants. */
3180 if (!tree_fits_shwi_p (DR_INIT (dra
))
3181 || !tree_fits_shwi_p (DR_INIT (drb
)))
3184 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3185 just hold extra information. */
3186 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3187 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3188 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3191 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3192 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3193 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3194 HOST_WIDE_INT init_prev
3195 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]->dr
));
3196 gcc_assert (init_a
<= init_b
3197 && init_a
<= init_prev
3198 && init_prev
<= init_b
);
3200 /* Do not place the same access in the interleaving chain twice. */
3201 if (init_b
== init_prev
)
3203 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]->dr
))
3204 < gimple_uid (DR_STMT (drb
)));
3205 /* Simply link in duplicates and fix up the chain below. */
3209 /* If init_b == init_a + the size of the type * k, we have an
3210 interleaving, and DRA is accessed before DRB. */
3211 unsigned HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3212 if (type_size_a
== 0
3213 || (((unsigned HOST_WIDE_INT
)init_b
- init_a
)
3214 % type_size_a
!= 0))
3217 /* If we have a store, the accesses are adjacent. This splits
3218 groups into chunks we support (we don't support vectorization
3219 of stores with gaps). */
3220 if (!DR_IS_READ (dra
)
3221 && (((unsigned HOST_WIDE_INT
)init_b
- init_prev
)
3225 /* If the step (if not zero or non-constant) is smaller than the
3226 difference between data-refs' inits this splits groups into
3228 if (tree_fits_shwi_p (DR_STEP (dra
)))
3230 unsigned HOST_WIDE_INT step
3231 = absu_hwi (tree_to_shwi (DR_STEP (dra
)));
3233 && step
<= ((unsigned HOST_WIDE_INT
)init_b
- init_a
))
3238 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE
, vect_location
,
3241 ? "Detected interleaving load %T and %T\n"
3242 : "Detected interleaving store %T and %T\n",
3243 DR_REF (dra
), DR_REF (drb
));
3245 /* Link the found element into the group list. */
3246 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3248 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3249 lastinfo
= stmtinfo_a
;
3251 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3252 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3253 lastinfo
= stmtinfo_b
;
3255 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3256 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3258 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3259 dump_printf_loc (MSG_NOTE
, vect_location
,
3260 "Load suitable for SLP vectorization only.\n");
3262 if (init_b
== init_prev
3263 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3264 && dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE
, vect_location
,
3266 "Queuing group with duplicate access for fixup\n");
3270 /* Fixup groups with duplicate entries by splitting it. */
3273 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3274 if (!(it
!= to_fixup
.end ()))
3276 stmt_vec_info grp
= *it
;
3277 to_fixup
.remove (grp
);
3279 /* Find the earliest duplicate group member. */
3280 unsigned first_duplicate
= -1u;
3281 stmt_vec_info next
, g
= grp
;
3282 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3284 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3285 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3286 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3287 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3290 if (first_duplicate
== -1U)
3293 /* Then move all stmts after the first duplicate to a new group.
3294 Note this is a heuristic but one with the property that *it
3295 is fixed up completely. */
3297 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3298 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3300 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3302 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3306 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3308 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3311 g
= DR_GROUP_NEXT_ELEMENT (g
);
3313 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3315 /* Fixup the new group which still may contain duplicates. */
3316 to_fixup
.add (newgroup
);
3319 dr_vec_info
*dr_info
;
3320 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr_info
)
3322 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3323 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3325 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3327 "not vectorized: complicated access pattern.\n");
3329 if (is_a
<bb_vec_info
> (vinfo
))
3331 /* Mark the statement as not vectorizable. */
3332 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3337 datarefs_copy
.release ();
3338 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3340 " complicated access pattern.\n");
3345 datarefs_copy
.release ();
3346 return opt_result::success ();
3349 /* Function vect_vfa_segment_size.
3352 DR_INFO: The data reference.
3353 LENGTH_FACTOR: segment length to consider.
3355 Return a value suitable for the dr_with_seg_len::seg_len field.
3356 This is the "distance travelled" by the pointer from the first
3357 iteration in the segment to the last. Note that it does not include
3358 the size of the access; in effect it only describes the first byte. */
3361 vect_vfa_segment_size (dr_vec_info
*dr_info
, tree length_factor
)
3363 length_factor
= size_binop (MINUS_EXPR
,
3364 fold_convert (sizetype
, length_factor
),
3366 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr_info
->dr
)),
3370 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3371 gives the worst-case number of bytes covered by the segment. */
3373 static unsigned HOST_WIDE_INT
3374 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3376 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3377 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3378 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3379 unsigned HOST_WIDE_INT access_size
= ref_size
;
3380 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3382 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3383 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3385 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3387 if (STMT_VINFO_VEC_STMTS (stmt_vinfo
).exists ()
3388 && ((misalignment
= dr_misalignment (dr_info
, vectype
)), true)
3389 && (vect_supportable_dr_alignment (vinfo
, dr_info
, vectype
, misalignment
)
3390 == dr_explicit_realign_optimized
))
3392 /* We might access a full vector's worth. */
3393 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3398 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3402 vect_vfa_align (dr_vec_info
*dr_info
)
3404 return dr_alignment (dr_info
->dr
);
3407 /* Function vect_no_alias_p.
3409 Given data references A and B with equal base and offset, see whether
3410 the alias relation can be decided at compilation time. Return 1 if
3411 it can and the references alias, 0 if it can and the references do
3412 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3413 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3414 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3417 vect_compile_time_alias (dr_vec_info
*a
, dr_vec_info
*b
,
3418 tree segment_length_a
, tree segment_length_b
,
3419 unsigned HOST_WIDE_INT access_size_a
,
3420 unsigned HOST_WIDE_INT access_size_b
)
3422 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
->dr
));
3423 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
->dr
));
3424 poly_uint64 const_length_a
;
3425 poly_uint64 const_length_b
;
3427 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3428 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3430 if (tree_int_cst_compare (DR_STEP (a
->dr
), size_zero_node
) < 0)
3432 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3433 offset_a
-= const_length_a
;
3436 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3437 if (tree_int_cst_compare (DR_STEP (b
->dr
), size_zero_node
) < 0)
3439 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3440 offset_b
-= const_length_b
;
3443 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3445 const_length_a
+= access_size_a
;
3446 const_length_b
+= access_size_b
;
3448 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3449 offset_b
, const_length_b
))
3452 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3453 offset_b
, const_length_b
))
3459 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3463 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3464 unsigned int loop_depth
, poly_uint64 vf
)
3466 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3467 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3470 /* If the dependence is exact, we should have limited the VF instead. */
3471 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3474 lambda_vector dist_v
;
3475 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3477 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3479 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3480 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE
, vect_location
,
3486 "dependence distance between %T and %T is >= VF\n",
3487 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3492 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3495 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3497 dump_printf (dump_kind
, "%s (%T) >= ",
3498 lower_bound
.unsigned_p
? "unsigned" : "abs",
3500 dump_dec (dump_kind
, lower_bound
.min_value
);
3503 /* Record that the vectorized loop requires the vec_lower_bound described
3504 by EXPR, UNSIGNED_P and MIN_VALUE. */
3507 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3508 poly_uint64 min_value
)
3510 vec
<vec_lower_bound
> &lower_bounds
3511 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3512 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3513 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3515 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3516 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3517 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3518 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3520 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3521 lower_bounds
[i
].min_value
= min_value
;
3522 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE
, vect_location
,
3525 "updating run-time check to ");
3526 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3527 dump_printf (MSG_NOTE
, "\n");
3533 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3534 if (dump_enabled_p ())
3536 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3537 dump_lower_bound (MSG_NOTE
, lower_bound
);
3538 dump_printf (MSG_NOTE
, "\n");
3540 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3543 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3544 will span fewer than GAP bytes. */
3547 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3550 stmt_vec_info stmt_info
= dr_info
->stmt
;
3552 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3553 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3554 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3555 return (estimated_poly_value (gap
)
3556 <= count
* vect_get_scalar_dr_size (dr_info
));
3559 /* Return true if we know that there is no alias between DR_INFO_A and
3560 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3561 When returning true, set *LOWER_BOUND_OUT to this N. */
3564 vectorizable_with_step_bound_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
,
3565 poly_uint64
*lower_bound_out
)
3567 /* Check that there is a constant gap of known sign between DR_A
3569 data_reference
*dr_a
= dr_info_a
->dr
;
3570 data_reference
*dr_b
= dr_info_b
->dr
;
3571 poly_int64 init_a
, init_b
;
3572 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3573 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3574 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3575 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3576 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3577 || !ordered_p (init_a
, init_b
))
3580 /* Sort DR_A and DR_B by the address they access. */
3581 if (maybe_lt (init_b
, init_a
))
3583 std::swap (init_a
, init_b
);
3584 std::swap (dr_info_a
, dr_info_b
);
3585 std::swap (dr_a
, dr_b
);
3588 /* If the two accesses could be dependent within a scalar iteration,
3589 make sure that we'd retain their order. */
3590 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_info_a
), init_b
)
3591 && !vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
3594 /* There is no alias if abs (DR_STEP) is greater than or equal to
3595 the bytes spanned by the combination of the two accesses. */
3596 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_info_b
) - init_a
;
3600 /* Function vect_prune_runtime_alias_test_list.
3602 Prune a list of ddrs to be tested at run-time by versioning for alias.
3603 Merge several alias checks into one if possible.
3604 Return FALSE if resulting list of ddrs is longer then allowed by
3605 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3608 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3610 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3611 hash_set
<tree_pair_hash
> compared_objects
;
3613 const vec
<ddr_p
> &may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3614 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3615 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3616 const vec
<vec_object_pair
> &check_unequal_addrs
3617 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3618 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3619 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3625 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3627 /* Step values are irrelevant for aliasing if the number of vector
3628 iterations is equal to the number of scalar iterations (which can
3629 happen for fully-SLP loops). */
3630 bool vf_one_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3634 /* Convert the checks for nonzero steps into bound tests. */
3636 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3637 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3640 if (may_alias_ddrs
.is_empty ())
3641 return opt_result::success ();
3643 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3645 unsigned int loop_depth
3646 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3647 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3649 /* First, we collect all data ref pairs for aliasing checks. */
3650 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3652 poly_uint64 lower_bound
;
3653 tree segment_length_a
, segment_length_b
;
3654 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3655 unsigned int align_a
, align_b
;
3657 /* Ignore the alias if the VF we chose ended up being no greater
3658 than the dependence distance. */
3659 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3662 if (DDR_OBJECT_A (ddr
))
3664 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3665 if (!compared_objects
.add (new_pair
))
3667 if (dump_enabled_p ())
3668 dump_printf_loc (MSG_NOTE
, vect_location
,
3669 "checking that %T and %T"
3670 " have different addresses\n",
3671 new_pair
.first
, new_pair
.second
);
3672 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3677 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3678 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3680 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3681 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3683 bool preserves_scalar_order_p
3684 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3687 && (preserves_scalar_order_p
3688 || operand_equal_p (DR_STEP (dr_info_a
->dr
),
3689 DR_STEP (dr_info_b
->dr
))));
3691 /* Skip the pair if inter-iteration dependencies are irrelevant
3692 and intra-iteration dependencies are guaranteed to be honored. */
3694 && (preserves_scalar_order_p
3695 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3698 if (dump_enabled_p ())
3699 dump_printf_loc (MSG_NOTE
, vect_location
,
3700 "no need for alias check between "
3701 "%T and %T when VF is 1\n",
3702 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3706 /* See whether we can handle the alias using a bounds check on
3707 the step, and whether that's likely to be the best approach.
3708 (It might not be, for example, if the minimum step is much larger
3709 than the number of bytes handled by one vector iteration.) */
3711 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3712 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3714 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3715 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3717 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3718 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3721 "%T and %T when the step %T is outside ",
3722 DR_REF (dr_info_a
->dr
),
3723 DR_REF (dr_info_b
->dr
),
3724 DR_STEP (dr_info_a
->dr
));
3726 dump_printf (MSG_NOTE
, "[0");
3729 dump_printf (MSG_NOTE
, "(");
3730 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3732 dump_printf (MSG_NOTE
, ", ");
3733 dump_dec (MSG_NOTE
, lower_bound
);
3734 dump_printf (MSG_NOTE
, ")\n");
3736 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
3737 unsigned_p
, lower_bound
);
3741 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
3742 if (dr_group_first_a
)
3744 stmt_info_a
= dr_group_first_a
;
3745 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
3748 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
3749 if (dr_group_first_b
)
3751 stmt_info_b
= dr_group_first_b
;
3752 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
3757 segment_length_a
= size_zero_node
;
3758 segment_length_b
= size_zero_node
;
3762 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
3763 DR_STEP (dr_info_b
->dr
), 0))
3764 length_factor
= scalar_loop_iters
;
3766 length_factor
= size_int (vect_factor
);
3767 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
3768 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
3770 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
3771 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
3772 align_a
= vect_vfa_align (dr_info_a
);
3773 align_b
= vect_vfa_align (dr_info_b
);
3775 /* See whether the alias is known at compilation time. */
3776 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
3777 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
3778 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
3779 DR_OFFSET (dr_info_b
->dr
), 0)
3780 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
3781 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
3782 && poly_int_tree_p (segment_length_a
)
3783 && poly_int_tree_p (segment_length_b
))
3785 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
3790 if (res
>= 0 && dump_enabled_p ())
3792 dump_printf_loc (MSG_NOTE
, vect_location
,
3793 "can tell at compile time that %T and %T",
3794 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3796 dump_printf (MSG_NOTE
, " do not alias\n");
3798 dump_printf (MSG_NOTE
, " alias\n");
3805 return opt_result::failure_at (stmt_info_b
->stmt
,
3807 " compilation time alias: %G%G",
3812 dr_with_seg_len
dr_a (dr_info_a
->dr
, segment_length_a
,
3813 access_size_a
, align_a
);
3814 dr_with_seg_len
dr_b (dr_info_b
->dr
, segment_length_b
,
3815 access_size_b
, align_b
);
3816 /* Canonicalize the order to be the one that's needed for accurate
3817 RAW, WAR and WAW flags, in cases where the data references are
3818 well-ordered. The order doesn't really matter otherwise,
3819 but we might as well be consistent. */
3820 if (get_later_stmt (stmt_info_a
, stmt_info_b
) == stmt_info_a
)
3821 std::swap (dr_a
, dr_b
);
3823 dr_with_seg_len_pair_t dr_with_seg_len_pair
3824 (dr_a
, dr_b
, (preserves_scalar_order_p
3825 ? dr_with_seg_len_pair_t::WELL_ORDERED
3826 : dr_with_seg_len_pair_t::REORDERED
));
3828 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3831 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
3833 unsigned int count
= (comp_alias_ddrs
.length ()
3834 + check_unequal_addrs
.length ());
3837 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
))
3838 == VECT_COST_MODEL_VERY_CHEAP
))
3839 return opt_result::failure_at
3840 (vect_location
, "would need a runtime alias check\n");
3842 if (dump_enabled_p ())
3843 dump_printf_loc (MSG_NOTE
, vect_location
,
3844 "improved number of alias checks from %d to %d\n",
3845 may_alias_ddrs
.length (), count
);
3846 unsigned limit
= param_vect_max_version_for_alias_checks
;
3847 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)) == VECT_COST_MODEL_CHEAP
)
3848 limit
= param_vect_max_version_for_alias_checks
* 6 / 10;
3850 return opt_result::failure_at
3852 "number of versioning for alias run-time tests exceeds %d "
3853 "(--param vect-max-version-for-alias-checks)\n", limit
);
3855 return opt_result::success ();
3858 /* Check whether we can use an internal function for a gather load
3859 or scatter store. READ_P is true for loads and false for stores.
3860 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3861 the type of the memory elements being loaded or stored. OFFSET_TYPE
3862 is the type of the offset that is being applied to the invariant
3863 base address. SCALE is the amount by which the offset should
3864 be multiplied *after* it has been converted to address width.
3866 Return true if the function is supported, storing the function id in
3867 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3870 vect_gather_scatter_fn_p (vec_info
*vinfo
, bool read_p
, bool masked_p
,
3871 tree vectype
, tree memory_type
, tree offset_type
,
3872 int scale
, internal_fn
*ifn_out
,
3873 tree
*offset_vectype_out
)
3875 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
3876 unsigned int element_bits
= vector_element_bits (vectype
);
3877 if (element_bits
!= memory_bits
)
3878 /* For now the vector elements must be the same width as the
3882 /* Work out which function we need. */
3883 internal_fn ifn
, alt_ifn
, alt_ifn2
;
3886 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
3887 alt_ifn
= IFN_MASK_GATHER_LOAD
;
3888 /* When target supports MASK_LEN_GATHER_LOAD, we always
3889 use MASK_LEN_GATHER_LOAD regardless whether len and
3890 mask are valid or not. */
3891 alt_ifn2
= IFN_MASK_LEN_GATHER_LOAD
;
3895 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
3896 alt_ifn
= IFN_MASK_SCATTER_STORE
;
3897 /* When target supports MASK_LEN_SCATTER_STORE, we always
3898 use MASK_LEN_SCATTER_STORE regardless whether len and
3899 mask are valid or not. */
3900 alt_ifn2
= IFN_MASK_LEN_SCATTER_STORE
;
3905 tree offset_vectype
= get_vectype_for_scalar_type (vinfo
, offset_type
);
3906 if (!offset_vectype
)
3909 /* Test whether the target supports this combination. */
3910 if (internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
3911 offset_vectype
, scale
))
3914 *offset_vectype_out
= offset_vectype
;
3918 && internal_gather_scatter_fn_supported_p (alt_ifn
, vectype
,
3924 *offset_vectype_out
= offset_vectype
;
3927 else if (internal_gather_scatter_fn_supported_p (alt_ifn2
, vectype
,
3929 offset_vectype
, scale
))
3931 *ifn_out
= alt_ifn2
;
3932 *offset_vectype_out
= offset_vectype
;
3936 if (TYPE_PRECISION (offset_type
) >= POINTER_SIZE
3937 && TYPE_PRECISION (offset_type
) >= element_bits
)
3940 offset_type
= build_nonstandard_integer_type
3941 (TYPE_PRECISION (offset_type
) * 2, TYPE_UNSIGNED (offset_type
));
3945 /* STMT_INFO is a call to an internal gather load or scatter store function.
3946 Describe the operation in INFO. */
3949 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
3950 gather_scatter_info
*info
)
3952 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
3953 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3954 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3956 info
->ifn
= gimple_call_internal_fn (call
);
3957 info
->decl
= NULL_TREE
;
3958 info
->base
= gimple_call_arg (call
, 0);
3959 info
->offset
= gimple_call_arg (call
, 1);
3960 info
->offset_dt
= vect_unknown_def_type
;
3961 info
->offset_vectype
= NULL_TREE
;
3962 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
3963 info
->element_type
= TREE_TYPE (vectype
);
3964 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
3967 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3968 gather load or scatter store. Describe the operation in *INFO if so. */
3971 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
3972 gather_scatter_info
*info
)
3974 HOST_WIDE_INT scale
= 1;
3975 poly_int64 pbitpos
, pbitsize
;
3976 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3977 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3978 tree offtype
= NULL_TREE
;
3979 tree decl
= NULL_TREE
, base
, off
;
3980 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3981 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3983 int punsignedp
, reversep
, pvolatilep
= 0;
3985 tree offset_vectype
;
3986 bool masked_p
= false;
3988 /* See whether this is already a call to a gather/scatter internal function.
3989 If not, see whether it's a masked load or store. */
3990 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
3991 if (call
&& gimple_call_internal_p (call
))
3993 ifn
= gimple_call_internal_fn (call
);
3994 if (internal_gather_scatter_fn_p (ifn
))
3996 vect_describe_gather_scatter_call (stmt_info
, info
);
3999 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
4002 /* True if we should aim to use internal functions rather than
4003 built-in functions. */
4004 bool use_ifn_p
= (DR_IS_READ (dr
)
4005 ? supports_vec_gather_load_p (TYPE_MODE (vectype
))
4006 : supports_vec_scatter_store_p (TYPE_MODE (vectype
)));
4009 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4010 see if we can use the def stmt of the address. */
4012 && TREE_CODE (base
) == MEM_REF
4013 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
4014 && integer_zerop (TREE_OPERAND (base
, 1))
4015 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
4017 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
4018 if (is_gimple_assign (def_stmt
)
4019 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
4020 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
4023 /* The gather and scatter builtins need address of the form
4024 loop_invariant + vector * {1, 2, 4, 8}
4026 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4027 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4028 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4029 multiplications and additions in it. To get a vector, we need
4030 a single SSA_NAME that will be defined in the loop and will
4031 contain everything that is not loop invariant and that can be
4032 vectorized. The following code attempts to find such a preexistng
4033 SSA_NAME OFF and put the loop invariants into a tree BASE
4034 that can be gimplified before the loop. */
4035 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
4036 &punsignedp
, &reversep
, &pvolatilep
);
4040 /* PR 107346. Packed structs can have fields at offsets that are not
4041 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4042 if (!multiple_p (pbitpos
, BITS_PER_UNIT
))
4045 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
4047 if (TREE_CODE (base
) == MEM_REF
)
4049 if (!integer_zerop (TREE_OPERAND (base
, 1)))
4051 if (off
== NULL_TREE
)
4052 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
4054 off
= size_binop (PLUS_EXPR
, off
,
4055 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
4057 base
= TREE_OPERAND (base
, 0);
4060 base
= build_fold_addr_expr (base
);
4062 if (off
== NULL_TREE
)
4063 off
= size_zero_node
;
4065 /* If base is not loop invariant, either off is 0, then we start with just
4066 the constant offset in the loop invariant BASE and continue with base
4067 as OFF, otherwise give up.
4068 We could handle that case by gimplifying the addition of base + off
4069 into some SSA_NAME and use that as off, but for now punt. */
4070 if (!expr_invariant_in_loop_p (loop
, base
))
4072 if (!integer_zerop (off
))
4075 base
= size_int (pbytepos
);
4077 /* Otherwise put base + constant offset into the loop invariant BASE
4078 and continue with OFF. */
4081 base
= fold_convert (sizetype
, base
);
4082 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
4085 /* OFF at this point may be either a SSA_NAME or some tree expression
4086 from get_inner_reference. Try to peel off loop invariants from it
4087 into BASE as long as possible. */
4089 while (offtype
== NULL_TREE
)
4091 enum tree_code code
;
4092 tree op0
, op1
, add
= NULL_TREE
;
4094 if (TREE_CODE (off
) == SSA_NAME
)
4096 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
4098 if (expr_invariant_in_loop_p (loop
, off
))
4101 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
4104 op0
= gimple_assign_rhs1 (def_stmt
);
4105 code
= gimple_assign_rhs_code (def_stmt
);
4106 op1
= gimple_assign_rhs2 (def_stmt
);
4110 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
4112 code
= TREE_CODE (off
);
4113 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
4117 case POINTER_PLUS_EXPR
:
4119 if (expr_invariant_in_loop_p (loop
, op0
))
4124 add
= fold_convert (sizetype
, add
);
4126 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
4127 base
= size_binop (PLUS_EXPR
, base
, add
);
4130 if (expr_invariant_in_loop_p (loop
, op1
))
4138 if (expr_invariant_in_loop_p (loop
, op1
))
4140 add
= fold_convert (sizetype
, op1
);
4141 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
4147 if (scale
== 1 && tree_fits_shwi_p (op1
))
4149 int new_scale
= tree_to_shwi (op1
);
4150 /* Only treat this as a scaling operation if the target
4151 supports it for at least some offset type. */
4153 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4154 masked_p
, vectype
, memory_type
,
4155 signed_char_type_node
,
4158 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4159 masked_p
, vectype
, memory_type
,
4160 unsigned_char_type_node
,
4173 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
4174 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
4177 /* Don't include the conversion if the target is happy with
4178 the current offset type. */
4180 && TREE_CODE (off
) == SSA_NAME
4181 && !POINTER_TYPE_P (TREE_TYPE (off
))
4182 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4183 masked_p
, vectype
, memory_type
,
4184 TREE_TYPE (off
), scale
, &ifn
,
4188 if (TYPE_PRECISION (TREE_TYPE (op0
))
4189 == TYPE_PRECISION (TREE_TYPE (off
)))
4195 /* Include the conversion if it is widening and we're using
4196 the IFN path or the target can handle the converted from
4197 offset or the current size is not already the same as the
4198 data vector element size. */
4199 if ((TYPE_PRECISION (TREE_TYPE (op0
))
4200 < TYPE_PRECISION (TREE_TYPE (off
)))
4203 ? (targetm
.vectorize
.builtin_gather
4204 && targetm
.vectorize
.builtin_gather (vectype
,
4207 : (targetm
.vectorize
.builtin_scatter
4208 && targetm
.vectorize
.builtin_scatter (vectype
,
4211 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off
)),
4212 TYPE_SIZE (TREE_TYPE (vectype
)), 0)))
4215 offtype
= TREE_TYPE (off
);
4226 /* If at the end OFF still isn't a SSA_NAME or isn't
4227 defined in the loop, punt. */
4228 if (TREE_CODE (off
) != SSA_NAME
4229 || expr_invariant_in_loop_p (loop
, off
))
4232 if (offtype
== NULL_TREE
)
4233 offtype
= TREE_TYPE (off
);
4237 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4238 vectype
, memory_type
, offtype
, scale
,
4239 &ifn
, &offset_vectype
))
4245 if (DR_IS_READ (dr
))
4247 if (targetm
.vectorize
.builtin_gather
)
4248 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4252 if (targetm
.vectorize
.builtin_scatter
)
4253 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4256 /* The offset vector type will be read from DECL when needed. */
4257 offset_vectype
= NULL_TREE
;
4264 info
->offset_dt
= vect_unknown_def_type
;
4265 info
->offset_vectype
= offset_vectype
;
4266 info
->scale
= scale
;
4267 info
->element_type
= TREE_TYPE (vectype
);
4268 info
->memory_type
= memory_type
;
4272 /* Find the data references in STMT, analyze them with respect to LOOP and
4273 append them to DATAREFS. Return false if datarefs in this stmt cannot
4277 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4278 vec
<data_reference_p
> *datarefs
,
4279 vec
<int> *dataref_groups
, int group_id
)
4281 /* We can ignore clobbers for dataref analysis - they are removed during
4282 loop vectorization and BB vectorization checks dependences with a
4284 if (gimple_clobber_p (stmt
))
4285 return opt_result::success ();
4287 if (gimple_has_volatile_ops (stmt
))
4288 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4291 if (stmt_can_throw_internal (cfun
, stmt
))
4292 return opt_result::failure_at (stmt
,
4294 " statement can throw an exception: %G",
4297 auto_vec
<data_reference_p
, 2> refs
;
4298 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4302 if (refs
.is_empty ())
4303 return opt_result::success ();
4305 if (refs
.length () > 1)
4307 while (!refs
.is_empty ())
4308 free_data_ref (refs
.pop ());
4309 return opt_result::failure_at (stmt
,
4310 "not vectorized: more than one "
4311 "data ref in stmt: %G", stmt
);
4314 data_reference_p dr
= refs
.pop ();
4315 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4316 if (!gimple_call_internal_p (call
)
4317 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4318 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4321 return opt_result::failure_at (stmt
,
4322 "not vectorized: dr in a call %G", stmt
);
4325 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4326 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4329 return opt_result::failure_at (stmt
,
4331 " statement is an unsupported"
4332 " bitfield access %G", stmt
);
4335 if (DR_BASE_ADDRESS (dr
)
4336 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4339 return opt_result::failure_at (stmt
,
4341 " base addr of dr is a constant\n");
4344 /* Check whether this may be a SIMD lane access and adjust the
4345 DR to make it easier for us to handle it. */
4348 && (!DR_BASE_ADDRESS (dr
)
4353 struct data_reference
*newdr
4354 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4355 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4356 if (DR_BASE_ADDRESS (newdr
)
4357 && DR_OFFSET (newdr
)
4360 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4361 && integer_zerop (DR_STEP (newdr
)))
4363 tree base_address
= DR_BASE_ADDRESS (newdr
);
4364 tree off
= DR_OFFSET (newdr
);
4365 tree step
= ssize_int (1);
4366 if (integer_zerop (off
)
4367 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4369 off
= TREE_OPERAND (base_address
, 1);
4370 base_address
= TREE_OPERAND (base_address
, 0);
4373 if (TREE_CODE (off
) == MULT_EXPR
4374 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4376 step
= TREE_OPERAND (off
, 1);
4377 off
= TREE_OPERAND (off
, 0);
4380 if (CONVERT_EXPR_P (off
)
4381 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4382 < TYPE_PRECISION (TREE_TYPE (off
))))
4383 off
= TREE_OPERAND (off
, 0);
4384 if (TREE_CODE (off
) == SSA_NAME
)
4386 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4387 /* Look through widening conversion. */
4388 if (is_gimple_assign (def
)
4389 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4391 tree rhs1
= gimple_assign_rhs1 (def
);
4392 if (TREE_CODE (rhs1
) == SSA_NAME
4393 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4394 && (TYPE_PRECISION (TREE_TYPE (off
))
4395 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4396 def
= SSA_NAME_DEF_STMT (rhs1
);
4398 if (is_gimple_call (def
)
4399 && gimple_call_internal_p (def
)
4400 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4402 tree arg
= gimple_call_arg (def
, 0);
4403 tree reft
= TREE_TYPE (DR_REF (newdr
));
4404 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4405 arg
= SSA_NAME_VAR (arg
);
4406 if (arg
== loop
->simduid
4408 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4410 DR_BASE_ADDRESS (newdr
) = base_address
;
4411 DR_OFFSET (newdr
) = ssize_int (0);
4412 DR_STEP (newdr
) = step
;
4413 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4414 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4415 /* Mark as simd-lane access. */
4416 tree arg2
= gimple_call_arg (def
, 1);
4417 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4419 datarefs
->safe_push (newdr
);
4421 dataref_groups
->safe_push (group_id
);
4422 return opt_result::success ();
4427 free_data_ref (newdr
);
4430 datarefs
->safe_push (dr
);
4432 dataref_groups
->safe_push (group_id
);
4433 return opt_result::success ();
4436 /* Function vect_analyze_data_refs.
4438 Find all the data references in the loop or basic block.
4440 The general structure of the analysis of data refs in the vectorizer is as
4442 1- vect_analyze_data_refs(loop/bb): call
4443 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4444 in the loop/bb and their dependences.
4445 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4446 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4447 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
/* NOTE(review): lossy extraction -- several original lines are missing
   here (e.g. parts of the conditions around 4480-4488 and some braces),
   so a few checks below appear truncated.  */
4452 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4454 class loop
*loop
= NULL
;
4456 struct data_reference
*dr
;
4459 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4461 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4462 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4464 /* Go through the data-refs, check that the analysis succeeded. Update
4465 pointer from stmt_vec_info struct to DR and vectype. */
4467 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4468 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4470 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
/* Wire the dataref into its stmt_vec_info; each stmt must have at
   most one dataref at this point (asserted).  */
4473 gcc_assert (DR_REF (dr
));
4474 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4475 gcc_assert (!stmt_info
->dr_aux
.dr
);
4476 stmt_info
->dr_aux
.dr
= dr
;
4477 stmt_info
->dr_aux
.stmt
= stmt_info
;
4479 /* Check that analysis of the data-ref succeeded. */
4480 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4485 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4488 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4490 /* If target supports vector gather loads or scatter stores,
4491 see if they can't be used. */
4492 if (is_a
<loop_vec_info
> (vinfo
)
4493 && !nested_in_vect_loop_p (loop
, stmt_info
))
4495 if (maybe_gather
|| maybe_scatter
)
4498 gatherscatter
= GATHER
;
4500 gatherscatter
= SCATTER
;
/* A failed dataref analysis that is not salvageable as gather/scatter
   only disables vectorization of the stmt for BB SLP, but fails the
   whole analysis for loop vectorization.  */
4504 if (gatherscatter
== SG_NONE
)
4506 if (dump_enabled_p ())
4507 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4508 "not vectorized: data ref analysis "
4509 "failed %G", stmt_info
->stmt
);
4510 if (is_a
<bb_vec_info
> (vinfo
))
4512 /* In BB vectorization the ref can still participate
4513 in dependence analysis, we just can't vectorize it. */
4514 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4517 return opt_result::failure_at (stmt_info
->stmt
,
4519 " data ref analysis failed: %G",
4524 /* See if this was detected as SIMD lane access. */
4525 if (dr
->aux
== (void *)-1
4526 || dr
->aux
== (void *)-2
4527 || dr
->aux
== (void *)-3
4528 || dr
->aux
== (void *)-4)
4530 if (nested_in_vect_loop_p (loop
, stmt_info
))
4531 return opt_result::failure_at (stmt_info
->stmt
,
4533 " data ref analysis failed: %G",
4535 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4536 = -(uintptr_t) dr
->aux
;
4539 tree base
= get_base_address (DR_REF (dr
));
4540 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4542 if (dump_enabled_p ())
4543 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4544 "not vectorized: base object not addressable "
4545 "for stmt: %G", stmt_info
->stmt
);
4546 if (is_a
<bb_vec_info
> (vinfo
))
4548 /* In BB vectorization the ref can still participate
4549 in dependence analysis, we just can't vectorize it. */
4550 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4553 return opt_result::failure_at (stmt_info
->stmt
,
4554 "not vectorized: base object not"
4555 " addressable for stmt: %G",
/* A non-constant DR_STEP marks the access as strided; inside a nested
   loop this is not supported and fails the analysis.  */
4559 if (is_a
<loop_vec_info
> (vinfo
)
4561 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4563 if (nested_in_vect_loop_p (loop
, stmt_info
))
4564 return opt_result::failure_at (stmt_info
->stmt
,
4566 "not suitable for strided load %G",
4568 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4571 /* Update DR field in stmt_vec_info struct. */
4573 /* If the dataref is in an inner-loop of the loop that is considered for
4574 for vectorization, we also want to analyze the access relative to
4575 the outer-loop (DR contains information only relative to the
4576 inner-most enclosing loop). We do that by building a reference to the
4577 first location accessed by the inner-loop, and analyze it relative to
4579 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4581 /* Build a reference to the first location accessed by the
4582 inner loop: *(BASE + INIT + OFFSET). By construction,
4583 this address must be invariant in the inner loop, so we
4584 can consider it as being used in the outer loop. */
4585 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4586 tree offset
= unshare_expr (DR_OFFSET (dr
));
4587 tree init
= unshare_expr (DR_INIT (dr
));
4588 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4590 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4591 tree init_ref
= build_fold_indirect_ref (init_addr
);
4593 if (dump_enabled_p ())
4594 dump_printf_loc (MSG_NOTE
, vect_location
,
4595 "analyze in outer loop: %T\n", init_ref
);
4598 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4599 init_ref
, loop
, stmt_info
->stmt
);
4601 /* dr_analyze_innermost already explained the failure. */
4604 if (dump_enabled_p ())
4605 dump_printf_loc (MSG_NOTE
, vect_location
,
4606 "\touter base_address: %T\n"
4607 "\touter offset from base address: %T\n"
4608 "\touter constant offset from base address: %T\n"
4609 "\touter step: %T\n"
4610 "\touter base alignment: %d\n\n"
4611 "\touter base misalignment: %d\n"
4612 "\touter offset alignment: %d\n"
4613 "\touter step alignment: %d\n",
4614 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4615 STMT_VINFO_DR_OFFSET (stmt_info
),
4616 STMT_VINFO_DR_INIT (stmt_info
),
4617 STMT_VINFO_DR_STEP (stmt_info
),
4618 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4619 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4620 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4621 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4624 /* Set vectype for STMT. */
4625 scalar_type
= TREE_TYPE (DR_REF (dr
));
4626 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4629 if (dump_enabled_p ())
4631 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4632 "not vectorized: no vectype for stmt: %G",
4634 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4635 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4637 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4640 if (is_a
<bb_vec_info
> (vinfo
))
4642 /* No vector type is fine, the ref can still participate
4643 in dependence analysis, we just can't vectorize it. */
4644 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4649 return opt_result::failure_at (stmt_info
->stmt
,
4651 " no vectype for stmt: %G"
4652 " scalar_type: %T\n",
4653 stmt_info
->stmt
, scalar_type
);
4657 if (dump_enabled_p ())
4658 dump_printf_loc (MSG_NOTE
, vect_location
,
4659 "got vectype for stmt: %G%T\n",
4660 stmt_info
->stmt
, vectype
);
4663 /* Adjust the minimal vectorization factor according to the
4665 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4666 *min_vf
= upper_bound (*min_vf
, vf
);
4668 /* Leave the BB vectorizer to pick the vector type later, based on
4669 the final dataref group size and SLP node size. */
4670 if (is_a
<loop_vec_info
> (vinfo
))
4671 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
/* For a tentatively classified gather/scatter, verify the target can
   actually handle it (vect_check_gather_scatter) and that the offset
   has a usable vector type before committing.  */
4673 if (gatherscatter
!= SG_NONE
)
4675 gather_scatter_info gs_info
;
4676 if (!vect_check_gather_scatter (stmt_info
,
4677 as_a
<loop_vec_info
> (vinfo
),
4679 || !get_vectype_for_scalar_type (vinfo
,
4680 TREE_TYPE (gs_info
.offset
)))
4684 return opt_result::failure_at
4686 (gatherscatter
== GATHER
)
4687 ? "not vectorized: not suitable for gather load %G"
4688 : "not vectorized: not suitable for scatter store %G",
4691 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4695 /* We used to stop processing and prune the list here. Verify we no
4697 gcc_assert (i
== datarefs
.length ());
4699 return opt_result::success ();
4703 /* Function vect_get_new_vect_var.
4705 Returns a name for a new variable. The current naming scheme appends the
4706 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4707 the name of vectorizer generated variables, and appends that to NAME if
/* Create a fresh temporary register of TYPE whose name is a VAR_KIND
   prefix optionally joined with NAME by '_'.
   NOTE(review): extraction dropped the bodies of the switch cases -- the
   `prefix' assignments appear to be among the missing lines.  */
4711 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4718 case vect_simple_var
:
4721 case vect_scalar_var
:
4727 case vect_pointer_var
:
/* NAME supplied: build "<prefix>_<name>" (concat allocates; the free of
   TMP is among the lines lost in extraction -- TODO confirm).  */
4736 char* tmp
= concat (prefix
, "_", name
, NULL
);
4737 new_vect_var
= create_tmp_reg (type
, tmp
);
/* No NAME: use the bare prefix.  */
4741 new_vect_var
= create_tmp_reg (type
, prefix
);
4743 return new_vect_var
;
4746 /* Like vect_get_new_vect_var but return an SSA name. */
/* Same prefix scheme as vect_get_new_vect_var, but produces an SSA name
   via make_temp_ssa_name instead of a temporary register.
   NOTE(review): the switch-case bodies are missing from this extraction.  */
4749 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4756 case vect_simple_var
:
4759 case vect_scalar_var
:
4762 case vect_pointer_var
:
/* NAME supplied: "<prefix>_<name>".  */
4771 char* tmp
= concat (prefix
, "_", name
, NULL
);
4772 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
/* No NAME: bare prefix.  */
4776 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4778 return new_vect_var
;
4781 /* Duplicate points-to info on NAME from DR_INFO. */
/* Copy the SSA_NAME_PTR_INFO of DR_INFO's underlying dataref onto NAME,
   then explicitly invalidate the alignment portion: as the comment below
   notes, that alignment described only the base SSA name and does not
   hold once constant/variable offsets of the ref are folded in.  */
4784 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
4786 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
4787 /* DR_PTR_INFO is for a base SSA name, not including constant or
4788 variable offsets in the ref so its alignment info does not apply. */
4789 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4792 /* Function vect_create_addr_base_for_vector_ref.
4794 Create an expression that computes the address of the first memory location
4795 that will be accessed for a data reference.
4798 STMT_INFO: The statement containing the data reference.
4799 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4800 OFFSET: Optional. If supplied, it is be added to the initial address.
4801 LOOP: Specify relative to which loop-nest should the address be computed.
4802 For example, when the dataref is in an inner-loop nested in an
4803 outer-loop that is now being vectorized, LOOP can be either the
4804 outer-loop, or the inner-loop. The first memory location accessed
4805 by the following dataref ('in' points to short):
4812 if LOOP=i_loop: &in (relative to i_loop)
4813 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4816 1. Return an SSA_NAME whose value is the address of the memory location of
4817 the first vector of the data reference.
4818 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4819 these statement(s) which define the returned SSA_NAME.
4821 FORNOW: We are only handling array accesses with step 1. */
/* NOTE(review): lossy extraction -- parts of the signature and several
   conditions/braces below are missing from this view.  */
4824 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4825 gimple_seq
*new_stmt_list
,
4828 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4829 struct data_reference
*dr
= dr_info
->dr
;
4830 const char *base_name
;
4833 gimple_seq seq
= NULL
;
4835 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
/* The innermost-loop behavior supplies base/offset/init relative to the
   loop being vectorized.  */
4836 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
4838 tree data_ref_base
= unshare_expr (drb
->base_address
);
4839 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
4840 tree init
= unshare_expr (drb
->init
);
4843 base_name
= get_name (data_ref_base
);
4846 base_offset
= ssize_int (0);
4847 init
= ssize_int (0);
4848 base_name
= get_name (DR_REF (dr
));
4851 /* Create base_offset */
/* Fold INIT and the caller-supplied OFFSET (if any) into BASE_OFFSET,
   all in sizetype.  */
4852 base_offset
= size_binop (PLUS_EXPR
,
4853 fold_convert (sizetype
, base_offset
),
4854 fold_convert (sizetype
, init
));
4858 offset
= fold_convert (sizetype
, offset
);
4859 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4860 base_offset
, offset
);
4863 /* base + base_offset */
4865 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4867 addr_base
= build1 (ADDR_EXPR
,
4868 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4869 /* Strip zero offset components since we don't need
4870 them and they can confuse late diagnostics if
4871 we CSE them wrongly. See PR106904 for example. */
4872 unshare_expr (strip_zero_offset_components
/* Gimplify the address into SSA form; any setup stmts land in SEQ and
   are appended to NEW_STMT_LIST for the caller to emit.  */
4875 vect_ptr_type
= build_pointer_type (TREE_TYPE (DR_REF (dr
)));
4876 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4877 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4878 gimple_seq_add_seq (new_stmt_list
, seq
);
4880 if (DR_PTR_INFO (dr
)
4881 && TREE_CODE (addr_base
) == SSA_NAME
4882 /* We should only duplicate pointer info to newly created SSA names. */
4883 && SSA_NAME_VAR (addr_base
) == dest
)
4885 gcc_assert (!SSA_NAME_PTR_INFO (addr_base
));
4886 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
4889 if (dump_enabled_p ())
4890 dump_printf_loc (MSG_NOTE
, vect_location
, "created %T\n", addr_base
);
4896 /* Function vect_create_data_ref_ptr.
4898 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4899 location accessed in the loop by STMT_INFO, along with the def-use update
4900 chain to appropriately advance the pointer through the loop iterations.
4901 Also set aliasing information for the pointer. This pointer is used by
4902 the callers to this function to create a memory reference expression for
4903 vector load/store access.
4906 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4907 GIMPLE_ASSIGN <name, data-ref> or
4908 GIMPLE_ASSIGN <data-ref, name>.
4909 2. AGGR_TYPE: the type of the reference, which should be either a vector
4911 3. AT_LOOP: the loop where the vector memref is to be created.
4912 4. OFFSET (optional): a byte offset to be added to the initial address
4913 accessed by the data-ref in STMT_INFO.
4914 5. BSI: location where the new stmts are to be placed if there is no loop
4915 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4916 pointing to the initial address.
4917 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4918 to the IV during each iteration of the loop. NULL says to move
4919 by one copy of AGGR_TYPE up or down, depending on the step of the
4923 1. Declare a new ptr to vector_type, and have it point to the base of the
4924 data reference (initial addressed accessed by the data reference).
4925 For example, for vector of type V8HI, the following code is generated:
4928 ap = (v8hi *)initial_address;
4930 if OFFSET is not supplied:
4931 initial_address = &a[init];
4932 if OFFSET is supplied:
4933 initial_address = &a[init] + OFFSET;
4934 if BYTE_OFFSET is supplied:
4935 initial_address = &a[init] + BYTE_OFFSET;
4937 Return the initial_address in INITIAL_ADDRESS.
4939 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4940 update the pointer in each iteration of the loop.
4942 Return the increment stmt that updates the pointer in PTR_INCR.
4944 3. Return the pointer. */
/* NOTE(review): lossy extraction -- trailing parameters of the signature
   and a number of interior lines/braces are missing in this view.  */
4947 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4948 tree aggr_type
, class loop
*at_loop
, tree offset
,
4949 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
4950 gimple
**ptr_incr
, bool only_init
,
4953 const char *base_name
;
4954 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4955 class loop
*loop
= NULL
;
4956 bool nested_in_vect_loop
= false;
4957 class loop
*containing_loop
= NULL
;
4961 gimple_seq new_stmt_list
= NULL
;
4965 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4966 struct data_reference
*dr
= dr_info
->dr
;
4968 gimple_stmt_iterator incr_gsi
;
4970 tree indx_before_incr
, indx_after_incr
;
4972 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
/* An explicit IV_STEP is required unless AGGR_TYPE's size alone can
   determine the step (array or vector type).  */
4974 gcc_assert (iv_step
!= NULL_TREE
4975 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4976 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4980 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4981 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
4982 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
4983 pe
= loop_preheader_edge (loop
);
4987 gcc_assert (bb_vinfo
);
4992 /* Create an expression for the first address accessed by this load
4994 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4996 if (dump_enabled_p ())
4998 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4999 dump_printf_loc (MSG_NOTE
, vect_location
,
5000 "create %s-pointer variable to type: %T",
5001 get_tree_code_name (TREE_CODE (aggr_type
)),
5003 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
5004 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
5005 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
5006 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
5007 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
5008 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
5010 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
5011 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
5014 /* (1) Create the new aggregate-pointer variable.
5015 Vector and array types inherit the alias set of their component
5016 type by default so we need to use a ref-all pointer if the data
5017 reference does not conflict with the created aggregated data
5018 reference because it is not addressable. */
5019 bool need_ref_all
= false;
5020 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5021 get_alias_set (DR_REF (dr
))))
5022 need_ref_all
= true;
5023 /* Likewise for any of the data references in the stmt group. */
5024 else if (DR_GROUP_SIZE (stmt_info
) > 1)
5026 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5029 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
5030 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5031 get_alias_set (DR_REF (sdr
))))
5033 need_ref_all
= true;
5036 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
5040 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
5042 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
5045 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5046 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5047 def-use update cycles for the pointer: one relative to the outer-loop
5048 (LOOP), which is what steps (3) and (4) below do. The other is relative
5049 to the inner-loop (which is the inner-most loop containing the dataref),
5050 and this is done be step (5) below.
5052 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5053 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5054 redundant. Steps (3),(4) create the following:
5057 LOOP: vp1 = phi(vp0,vp2)
5063 If there is an inner-loop nested in loop, then step (5) will also be
5064 applied, and an additional update in the inner-loop will be created:
5067 LOOP: vp1 = phi(vp0,vp2)
5069 inner: vp3 = phi(vp1,vp4)
5070 vp4 = vp3 + inner_step
5076 /* (2) Calculate the initial address of the aggregate-pointer, and set
5077 the aggregate-pointer to point to it before the loop. */
5079 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5081 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
5082 stmt_info
, &new_stmt_list
,
/* Loop case: emit the address computation on the preheader edge;
   BB-SLP case: emit it right before GSI.  */
5088 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
5089 gcc_assert (!new_bb
);
5092 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
5095 *initial_address
= new_temp
;
5096 aggr_ptr_init
= new_temp
;
5098 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5099 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5100 inner-loop nested in LOOP (during outer-loop vectorization). */
5102 /* No update in loop is required. */
5103 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
5104 aptr
= aggr_ptr_init
;
5107 /* Accesses to invariant addresses should be handled specially
5109 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
5110 gcc_assert (!integer_zerop (step
));
5112 if (iv_step
== NULL_TREE
)
5114 /* The step of the aggregate pointer is the type size,
5115 negated for downward accesses. */
5116 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
5117 if (tree_int_cst_sgn (step
) == -1)
5118 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
/* Build the pointer IV in LOOP; INDX_BEFORE/AFTER_INCR are the values
   before and after the per-iteration increment.  */
5121 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5123 create_iv (aggr_ptr_init
, PLUS_EXPR
,
5124 fold_convert (aggr_ptr_type
, iv_step
),
5125 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
5126 &indx_before_incr
, &indx_after_incr
);
5127 incr
= gsi_stmt (incr_gsi
);
5129 /* Copy the points-to information if it exists. */
5130 if (DR_PTR_INFO (dr
))
5132 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5133 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5138 aptr
= indx_before_incr
;
5141 if (!nested_in_vect_loop
|| only_init
)
5145 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5146 nested in LOOP, if exists. */
5148 gcc_assert (nested_in_vect_loop
);
/* Second IV, relative to the innermost containing loop, stepping by
   DR_STEP of the dataref.  */
5151 standard_iv_increment_position (containing_loop
, &incr_gsi
,
5153 create_iv (aptr
, PLUS_EXPR
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)),
5154 aggr_ptr
, containing_loop
, &incr_gsi
, insert_after
,
5155 &indx_before_incr
, &indx_after_incr
);
5156 incr
= gsi_stmt (incr_gsi
);
5158 /* Copy the points-to information if it exists. */
5159 if (DR_PTR_INFO (dr
))
5161 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5162 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5167 return indx_before_incr
;
5174 /* Function bump_vector_ptr
5176 Increment a pointer (to a vector type) by vector-size. If requested,
5177 i.e. if PTR-INCR is given, then also connect the new increment stmt
5178 to the existing def-use update-chain of the pointer, by modifying
5179 the PTR_INCR as illustrated below:
5181 The pointer def-use update-chain before this function:
5182 DATAREF_PTR = phi (p_0, p_2)
5184 PTR_INCR: p_2 = DATAREF_PTR + step
5186 The pointer def-use update-chain after this function:
5187 DATAREF_PTR = phi (p_0, p_2)
5189 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5191 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5194 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5196 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5197 the loop. The increment amount across iterations is expected
5199 BSI - location where the new update stmt is to be placed.
5200 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5201 BUMP - optional. The offset by which to bump the pointer. If not given,
5202 the offset is assumed to be vector_size.
5204 Output: Return NEW_DATAREF_PTR as illustrated above.
/* NOTE(review): lossy extraction -- a few lines (e.g. where UPDATE is
   replaced by BUMP, and some braces) are missing from this view.  */
5209 bump_vector_ptr (vec_info
*vinfo
,
5210 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
5211 stmt_vec_info stmt_info
, tree bump
)
5213 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5214 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
/* Default bump amount: one vector's worth of bytes.  */
5215 tree update
= TYPE_SIZE_UNIT (vectype
);
5218 use_operand_p use_p
;
5219 tree new_dataref_ptr
;
5224 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5225 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5226 else if (is_gimple_min_invariant (dataref_ptr
))
5227 /* When possible avoid emitting a separate increment stmt that will
5228 force the addressed object addressable. */
5229 return build1 (ADDR_EXPR
, TREE_TYPE (dataref_ptr
),
5230 fold_build2 (MEM_REF
,
5231 TREE_TYPE (TREE_TYPE (dataref_ptr
)),
5233 fold_convert (ptr_type_node
, update
)));
5235 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
/* Emit NEW_DATAREF_PTR = DATAREF_PTR p+ UPDATE at GSI.  */
5236 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5237 dataref_ptr
, update
);
5238 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
5239 /* Fold the increment, avoiding excessive chains use-def chains of
5240 those, leading to compile-time issues for passes until the next
5241 forwprop pass which would do this as well. */
5242 gimple_stmt_iterator fold_gsi
= gsi_for_stmt (incr_stmt
);
5243 if (fold_stmt (&fold_gsi
, follow_all_ssa_edges
))
5245 incr_stmt
= gsi_stmt (fold_gsi
);
5246 update_stmt (incr_stmt
);
5249 /* Copy the points-to information if it exists. */
5250 if (DR_PTR_INFO (dr
))
5252 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5253 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5257 return new_dataref_ptr
;
5259 /* Update the vector-pointer's cross-iteration increment. */
/* Rewire PTR_INCR so its use of DATAREF_PTR refers to the freshly bumped
   pointer instead (the def-use splice from the function comment).  */
5260 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5262 tree use
= USE_FROM_PTR (use_p
);
5264 if (use
== dataref_ptr
)
5265 SET_USE (use_p
, new_dataref_ptr
);
5267 gcc_assert (operand_equal_p (use
, update
, 0));
5270 return new_dataref_ptr
;
5274 /* Copy memory reference info such as base/clique from the SRC reference
5275 to the DEST MEM_REF. */
5278 vect_copy_ref_info (tree dest
, tree src
)
5280 if (TREE_CODE (dest
) != MEM_REF
)
5283 tree src_base
= src
;
5284 while (handled_component_p (src_base
))
5285 src_base
= TREE_OPERAND (src_base
, 0);
5286 if (TREE_CODE (src_base
) != MEM_REF
5287 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
5290 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
5291 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
5295 /* Function vect_create_destination_var.
5297 Create a new temporary of type VECTYPE. */
5300 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5306 enum vect_var_kind kind
;
5309 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5313 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5315 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5317 name
= get_name (scalar_dest
);
5319 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5321 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5322 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5328 /* Function vect_grouped_store_supported.
5330 Returns TRUE if interleave high and interleave low permutations
5331 are supported, and FALSE otherwise. */
5334 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5336 machine_mode mode
= TYPE_MODE (vectype
);
5338 /* vect_permute_store_chain requires the group size to be equal to 3 or
5339 be a power of two. */
5340 if (count
!= 3 && exact_log2 (count
) == -1)
5342 if (dump_enabled_p ())
5343 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5344 "the size of the group of accesses"
5345 " is not a power of 2 or not eqaul to 3\n");
5349 /* Check that the permutation is supported. */
5350 if (VECTOR_MODE_P (mode
))
5355 unsigned int j0
= 0, j1
= 0, j2
= 0;
5359 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5361 if (dump_enabled_p ())
5362 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5363 "cannot handle groups of 3 stores for"
5364 " variable-length vectors\n");
5368 vec_perm_builder
sel (nelt
, nelt
, 1);
5369 sel
.quick_grow (nelt
);
5370 vec_perm_indices indices
;
5371 for (j
= 0; j
< 3; j
++)
5373 int nelt0
= ((3 - j
) * nelt
) % 3;
5374 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5375 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5376 for (i
= 0; i
< nelt
; i
++)
5378 if (3 * i
+ nelt0
< nelt
)
5379 sel
[3 * i
+ nelt0
] = j0
++;
5380 if (3 * i
+ nelt1
< nelt
)
5381 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5382 if (3 * i
+ nelt2
< nelt
)
5383 sel
[3 * i
+ nelt2
] = 0;
5385 indices
.new_vector (sel
, 2, nelt
);
5386 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5388 if (dump_enabled_p ())
5389 dump_printf (MSG_MISSED_OPTIMIZATION
,
5390 "permutation op not supported by target.\n");
5394 for (i
= 0; i
< nelt
; i
++)
5396 if (3 * i
+ nelt0
< nelt
)
5397 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5398 if (3 * i
+ nelt1
< nelt
)
5399 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5400 if (3 * i
+ nelt2
< nelt
)
5401 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5403 indices
.new_vector (sel
, 2, nelt
);
5404 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5406 if (dump_enabled_p ())
5407 dump_printf (MSG_MISSED_OPTIMIZATION
,
5408 "permutation op not supported by target.\n");
5416 /* If length is not equal to 3 then only power of 2 is supported. */
5417 gcc_assert (pow2p_hwi (count
));
5418 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5420 /* The encoding has 2 interleaved stepped patterns. */
5421 if(!multiple_p (nelt
, 2))
5423 vec_perm_builder
sel (nelt
, 2, 3);
5425 for (i
= 0; i
< 3; i
++)
5428 sel
[i
* 2 + 1] = i
+ nelt
;
5430 vec_perm_indices
indices (sel
, 2, nelt
);
5431 if (can_vec_perm_const_p (mode
, mode
, indices
))
5433 for (i
= 0; i
< 6; i
++)
5434 sel
[i
] += exact_div (nelt
, 2);
5435 indices
.new_vector (sel
, 2, nelt
);
5436 if (can_vec_perm_const_p (mode
, mode
, indices
))
5442 if (dump_enabled_p ())
5443 dump_printf (MSG_MISSED_OPTIMIZATION
,
5444 "permutation op not supported by target.\n");
5448 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5449 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5452 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5455 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5456 vec_mask_len_store_lanes_optab
, vectype
,
5458 return IFN_MASK_LEN_STORE_LANES
;
5461 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5462 vec_mask_store_lanes_optab
, vectype
,
5464 return IFN_MASK_STORE_LANES
;
5468 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5469 vec_store_lanes_optab
, vectype
, count
))
5470 return IFN_STORE_LANES
;
5476 /* Function vect_permute_store_chain.
5478 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5479 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5480 the data correctly for the stores. Return the final references for stores
5483 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5484 The input is 4 vectors each containing 8 elements. We assign a number to
5485 each element, the input sequence is:
5487 1st vec: 0 1 2 3 4 5 6 7
5488 2nd vec: 8 9 10 11 12 13 14 15
5489 3rd vec: 16 17 18 19 20 21 22 23
5490 4th vec: 24 25 26 27 28 29 30 31
5492 The output sequence should be:
5494 1st vec: 0 8 16 24 1 9 17 25
5495 2nd vec: 2 10 18 26 3 11 19 27
5496 3rd vec: 4 12 20 28 5 13 21 30
5497 4th vec: 6 14 22 30 7 15 23 31
5499 i.e., we interleave the contents of the four vectors in their order.
5501 We use interleave_high/low instructions to create such output. The input of
5502 each interleave_high/low operation is two vectors:
5505 the even elements of the result vector are obtained left-to-right from the
5506 high/low elements of the first vector. The odd elements of the result are
5507 obtained left-to-right from the high/low elements of the second vector.
5508 The output of interleave_high will be: 0 4 1 5
5509 and of interleave_low: 2 6 3 7
5512 The permutation is done in log LENGTH stages. In each stage interleave_high
5513 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5514 where the first argument is taken from the first half of DR_CHAIN and the
5515 second argument from it's second half.
5518 I1: interleave_high (1st vec, 3rd vec)
5519 I2: interleave_low (1st vec, 3rd vec)
5520 I3: interleave_high (2nd vec, 4th vec)
5521 I4: interleave_low (2nd vec, 4th vec)
5523 The output for the first stage is:
5525 I1: 0 16 1 17 2 18 3 19
5526 I2: 4 20 5 21 6 22 7 23
5527 I3: 8 24 9 25 10 26 11 27
5528 I4: 12 28 13 29 14 30 15 31
5530 The output of the second stage, i.e. the final result is:
5532 I1: 0 8 16 24 1 9 17 25
5533 I2: 2 10 18 26 3 11 19 27
5534 I3: 4 12 20 28 5 13 21 30
5535 I4: 6 14 22 30 7 15 23 31. */
5538 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> &dr_chain
,
5539 unsigned int length
,
5540 stmt_vec_info stmt_info
,
5541 gimple_stmt_iterator
*gsi
,
5542 vec
<tree
> *result_chain
)
5544 tree vect1
, vect2
, high
, low
;
5546 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5547 tree perm_mask_low
, perm_mask_high
;
5549 tree perm3_mask_low
, perm3_mask_high
;
5550 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5552 result_chain
->quick_grow (length
);
5553 memcpy (result_chain
->address (), dr_chain
.address (),
5554 length
* sizeof (tree
));
5558 /* vect_grouped_store_supported ensures that this is constant. */
5559 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5560 unsigned int j0
= 0, j1
= 0, j2
= 0;
5562 vec_perm_builder
sel (nelt
, nelt
, 1);
5563 sel
.quick_grow (nelt
);
5564 vec_perm_indices indices
;
5565 for (j
= 0; j
< 3; j
++)
5567 int nelt0
= ((3 - j
) * nelt
) % 3;
5568 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5569 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5571 for (i
= 0; i
< nelt
; i
++)
5573 if (3 * i
+ nelt0
< nelt
)
5574 sel
[3 * i
+ nelt0
] = j0
++;
5575 if (3 * i
+ nelt1
< nelt
)
5576 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5577 if (3 * i
+ nelt2
< nelt
)
5578 sel
[3 * i
+ nelt2
] = 0;
5580 indices
.new_vector (sel
, 2, nelt
);
5581 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5583 for (i
= 0; i
< nelt
; i
++)
5585 if (3 * i
+ nelt0
< nelt
)
5586 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5587 if (3 * i
+ nelt1
< nelt
)
5588 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5589 if (3 * i
+ nelt2
< nelt
)
5590 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5592 indices
.new_vector (sel
, 2, nelt
);
5593 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5595 vect1
= dr_chain
[0];
5596 vect2
= dr_chain
[1];
5598 /* Create interleaving stmt:
5599 low = VEC_PERM_EXPR <vect1, vect2,
5600 {j, nelt, *, j + 1, nelt + j + 1, *,
5601 j + 2, nelt + j + 2, *, ...}> */
5602 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5603 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5604 vect2
, perm3_mask_low
);
5605 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5608 vect2
= dr_chain
[2];
5609 /* Create interleaving stmt:
5610 low = VEC_PERM_EXPR <vect1, vect2,
5611 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5612 6, 7, nelt + j + 2, ...}> */
5613 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5614 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5615 vect2
, perm3_mask_high
);
5616 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5617 (*result_chain
)[j
] = data_ref
;
5622 /* If length is not equal to 3 then only power of 2 is supported. */
5623 gcc_assert (pow2p_hwi (length
));
5625 /* The encoding has 2 interleaved stepped patterns. */
5626 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5627 vec_perm_builder
sel (nelt
, 2, 3);
5629 for (i
= 0; i
< 3; i
++)
5632 sel
[i
* 2 + 1] = i
+ nelt
;
5634 vec_perm_indices
indices (sel
, 2, nelt
);
5635 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5637 for (i
= 0; i
< 6; i
++)
5638 sel
[i
] += exact_div (nelt
, 2);
5639 indices
.new_vector (sel
, 2, nelt
);
5640 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5642 for (i
= 0, n
= log_length
; i
< n
; i
++)
5644 for (j
= 0; j
< length
/2; j
++)
5646 vect1
= dr_chain
[j
];
5647 vect2
= dr_chain
[j
+length
/2];
5649 /* Create interleaving stmt:
5650 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5652 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5653 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5654 vect2
, perm_mask_high
);
5655 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5656 (*result_chain
)[2*j
] = high
;
5658 /* Create interleaving stmt:
5659 low = VEC_PERM_EXPR <vect1, vect2,
5660 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5662 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5663 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5664 vect2
, perm_mask_low
);
5665 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5666 (*result_chain
)[2*j
+1] = low
;
5668 memcpy (dr_chain
.address (), result_chain
->address (),
5669 length
* sizeof (tree
));
5674 /* Function vect_setup_realignment
5676 This function is called when vectorizing an unaligned load using
5677 the dr_explicit_realign[_optimized] scheme.
5678 This function generates the following code at the loop prolog:
5681 x msq_init = *(floor(p)); # prolog load
5682 realignment_token = call target_builtin;
5684 x msq = phi (msq_init, ---)
5686 The stmts marked with x are generated only for the case of
5687 dr_explicit_realign_optimized.
5689 The code above sets up a new (vector) pointer, pointing to the first
5690 location accessed by STMT_INFO, and a "floor-aligned" load using that
5691 pointer. It also generates code to compute the "realignment-token"
5692 (if the relevant target hook was defined), and creates a phi-node at the
5693 loop-header bb whose arguments are the result of the prolog-load (created
5694 by this function) and the result of a load that takes place in the loop
5695 (to be created by the caller to this function).
5697 For the case of dr_explicit_realign_optimized:
5698 The caller to this function uses the phi-result (msq) to create the
5699 realignment code inside the loop, and sets up the missing phi argument,
5702 msq = phi (msq_init, lsq)
5703 lsq = *(floor(p')); # load in loop
5704 result = realign_load (msq, lsq, realignment_token);
5706 For the case of dr_explicit_realign:
5708 msq = *(floor(p)); # load in loop
5710 lsq = *(floor(p')); # load in loop
5711 result = realign_load (msq, lsq, realignment_token);
5714 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5715 a memory location that may be unaligned.
5716 BSI - place where new code is to be inserted.
5717 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5721 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5722 target hook, if defined.
5723 Return value - the result of the loop-header phi node. */
5726 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5727 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
5728 enum dr_alignment_support alignment_support_scheme
,
5730 class loop
**at_loop
)
5732 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5733 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5734 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5735 struct data_reference
*dr
= dr_info
->dr
;
5736 class loop
*loop
= NULL
;
5738 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
5744 tree msq_init
= NULL_TREE
;
5747 tree msq
= NULL_TREE
;
5748 gimple_seq stmts
= NULL
;
5749 bool compute_in_loop
= false;
5750 bool nested_in_vect_loop
= false;
5751 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5752 class loop
*loop_for_initial_load
= NULL
;
5756 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5757 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5760 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5761 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5763 /* We need to generate three things:
5764 1. the misalignment computation
5765 2. the extra vector load (for the optimized realignment scheme).
5766 3. the phi node for the two vectors from which the realignment is
5767 done (for the optimized realignment scheme). */
5769 /* 1. Determine where to generate the misalignment computation.
5771 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5772 calculation will be generated by this function, outside the loop (in the
5773 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5774 caller, inside the loop.
5776 Background: If the misalignment remains fixed throughout the iterations of
5777 the loop, then both realignment schemes are applicable, and also the
5778 misalignment computation can be done outside LOOP. This is because we are
5779 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5780 are a multiple of VS (the Vector Size), and therefore the misalignment in
5781 different vectorized LOOP iterations is always the same.
5782 The problem arises only if the memory access is in an inner-loop nested
5783 inside LOOP, which is now being vectorized using outer-loop vectorization.
5784 This is the only case when the misalignment of the memory access may not
5785 remain fixed throughout the iterations of the inner-loop (as explained in
5786 detail in vect_supportable_dr_alignment). In this case, not only is the
5787 optimized realignment scheme not applicable, but also the misalignment
5788 computation (and generation of the realignment token that is passed to
5789 REALIGN_LOAD) have to be done inside the loop.
5791 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5792 or not, which in turn determines if the misalignment is computed inside
5793 the inner-loop, or outside LOOP. */
5795 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5797 compute_in_loop
= true;
5798 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5802 /* 2. Determine where to generate the extra vector load.
5804 For the optimized realignment scheme, instead of generating two vector
5805 loads in each iteration, we generate a single extra vector load in the
5806 preheader of the loop, and in each iteration reuse the result of the
5807 vector load from the previous iteration. In case the memory access is in
5808 an inner-loop nested inside LOOP, which is now being vectorized using
5809 outer-loop vectorization, we need to determine whether this initial vector
5810 load should be generated at the preheader of the inner-loop, or can be
5811 generated at the preheader of LOOP. If the memory access has no evolution
5812 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5813 to be generated inside LOOP (in the preheader of the inner-loop). */
5815 if (nested_in_vect_loop
)
5817 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5818 bool invariant_in_outerloop
=
5819 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5820 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5823 loop_for_initial_load
= loop
;
5825 *at_loop
= loop_for_initial_load
;
5827 tree vuse
= NULL_TREE
;
5828 if (loop_for_initial_load
)
5830 pe
= loop_preheader_edge (loop_for_initial_load
);
5831 if (gphi
*vphi
= get_virtual_phi (loop_for_initial_load
->header
))
5832 vuse
= PHI_ARG_DEF_FROM_EDGE (vphi
, pe
);
5835 vuse
= gimple_vuse (gsi_stmt (*gsi
));
5837 /* 3. For the case of the optimized realignment, create the first vector
5838 load at the loop preheader. */
5840 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5842 /* Create msq_init = *(floor(p1)) in the loop preheader */
5845 gcc_assert (!compute_in_loop
);
5846 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5847 ptr
= vect_create_data_ref_ptr (vinfo
, stmt_info
, vectype
,
5848 loop_for_initial_load
, NULL_TREE
,
5849 &init_addr
, NULL
, &inc
, true);
5850 if (TREE_CODE (ptr
) == SSA_NAME
)
5851 new_temp
= copy_ssa_name (ptr
);
5853 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5854 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
5855 tree type
= TREE_TYPE (ptr
);
5856 new_stmt
= gimple_build_assign
5857 (new_temp
, BIT_AND_EXPR
, ptr
,
5858 fold_build2 (MINUS_EXPR
, type
,
5859 build_int_cst (type
, 0),
5860 build_int_cst (type
, align
)));
5861 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5862 gcc_assert (!new_bb
);
5864 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5865 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5866 vect_copy_ref_info (data_ref
, DR_REF (dr
));
5867 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5868 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5869 gimple_assign_set_lhs (new_stmt
, new_temp
);
5870 gimple_set_vuse (new_stmt
, vuse
);
5873 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5874 gcc_assert (!new_bb
);
5877 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5879 msq_init
= gimple_assign_lhs (new_stmt
);
5882 /* 4. Create realignment token using a target builtin, if available.
5883 It is done either inside the containing loop, or before LOOP (as
5884 determined above). */
5886 if (targetm
.vectorize
.builtin_mask_for_load
)
5891 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5894 /* Generate the INIT_ADDR computation outside LOOP. */
5895 init_addr
= vect_create_addr_base_for_vector_ref (vinfo
,
5900 pe
= loop_preheader_edge (loop
);
5901 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5902 gcc_assert (!new_bb
);
5905 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5908 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5909 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5911 vect_create_destination_var (scalar_dest
,
5912 gimple_call_return_type (new_stmt
));
5913 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5914 gimple_call_set_lhs (new_stmt
, new_temp
);
5916 if (compute_in_loop
)
5917 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5920 /* Generate the misalignment computation outside LOOP. */
5921 pe
= loop_preheader_edge (loop
);
5922 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5923 gcc_assert (!new_bb
);
5926 *realignment_token
= gimple_call_lhs (new_stmt
);
5928 /* The result of the CALL_EXPR to this builtin is determined from
5929 the value of the parameter and no global variables are touched
5930 which makes the builtin a "const" function. Requiring the
5931 builtin to have the "const" attribute makes it unnecessary
5932 to call mark_call_clobbered. */
5933 gcc_assert (TREE_READONLY (builtin_decl
));
5936 if (alignment_support_scheme
== dr_explicit_realign
)
5939 gcc_assert (!compute_in_loop
);
5940 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5943 /* 5. Create msq = phi <msq_init, lsq> in loop */
5945 pe
= loop_preheader_edge (containing_loop
);
5946 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5947 msq
= make_ssa_name (vec_dest
);
5948 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5949 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5955 /* Function vect_grouped_load_supported.
5957 COUNT is the size of the load group (the number of statements plus the
5958 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5959 only one statement, with a gap of COUNT - 1.
5961 Returns true if a suitable permute exists. */
5964 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5965 unsigned HOST_WIDE_INT count
)
5967 machine_mode mode
= TYPE_MODE (vectype
);
5969 /* If this is single-element interleaving with an element distance
5970 that leaves unused vector loads around punt - we at least create
5971 very sub-optimal code in that case (and blow up memory,
5973 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
5975 if (dump_enabled_p ())
5976 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5977 "single-element interleaving not supported "
5978 "for not adjacent vector loads\n");
5982 /* vect_permute_load_chain requires the group size to be equal to 3 or
5983 be a power of two. */
5984 if (count
!= 3 && exact_log2 (count
) == -1)
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5988 "the size of the group of accesses"
5989 " is not a power of 2 or not equal to 3\n");
5993 /* Check that the permutation is supported. */
5994 if (VECTOR_MODE_P (mode
))
6000 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
6002 if (dump_enabled_p ())
6003 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6004 "cannot handle groups of 3 loads for"
6005 " variable-length vectors\n");
6009 vec_perm_builder
sel (nelt
, nelt
, 1);
6010 sel
.quick_grow (nelt
);
6011 vec_perm_indices indices
;
6013 for (k
= 0; k
< 3; k
++)
6015 for (i
= 0; i
< nelt
; i
++)
6016 if (3 * i
+ k
< 2 * nelt
)
6020 indices
.new_vector (sel
, 2, nelt
);
6021 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6023 if (dump_enabled_p ())
6024 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6025 "shuffle of 3 loads is not supported by"
6029 for (i
= 0, j
= 0; i
< nelt
; i
++)
6030 if (3 * i
+ k
< 2 * nelt
)
6033 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6034 indices
.new_vector (sel
, 2, nelt
);
6035 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6037 if (dump_enabled_p ())
6038 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6039 "shuffle of 3 loads is not supported by"
6048 /* If length is not equal to 3 then only power of 2 is supported. */
6049 gcc_assert (pow2p_hwi (count
));
6050 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
6052 /* The encoding has a single stepped pattern. */
6053 vec_perm_builder
sel (nelt
, 1, 3);
6055 for (i
= 0; i
< 3; i
++)
6057 vec_perm_indices
indices (sel
, 2, nelt
);
6058 if (can_vec_perm_const_p (mode
, mode
, indices
))
6060 for (i
= 0; i
< 3; i
++)
6062 indices
.new_vector (sel
, 2, nelt
);
6063 if (can_vec_perm_const_p (mode
, mode
, indices
))
6069 if (dump_enabled_p ())
6070 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6071 "extract even/odd not supported by target\n");
6075 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6076 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6079 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
6082 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6083 vec_mask_len_load_lanes_optab
, vectype
,
6085 return IFN_MASK_LEN_LOAD_LANES
;
6088 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6089 vec_mask_load_lanes_optab
, vectype
,
6091 return IFN_MASK_LOAD_LANES
;
6095 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab
,
6097 return IFN_LOAD_LANES
;
6102 /* Function vect_permute_load_chain.
6104 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6105 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6106 the input data correctly. Return the final references for loads in
6109 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6110 The input is 4 vectors each containing 8 elements. We assign a number to each
6111 element, the input sequence is:
6113 1st vec: 0 1 2 3 4 5 6 7
6114 2nd vec: 8 9 10 11 12 13 14 15
6115 3rd vec: 16 17 18 19 20 21 22 23
6116 4th vec: 24 25 26 27 28 29 30 31
6118 The output sequence should be:
6120 1st vec: 0 4 8 12 16 20 24 28
6121 2nd vec: 1 5 9 13 17 21 25 29
6122 3rd vec: 2 6 10 14 18 22 26 30
6123 4th vec: 3 7 11 15 19 23 27 31
6125 i.e., the first output vector should contain the first elements of each
6126 interleaving group, etc.
6128 We use extract_even/odd instructions to create such output. The input of
6129 each extract_even/odd operation is two vectors
6133 and the output is the vector of extracted even/odd elements. The output of
6134 extract_even will be: 0 2 4 6
6135 and of extract_odd: 1 3 5 7
6138 The permutation is done in log LENGTH stages. In each stage extract_even
6139 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6140 their order. In our example,
6142 E1: extract_even (1st vec, 2nd vec)
6143 E2: extract_odd (1st vec, 2nd vec)
6144 E3: extract_even (3rd vec, 4th vec)
6145 E4: extract_odd (3rd vec, 4th vec)
6147 The output for the first stage will be:
6149 E1: 0 2 4 6 8 10 12 14
6150 E2: 1 3 5 7 9 11 13 15
6151 E3: 16 18 20 22 24 26 28 30
6152 E4: 17 19 21 23 25 27 29 31
6154 In order to proceed and create the correct sequence for the next stage (or
6155 for the correct output, if the second stage is the last one, as in our
6156 example), we first put the output of extract_even operation and then the
6157 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6158 The input for the second stage is:
6160 1st vec (E1): 0 2 4 6 8 10 12 14
6161 2nd vec (E3): 16 18 20 22 24 26 28 30
6162 3rd vec (E2): 1 3 5 7 9 11 13 15
6163 4th vec (E4): 17 19 21 23 25 27 29 31
6165 The output of the second stage:
6167 E1: 0 4 8 12 16 20 24 28
6168 E2: 2 6 10 14 18 22 26 30
6169 E3: 1 5 9 13 17 21 25 29
6170 E4: 3 7 11 15 19 23 27 31
6172 And RESULT_CHAIN after reordering:
6174 1st vec (E1): 0 4 8 12 16 20 24 28
6175 2nd vec (E3): 1 5 9 13 17 21 25 29
6176 3rd vec (E2): 2 6 10 14 18 22 26 30
6177 4th vec (E4): 3 7 11 15 19 23 27 31. */
6180 vect_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6181 unsigned int length
,
6182 stmt_vec_info stmt_info
,
6183 gimple_stmt_iterator
*gsi
,
6184 vec
<tree
> *result_chain
)
6186 tree data_ref
, first_vect
, second_vect
;
6187 tree perm_mask_even
, perm_mask_odd
;
6188 tree perm3_mask_low
, perm3_mask_high
;
6190 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6191 unsigned int i
, j
, log_length
= exact_log2 (length
);
6193 result_chain
->quick_grow (length
);
6194 memcpy (result_chain
->address (), dr_chain
.address (),
6195 length
* sizeof (tree
));
6199 /* vect_grouped_load_supported ensures that this is constant. */
6200 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6203 vec_perm_builder
sel (nelt
, nelt
, 1);
6204 sel
.quick_grow (nelt
);
6205 vec_perm_indices indices
;
6206 for (k
= 0; k
< 3; k
++)
6208 for (i
= 0; i
< nelt
; i
++)
6209 if (3 * i
+ k
< 2 * nelt
)
6213 indices
.new_vector (sel
, 2, nelt
);
6214 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
6216 for (i
= 0, j
= 0; i
< nelt
; i
++)
6217 if (3 * i
+ k
< 2 * nelt
)
6220 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6221 indices
.new_vector (sel
, 2, nelt
);
6222 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6224 first_vect
= dr_chain
[0];
6225 second_vect
= dr_chain
[1];
6227 /* Create interleaving stmt (low part of):
6228 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6230 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
6231 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6232 second_vect
, perm3_mask_low
);
6233 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6235 /* Create interleaving stmt (high part of):
6236 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6238 first_vect
= data_ref
;
6239 second_vect
= dr_chain
[2];
6240 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6241 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6242 second_vect
, perm3_mask_high
);
6243 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6244 (*result_chain
)[k
] = data_ref
;
6249 /* If length is not equal to 3 then only power of 2 is supported. */
6250 gcc_assert (pow2p_hwi (length
));
6252 /* The encoding has a single stepped pattern. */
6253 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6254 vec_perm_builder
sel (nelt
, 1, 3);
6256 for (i
= 0; i
< 3; ++i
)
6258 vec_perm_indices
indices (sel
, 2, nelt
);
6259 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
6261 for (i
= 0; i
< 3; ++i
)
6263 indices
.new_vector (sel
, 2, nelt
);
6264 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
6266 for (i
= 0; i
< log_length
; i
++)
6268 for (j
= 0; j
< length
; j
+= 2)
6270 first_vect
= dr_chain
[j
];
6271 second_vect
= dr_chain
[j
+1];
6273 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6274 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
6275 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6276 first_vect
, second_vect
,
6278 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6279 (*result_chain
)[j
/2] = data_ref
;
6281 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6282 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
6283 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6284 first_vect
, second_vect
,
6286 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6287 (*result_chain
)[j
/2+length
/2] = data_ref
;
6289 memcpy (dr_chain
.address (), result_chain
->address (),
6290 length
* sizeof (tree
));
6295 /* Function vect_shift_permute_load_chain.
6297 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6298 sequence of stmts to reorder the input data accordingly.
6299 Return the final references for loads in RESULT_CHAIN.
6300 Return true if successed, false otherwise.
6302 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6303 The input is 3 vectors each containing 8 elements. We assign a
6304 number to each element, the input sequence is:
6306 1st vec: 0 1 2 3 4 5 6 7
6307 2nd vec: 8 9 10 11 12 13 14 15
6308 3rd vec: 16 17 18 19 20 21 22 23
6310 The output sequence should be:
6312 1st vec: 0 3 6 9 12 15 18 21
6313 2nd vec: 1 4 7 10 13 16 19 22
6314 3rd vec: 2 5 8 11 14 17 20 23
6316 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6318 First we shuffle all 3 vectors to get correct elements order:
6320 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6321 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6322 3rd vec: (16 19 22) (17 20 23) (18 21)
6324 Next we unite and shift vector 3 times:
6327 shift right by 6 the concatenation of:
6328 "1st vec" and "2nd vec"
6329 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6330 "2nd vec" and "3rd vec"
6331 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6332 "3rd vec" and "1st vec"
6333 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6336 So that now new vectors are:
6338 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6339 2nd vec: (10 13) (16 19 22) (17 20 23)
6340 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6343 shift right by 5 the concatenation of:
6344 "1st vec" and "3rd vec"
6345 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6346 "2nd vec" and "1st vec"
6347 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6348 "3rd vec" and "2nd vec"
6349 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6352 So that now new vectors are:
6354 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6355 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6356 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6359 shift right by 5 the concatenation of:
6360 "1st vec" and "1st vec"
6361 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6362 shift right by 3 the concatenation of:
6363 "2nd vec" and "2nd vec"
6364 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6367 So that now all vectors are READY:
6368 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6369 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6370 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6372 This algorithm is faster than one in vect_permute_load_chain if:
6373 1. "shift of a concatination" is faster than general permutation.
6375 2. The TARGET machine can't execute vector instructions in parallel.
6376 This is because each step of the algorithm depends on previous.
6377 The algorithm in vect_permute_load_chain is much more parallel.
6379 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6383 vect_shift_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6384 unsigned int length
,
6385 stmt_vec_info stmt_info
,
6386 gimple_stmt_iterator
*gsi
,
6387 vec
<tree
> *result_chain
)
6389 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6390 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6391 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6394 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6395 machine_mode vmode
= TYPE_MODE (vectype
);
6397 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6399 unsigned HOST_WIDE_INT nelt
, vf
;
6400 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6401 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6402 /* Not supported for variable-length vectors. */
6405 vec_perm_builder
sel (nelt
, nelt
, 1);
6406 sel
.quick_grow (nelt
);
6408 result_chain
->quick_grow (length
);
6409 memcpy (result_chain
->address (), dr_chain
.address (),
6410 length
* sizeof (tree
));
6412 if (pow2p_hwi (length
) && vf
> 4)
6414 unsigned int j
, log_length
= exact_log2 (length
);
6415 for (i
= 0; i
< nelt
/ 2; ++i
)
6417 for (i
= 0; i
< nelt
/ 2; ++i
)
6418 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6419 vec_perm_indices
indices (sel
, 2, nelt
);
6420 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6422 if (dump_enabled_p ())
6423 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6424 "shuffle of 2 fields structure is not \
6425 supported by target\n");
6428 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6430 for (i
= 0; i
< nelt
/ 2; ++i
)
6432 for (i
= 0; i
< nelt
/ 2; ++i
)
6433 sel
[nelt
/ 2 + i
] = i
* 2;
6434 indices
.new_vector (sel
, 2, nelt
);
6435 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6437 if (dump_enabled_p ())
6438 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6439 "shuffle of 2 fields structure is not \
6440 supported by target\n");
6443 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6445 /* Generating permutation constant to shift all elements.
6446 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6447 for (i
= 0; i
< nelt
; i
++)
6448 sel
[i
] = nelt
/ 2 + i
;
6449 indices
.new_vector (sel
, 2, nelt
);
6450 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6454 "shift permutation is not supported by target\n");
6457 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6459 /* Generating permutation constant to select vector from 2.
6460 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6461 for (i
= 0; i
< nelt
/ 2; i
++)
6463 for (i
= nelt
/ 2; i
< nelt
; i
++)
6465 indices
.new_vector (sel
, 2, nelt
);
6466 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6470 "select is not supported by target\n");
6473 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6475 for (i
= 0; i
< log_length
; i
++)
6477 for (j
= 0; j
< length
; j
+= 2)
6479 first_vect
= dr_chain
[j
];
6480 second_vect
= dr_chain
[j
+ 1];
6482 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6483 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6484 first_vect
, first_vect
,
6486 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6489 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6490 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6491 second_vect
, second_vect
,
6493 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6496 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6497 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6498 vect
[0], vect
[1], shift1_mask
);
6499 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6500 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6502 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6503 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6504 vect
[0], vect
[1], select_mask
);
6505 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6506 (*result_chain
)[j
/2] = data_ref
;
6508 memcpy (dr_chain
.address (), result_chain
->address (),
6509 length
* sizeof (tree
));
6513 if (length
== 3 && vf
> 2)
6515 unsigned int k
= 0, l
= 0;
6517 /* Generating permutation constant to get all elements in rigth order.
6518 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6519 for (i
= 0; i
< nelt
; i
++)
6521 if (3 * k
+ (l
% 3) >= nelt
)
6524 l
+= (3 - (nelt
% 3));
6526 sel
[i
] = 3 * k
+ (l
% 3);
6529 vec_perm_indices
indices (sel
, 2, nelt
);
6530 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6532 if (dump_enabled_p ())
6533 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6534 "shuffle of 3 fields structure is not \
6535 supported by target\n");
6538 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6540 /* Generating permutation constant to shift all elements.
6541 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6542 for (i
= 0; i
< nelt
; i
++)
6543 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6544 indices
.new_vector (sel
, 2, nelt
);
6545 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6547 if (dump_enabled_p ())
6548 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6549 "shift permutation is not supported by target\n");
6552 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6554 /* Generating permutation constant to shift all elements.
6555 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6556 for (i
= 0; i
< nelt
; i
++)
6557 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6558 indices
.new_vector (sel
, 2, nelt
);
6559 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6563 "shift permutation is not supported by target\n");
6566 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6568 /* Generating permutation constant to shift all elements.
6569 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6570 for (i
= 0; i
< nelt
; i
++)
6571 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6572 indices
.new_vector (sel
, 2, nelt
);
6573 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6575 if (dump_enabled_p ())
6576 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6577 "shift permutation is not supported by target\n");
6580 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6582 /* Generating permutation constant to shift all elements.
6583 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6584 for (i
= 0; i
< nelt
; i
++)
6585 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6586 indices
.new_vector (sel
, 2, nelt
);
6587 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6591 "shift permutation is not supported by target\n");
6594 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6596 for (k
= 0; k
< 3; k
++)
6598 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6599 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6600 dr_chain
[k
], dr_chain
[k
],
6602 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6606 for (k
= 0; k
< 3; k
++)
6608 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6609 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6610 vect
[k
% 3], vect
[(k
+ 1) % 3],
6612 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6613 vect_shift
[k
] = data_ref
;
6616 for (k
= 0; k
< 3; k
++)
6618 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6619 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6620 vect_shift
[(4 - k
) % 3],
6621 vect_shift
[(3 - k
) % 3],
6623 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6627 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6629 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6630 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6631 vect
[0], shift3_mask
);
6632 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6633 (*result_chain
)[nelt
% 3] = data_ref
;
6635 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6636 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6637 vect
[1], shift4_mask
);
6638 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6639 (*result_chain
)[0] = data_ref
;
6645 /* Function vect_transform_grouped_load.
6647 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6648 to perform their permutation and ascribe the result vectorized statements to
6649 the scalar statements.
6653 vect_transform_grouped_load (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6655 int size
, gimple_stmt_iterator
*gsi
)
6658 vec
<tree
> result_chain
= vNULL
;
6660 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6661 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6662 vectors, that are ready for vector computation. */
6663 result_chain
.create (size
);
6665 /* If reassociation width for vector type is 2 or greater target machine can
6666 execute 2 or more vector instructions in parallel. Otherwise try to
6667 get chain for loads group using vect_shift_permute_load_chain. */
6668 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
6669 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6671 || !vect_shift_permute_load_chain (vinfo
, dr_chain
, size
, stmt_info
,
6672 gsi
, &result_chain
))
6673 vect_permute_load_chain (vinfo
, dr_chain
,
6674 size
, stmt_info
, gsi
, &result_chain
);
6675 vect_record_grouped_load_vectors (vinfo
, stmt_info
, result_chain
);
6676 result_chain
.release ();
6679 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6680 generated as part of the vectorization of STMT_INFO. Assign the statement
6681 for each vector to the associated scalar statement. */
6684 vect_record_grouped_load_vectors (vec_info
*, stmt_vec_info stmt_info
,
6685 vec
<tree
> result_chain
)
6687 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6688 unsigned int i
, gap_count
;
6691 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6692 Since we scan the chain starting from it's first node, their order
6693 corresponds the order of data-refs in RESULT_CHAIN. */
6694 stmt_vec_info next_stmt_info
= first_stmt_info
;
6696 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
6698 if (!next_stmt_info
)
6701 /* Skip the gaps. Loads created for the gaps will be removed by dead
6702 code elimination pass later. No need to check for the first stmt in
6703 the group, since it always exists.
6704 DR_GROUP_GAP is the number of steps in elements from the previous
6705 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6706 correspond to the gaps. */
6707 if (next_stmt_info
!= first_stmt_info
6708 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
6714 /* ??? The following needs cleanup after the removal of
6715 DR_GROUP_SAME_DR_STMT. */
6718 gimple
*new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
6719 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6720 copies, and we put the new vector statement last. */
6721 STMT_VINFO_VEC_STMTS (next_stmt_info
).safe_push (new_stmt
);
6723 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
6729 /* Function vect_force_dr_alignment_p.
6731 Returns whether the alignment of a DECL can be forced to be aligned
6732 on ALIGNMENT bit boundary. */
6735 vect_can_force_dr_alignment_p (const_tree decl
, poly_uint64 alignment
)
6740 if (decl_in_symtab_p (decl
)
6741 && !symtab_node::get (decl
)->can_increase_alignment_p ())
6744 if (TREE_STATIC (decl
))
6745 return (known_le (alignment
,
6746 (unsigned HOST_WIDE_INT
) MAX_OFILE_ALIGNMENT
));
6748 return (known_le (alignment
, (unsigned HOST_WIDE_INT
) MAX_STACK_ALIGNMENT
));
6751 /* Return whether the data reference DR_INFO is supported with respect to its
6753 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6754 it is aligned, i.e., check if it is possible to vectorize it with different
6757 enum dr_alignment_support
6758 vect_supportable_dr_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
6759 tree vectype
, int misalignment
)
6761 data_reference
*dr
= dr_info
->dr
;
6762 stmt_vec_info stmt_info
= dr_info
->stmt
;
6763 machine_mode mode
= TYPE_MODE (vectype
);
6764 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6765 class loop
*vect_loop
= NULL
;
6766 bool nested_in_vect_loop
= false;
6768 if (misalignment
== 0)
6771 /* For now assume all conditional loads/stores support unaligned
6772 access without any special code. */
6773 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
6774 if (gimple_call_internal_p (stmt
)
6775 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6776 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6777 return dr_unaligned_supported
;
6781 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6782 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
6785 /* Possibly unaligned access. */
6787 /* We can choose between using the implicit realignment scheme (generating
6788 a misaligned_move stmt) and the explicit realignment scheme (generating
6789 aligned loads with a REALIGN_LOAD). There are two variants to the
6790 explicit realignment scheme: optimized, and unoptimized.
6791 We can optimize the realignment only if the step between consecutive
6792 vector loads is equal to the vector size. Since the vector memory
6793 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6794 is guaranteed that the misalignment amount remains the same throughout the
6795 execution of the vectorized loop. Therefore, we can create the
6796 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6797 at the loop preheader.
6799 However, in the case of outer-loop vectorization, when vectorizing a
6800 memory access in the inner-loop nested within the LOOP that is now being
6801 vectorized, while it is guaranteed that the misalignment of the
6802 vectorized memory access will remain the same in different outer-loop
6803 iterations, it is *not* guaranteed that is will remain the same throughout
6804 the execution of the inner-loop. This is because the inner-loop advances
6805 with the original scalar step (and not in steps of VS). If the inner-loop
6806 step happens to be a multiple of VS, then the misalignment remains fixed
6807 and we can use the optimized realignment scheme. For example:
6813 When vectorizing the i-loop in the above example, the step between
6814 consecutive vector loads is 1, and so the misalignment does not remain
6815 fixed across the execution of the inner-loop, and the realignment cannot
6816 be optimized (as illustrated in the following pseudo vectorized loop):
6818 for (i=0; i<N; i+=4)
6819 for (j=0; j<M; j++){
6820 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6821 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6822 // (assuming that we start from an aligned address).
6825 We therefore have to use the unoptimized realignment scheme:
6827 for (i=0; i<N; i+=4)
6828 for (j=k; j<M; j+=4)
6829 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6830 // that the misalignment of the initial address is
6833 The loop can then be vectorized as follows:
6835 for (k=0; k<4; k++){
6836 rt = get_realignment_token (&vp[k]);
6837 for (i=0; i<N; i+=4){
6839 for (j=k; j<M; j+=4){
6841 va = REALIGN_LOAD <v1,v2,rt>;
6848 if (DR_IS_READ (dr
))
6850 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6851 && (!targetm
.vectorize
.builtin_mask_for_load
6852 || targetm
.vectorize
.builtin_mask_for_load ()))
6854 /* If we are doing SLP then the accesses need not have the
6855 same alignment, instead it depends on the SLP group size. */
6857 && STMT_SLP_TYPE (stmt_info
)
6858 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
6859 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6861 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
6862 TYPE_VECTOR_SUBPARTS (vectype
))))
6864 else if (!loop_vinfo
6865 || (nested_in_vect_loop
6866 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
6867 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6868 return dr_explicit_realign
;
6870 return dr_explicit_realign_optimized
;
6874 bool is_packed
= false;
6875 tree type
= TREE_TYPE (DR_REF (dr
));
6876 if (misalignment
== DR_MISALIGNMENT_UNKNOWN
)
6877 is_packed
= not_size_aligned (DR_REF (dr
));
6878 if (targetm
.vectorize
.support_vector_misalignment (mode
, type
, misalignment
,
6880 return dr_unaligned_supported
;
6883 return dr_unaligned_unsupported
;