1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
62 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
63 tree vectype
, unsigned HOST_WIDE_INT count
)
65 machine_mode mode
, array_mode
;
68 mode
= TYPE_MODE (vectype
);
69 if (!targetm
.array_mode (mode
, count
).exists (&array_mode
))
71 poly_uint64 bits
= count
* GET_MODE_BITSIZE (mode
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 if (!int_mode_for_size (bits
, limit_p
).exists (&array_mode
))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode
), count
);
83 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
87 "cannot use %s<%s><%s>\n", name
,
88 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE
, vect_location
,
94 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
95 GET_MODE_NAME (mode
));
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info
, tree scalar_type
)
121 HOST_WIDE_INT lhs
, rhs
;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
128 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
130 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
133 scalar_type
= TREE_TYPE (gimple_assign_lhs (assign
));
134 if (gimple_assign_cast_p (assign
)
135 || gimple_assign_rhs_code (assign
) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign
) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign
) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign
) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign
) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign
) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign
) == FLOAT_EXPR
)
143 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
145 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
147 scalar_type
= rhs_type
;
150 else if (gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
153 if (gimple_call_internal_p (call
))
155 internal_fn ifn
= gimple_call_internal_fn (call
);
156 if (internal_load_fn_p (ifn
))
157 /* For loads the LHS type does the trick. */
159 else if (internal_store_fn_p (ifn
))
161 /* For stores use the type of the stored value. */
162 i
= internal_fn_stored_value_index (ifn
);
163 scalar_type
= TREE_TYPE (gimple_call_arg (call
, i
));
166 else if (internal_fn_mask_index (ifn
) == 0)
169 if (i
< gimple_call_num_args (call
))
171 tree rhs_type
= TREE_TYPE (gimple_call_arg (call
, i
));
172 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type
)))
174 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
176 scalar_type
= rhs_type
;
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186 tested at run-time. Return TRUE if DDR was successfully inserted.
187 Return false if versioning is not supported. */
190 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
192 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
194 if ((unsigned) param_vect_max_version_for_alias_checks
== 0)
195 return opt_result::failure_at (vect_location
,
196 "will not create alias checks, as"
197 " --param vect-max-version-for-alias-checks"
201 = runtime_alias_check_p (ddr
, loop
,
202 optimize_loop_nest_for_speed_p (loop
));
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
207 return opt_result::success ();
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
213 vect_check_nonzero_value (loop_vec_info loop_vinfo
, tree value
)
215 const vec
<tree
> &checks
= LOOP_VINFO_CHECK_NONZERO (loop_vinfo
);
216 for (unsigned int i
= 0; i
< checks
.length(); ++i
)
217 if (checks
[i
] == value
)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE
, vect_location
,
222 "need run-time check that %T is nonzero\n",
224 LOOP_VINFO_CHECK_NONZERO (loop_vinfo
).safe_push (value
);
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229 DR_INFO_B. At least one of the accesses is a write. */
232 vect_preserves_scalar_order_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
)
234 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
235 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
237 /* Single statements are always kept in their original order. */
238 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
239 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
242 /* STMT_A and STMT_B belong to overlapping groups. All loads are
243 emitted at the position of the first scalar load.
244 Stores in a group are emitted at the position of the last scalar store.
245 Compute that position and check whether the resulting order matches
247 stmt_vec_info il_a
= DR_GROUP_FIRST_ELEMENT (stmtinfo_a
);
250 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a
)))
251 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
252 s
= DR_GROUP_NEXT_ELEMENT (s
))
253 il_a
= get_later_stmt (il_a
, s
);
254 else /* DR_IS_READ */
255 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
256 s
= DR_GROUP_NEXT_ELEMENT (s
))
257 if (get_later_stmt (il_a
, s
) == il_a
)
262 stmt_vec_info il_b
= DR_GROUP_FIRST_ELEMENT (stmtinfo_b
);
265 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b
)))
266 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
267 s
= DR_GROUP_NEXT_ELEMENT (s
))
268 il_b
= get_later_stmt (il_b
, s
);
269 else /* DR_IS_READ */
270 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
271 s
= DR_GROUP_NEXT_ELEMENT (s
))
272 if (get_later_stmt (il_b
, s
) == il_b
)
277 bool a_after_b
= (get_later_stmt (stmtinfo_a
, stmtinfo_b
) == stmtinfo_a
);
278 return (get_later_stmt (il_a
, il_b
) == il_a
) == a_after_b
;
281 /* A subroutine of vect_analyze_data_ref_dependence. Handle
282 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283 distances. These distances are conservatively correct but they don't
284 reflect a guaranteed dependence.
286 Return true if this function does all the work necessary to avoid
287 an alias or false if the caller should use the dependence distances
288 to limit the vectorization factor in the usual way. LOOP_DEPTH is
289 the depth of the loop described by LOOP_VINFO and the other arguments
290 are as for vect_analyze_data_ref_dependence. */
293 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
294 loop_vec_info loop_vinfo
,
295 int loop_depth
, unsigned int *max_vf
)
297 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
298 for (lambda_vector
&dist_v
: DDR_DIST_VECTS (ddr
))
300 int dist
= dist_v
[loop_depth
];
301 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
303 /* If the user asserted safelen >= DIST consecutive iterations
304 can be executed concurrently, assume independence.
306 ??? An alternative would be to add the alias check even
307 in this case, and vectorize the fallback loop with the
308 maximum VF set to safelen. However, if the user has
309 explicitly given a length, it's less likely that that
311 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
313 if ((unsigned int) loop
->safelen
< *max_vf
)
314 *max_vf
= loop
->safelen
;
315 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
319 /* For dependence distances of 2 or more, we have the option
320 of limiting VF or checking for an alias at runtime.
321 Prefer to check at runtime if we can, to avoid limiting
322 the VF unnecessarily when the bases are in fact independent.
324 Note that the alias checks will be removed if the VF ends up
325 being small enough. */
326 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
327 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
328 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a
->stmt
)
329 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b
->stmt
)
330 && vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
));
337 /* Function vect_analyze_data_ref_dependence.
339 FIXME: I needed to change the sense of the returned flag.
341 Return a failure result if there (might) exist a dependence between a
342 memory-reference DRA and a memory-reference DRB. When versioning for alias
343 may check a dependence at run-time, return success. Adjust *MAX_VF according to
344 the data dependence. */
347 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
348 loop_vec_info loop_vinfo
,
349 unsigned int *max_vf
)
352 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
353 struct data_reference
*dra
= DDR_A (ddr
);
354 struct data_reference
*drb
= DDR_B (ddr
);
355 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (dra
);
356 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (drb
);
357 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
358 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
359 lambda_vector dist_v
;
360 unsigned int loop_depth
;
362 /* If user asserted safelen consecutive iterations can be
363 executed concurrently, assume independence. */
364 auto apply_safelen
= [&]()
366 if (loop
->safelen
>= 2)
368 if ((unsigned int) loop
->safelen
< *max_vf
)
369 *max_vf
= loop
->safelen
;
370 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
383 return opt_result::success ();
386 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
387 return opt_result::success ();
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
396 return opt_result::success ();
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
404 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
405 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
407 get_alias_set (DR_REF (drb
))))
408 return opt_result::success ();
410 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
411 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
413 if (apply_safelen ())
414 return opt_result::success ();
416 return opt_result::failure_at
418 "possible alias involving gather/scatter between %T and %T\n",
419 DR_REF (dra
), DR_REF (drb
));
422 /* Unknown data dependence. */
423 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
425 if (apply_safelen ())
426 return opt_result::success ();
428 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
430 "versioning for alias required: "
431 "can't determine dependence between %T and %T\n",
432 DR_REF (dra
), DR_REF (drb
));
434 /* Add to list of ddrs that need to be tested at run-time. */
435 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
438 /* Known data dependence. */
439 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
441 if (apply_safelen ())
442 return opt_result::success ();
444 if (dump_enabled_p ())
445 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
446 "versioning for alias required: "
447 "bad dist vector for %T and %T\n",
448 DR_REF (dra
), DR_REF (drb
));
449 /* Add to list of ddrs that need to be tested at run-time. */
450 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
453 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
455 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
456 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
458 return opt_result::success ();
460 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
462 int dist
= dist_v
[loop_depth
];
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE
, vect_location
,
466 "dependence distance = %d.\n", dist
);
470 if (dump_enabled_p ())
471 dump_printf_loc (MSG_NOTE
, vect_location
,
472 "dependence distance == 0 between %T and %T\n",
473 DR_REF (dra
), DR_REF (drb
));
475 /* When we perform grouped accesses and perform implicit CSE
476 by detecting equal accesses and doing disambiguation with
477 runtime alias tests like for
485 where we will end up loading { a[i], a[i+1] } once, make
486 sure that inserting group loads before the first load and
487 stores after the last store will do the right thing.
488 Similar for groups like
492 where loads from the group interleave with the store. */
493 if (!vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
494 return opt_result::failure_at (stmtinfo_a
->stmt
,
495 "READ_WRITE dependence"
496 " in interleaving.\n");
498 if (loop
->safelen
< 2)
500 tree indicator
= dr_zero_step_indicator (dra
);
501 if (!indicator
|| integer_zerop (indicator
))
502 return opt_result::failure_at (stmtinfo_a
->stmt
,
503 "access also has a zero step\n");
504 else if (TREE_CODE (indicator
) != INTEGER_CST
)
505 vect_check_nonzero_value (loop_vinfo
, indicator
);
510 if (dist
> 0 && DDR_REVERSED_P (ddr
))
512 /* If DDR_REVERSED_P the order of the data-refs in DDR was
513 reversed (to make distance vector positive), and the actual
514 distance is negative. */
515 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE
, vect_location
,
517 "dependence distance negative.\n");
518 /* When doing outer loop vectorization, we need to check if there is
519 a backward dependence at the inner loop level if the dependence
520 at the outer loop is reversed. See PR81740. */
521 if (nested_in_vect_loop_p (loop
, stmtinfo_a
)
522 || nested_in_vect_loop_p (loop
, stmtinfo_b
))
524 unsigned inner_depth
= index_in_loop_nest (loop
->inner
->num
,
525 DDR_LOOP_NEST (ddr
));
526 if (dist_v
[inner_depth
] < 0)
527 return opt_result::failure_at (stmtinfo_a
->stmt
,
528 "not vectorized, dependence "
529 "between data-refs %T and %T\n",
530 DR_REF (dra
), DR_REF (drb
));
532 /* Record a negative dependence distance to later limit the
533 amount of stmt copying / unrolling we can perform.
534 Only need to handle read-after-write dependence. */
536 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
537 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
538 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
542 unsigned int abs_dist
= abs (dist
);
543 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
545 /* The dependence distance requires reduction of the maximal
546 vectorization factor. */
548 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE
, vect_location
,
550 "adjusting maximal vectorization factor to %i\n",
554 if (abs_dist
>= *max_vf
)
556 /* Dependence distance does not create dependence, as far as
557 vectorization is concerned, in this case. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE
, vect_location
,
560 "dependence distance >= VF.\n");
564 return opt_result::failure_at (stmtinfo_a
->stmt
,
565 "not vectorized, possible dependence "
566 "between data-refs %T and %T\n",
567 DR_REF (dra
), DR_REF (drb
));
570 return opt_result::success ();
573 /* Function vect_analyze_data_ref_dependences.
575 Examine all the data references in the loop, and make sure there do not
576 exist any data dependences between them. Set *MAX_VF according to
577 the maximum vectorization factor the data dependences allow. */
580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
581 unsigned int *max_vf
)
584 struct data_dependence_relation
*ddr
;
586 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
588 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
590 LOOP_VINFO_DDRS (loop_vinfo
)
591 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
592 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
593 /* We do not need read-read dependences. */
594 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
595 &LOOP_VINFO_DDRS (loop_vinfo
),
596 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
601 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
603 /* For epilogues we either have no aliases or alias versioning
604 was applied to original loop. Therefore we may just get max_vf
605 using VF of original loop. */
606 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
607 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
609 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
612 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
617 return opt_result::success ();
621 /* Function vect_slp_analyze_data_ref_dependence.
623 Return TRUE if there (might) exist a dependence between a memory-reference
624 DRA and a memory-reference DRB for VINFO. When versioning for alias
625 may check a dependence at run-time, return FALSE. DRA and DRB are
626 the two data references described by DDR. */
629 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
630 struct data_dependence_relation
*ddr
)
632 struct data_reference
*dra
= DDR_A (ddr
);
633 struct data_reference
*drb
= DDR_B (ddr
);
634 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
635 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
637 /* We need to check dependences of statements marked as unvectorizable
638 as well, they still can prohibit vectorization. */
640 /* Independent data accesses. */
641 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
647 /* Read-read is OK. */
648 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
651 /* If dra and drb are part of the same interleaving chain consider
653 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
654 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
655 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
658 /* Unknown data dependence. */
659 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
663 "can't determine dependence between %T and %T\n",
664 DR_REF (dra
), DR_REF (drb
));
666 else if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE
, vect_location
,
668 "determined dependence between %T and %T\n",
669 DR_REF (dra
), DR_REF (drb
));
675 /* Analyze dependences involved in the transform of SLP NODE. STORES
676 contains the vector of scalar stores of this instance if we are
677 disambiguating the loads, and LAST_STORE_INFO is the last of those stores. */
680 vect_slp_analyze_node_dependences (vec_info
*vinfo
, slp_tree node
,
681 vec
<stmt_vec_info
> stores
,
682 stmt_vec_info last_store_info
)
684 /* This walks over all stmts involved in the SLP load/store done
685 in NODE verifying we can sink them up to the last stmt in the
687 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
689 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
690 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
692 stmt_vec_info access_info
693 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
694 if (access_info
== last_access_info
)
696 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
698 bool ref_initialized_p
= false;
699 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
700 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
702 gimple
*stmt
= gsi_stmt (gsi
);
703 if (! gimple_vuse (stmt
))
706 /* If we couldn't record a (single) data reference for this
707 stmt we have to resort to the alias oracle. */
708 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
709 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
712 /* We are moving a store - this means
713 we cannot use TBAA for disambiguation. */
714 if (!ref_initialized_p
)
715 ao_ref_init (&ref
, DR_REF (dr_a
));
716 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
717 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
722 bool dependent
= false;
723 /* If we run into a store of this same instance (we've just
724 marked those) then delay dependence checking until we run
725 into the last store because this is where it will have
726 been sunk to (and we verify if we can do that as well). */
727 if (gimple_visited_p (stmt
))
729 if (stmt_info
!= last_store_info
)
732 for (stmt_vec_info
&store_info
: stores
)
734 data_reference
*store_dr
735 = STMT_VINFO_DATA_REF (store_info
);
736 ddr_p ddr
= initialize_data_dependence_relation
737 (dr_a
, store_dr
, vNULL
);
739 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
740 free_dependence_relation (ddr
);
747 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
749 dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
750 free_dependence_relation (ddr
);
757 else /* DR_IS_READ */
759 stmt_vec_info first_access_info
760 = vect_find_first_scalar_stmt_in_slp (node
);
761 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
763 stmt_vec_info access_info
764 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
765 if (access_info
== first_access_info
)
767 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
769 bool ref_initialized_p
= false;
770 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
771 gsi_stmt (gsi
) != first_access_info
->stmt
; gsi_prev (&gsi
))
773 gimple
*stmt
= gsi_stmt (gsi
);
774 if (! gimple_vdef (stmt
))
777 /* If we couldn't record a (single) data reference for this
778 stmt we have to resort to the alias oracle. */
779 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
780 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
782 /* We are hoisting a load - this means we can use
783 TBAA for disambiguation. */
784 if (!ref_initialized_p
)
785 ao_ref_init (&ref
, DR_REF (dr_a
));
786 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, true))
790 /* Resort to dependence checking below. */
796 bool dependent
= false;
797 /* If we run into a store of this same instance (we've just
798 marked those) then delay dependence checking until we run
799 into the last store because this is where it will have
800 been sunk to (and we verify if we can do that as well). */
801 if (gimple_visited_p (stmt
))
803 if (stmt_info
!= last_store_info
)
806 for (stmt_vec_info
&store_info
: stores
)
808 data_reference
*store_dr
809 = STMT_VINFO_DATA_REF (store_info
);
810 ddr_p ddr
= initialize_data_dependence_relation
811 (dr_a
, store_dr
, vNULL
);
813 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
814 free_dependence_relation (ddr
);
821 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
823 dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
824 free_dependence_relation (ddr
);
835 /* Function vect_slp_analyze_instance_dependence.
837 Examine the data references of SLP instance INSTANCE in VINFO, and make
838 sure there do not exist any data dependences that prevent sinking its
839 stores and loads to the vectorized stmt insert location. */
842 vect_slp_analyze_instance_dependence (vec_info
*vinfo
, slp_instance instance
)
844 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
846 /* The stores of this instance are at the root of the SLP tree. */
847 slp_tree store
= NULL
;
848 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
)
849 store
= SLP_INSTANCE_TREE (instance
);
851 /* Verify we can sink stores to the vectorized stmt insert location. */
852 stmt_vec_info last_store_info
= NULL
;
855 if (! vect_slp_analyze_node_dependences (vinfo
, store
, vNULL
, NULL
))
858 /* Mark stores in this instance and remember the last one. */
859 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
860 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
861 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
866 /* Verify we can sink loads to the vectorized stmt insert location,
867 special-casing stores of this instance. */
868 for (slp_tree
&load
: SLP_INSTANCE_LOADS (instance
))
869 if (! vect_slp_analyze_node_dependences (vinfo
, load
,
871 ? SLP_TREE_SCALAR_STMTS (store
)
872 : vNULL
, last_store_info
))
878 /* Unset the visited flag. */
880 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
881 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
886 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
890 dr_misalignment (dr_vec_info
*dr_info
, tree vectype
, poly_int64 offset
)
892 HOST_WIDE_INT diff
= 0;
893 /* Alignment is only analyzed for the first element of a DR group,
894 use that but adjust misalignment by the offset of the access. */
895 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
))
897 dr_vec_info
*first_dr
898 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
));
899 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
900 INTEGER_CSTs and the first element in the group has the lowest
902 diff
= (TREE_INT_CST_LOW (DR_INIT (dr_info
->dr
))
903 - TREE_INT_CST_LOW (DR_INIT (first_dr
->dr
)));
904 gcc_assert (diff
>= 0);
908 int misalign
= dr_info
->misalignment
;
909 gcc_assert (misalign
!= DR_MISALIGNMENT_UNINITIALIZED
);
910 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
913 /* If the access is only aligned for a vector type with smaller alignment
914 requirement the access has unknown misalignment. */
915 if (maybe_lt (dr_info
->target_alignment
* BITS_PER_UNIT
,
916 targetm
.vectorize
.preferred_vector_alignment (vectype
)))
917 return DR_MISALIGNMENT_UNKNOWN
;
919 /* Apply the offset from the DR group start and the externally supplied
920 offset which can for example result from a negative stride access. */
921 poly_int64 misalignment
= misalign
+ diff
+ offset
;
923 /* vect_compute_data_ref_alignment will have ensured that target_alignment
924 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
925 unsigned HOST_WIDE_INT target_alignment_c
926 = dr_info
->target_alignment
.to_constant ();
927 if (!known_misalignment (misalignment
, target_alignment_c
, &misalign
))
928 return DR_MISALIGNMENT_UNKNOWN
;
932 /* Record the base alignment guarantee given by DRB, which occurs
936 vect_record_base_alignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
937 innermost_loop_behavior
*drb
)
940 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> &entry
941 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
942 if (!existed
|| entry
.second
->base_alignment
< drb
->base_alignment
)
944 entry
= std::make_pair (stmt_info
, drb
);
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE
, vect_location
,
947 "recording new base alignment for %T\n"
949 " misalignment: %d\n"
953 drb
->base_misalignment
,
958 /* If the region we're going to vectorize is reached, all unconditional
959 data references occur at least once. We can therefore pool the base
960 alignment guarantees from each unconditional reference. Do this by
961 going through all the data references in VINFO and checking whether
962 the containing statement makes the reference unconditionally. If so,
963 record the alignment of the base address in VINFO so that it can be
964 used for all other references with the same base. */
967 vect_record_base_alignments (vec_info
*vinfo
)
969 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
970 class loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
971 for (data_reference
*dr
: vinfo
->shared
->datarefs
)
973 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
974 stmt_vec_info stmt_info
= dr_info
->stmt
;
975 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
976 && STMT_VINFO_VECTORIZABLE (stmt_info
)
977 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
979 vect_record_base_alignment (vinfo
, stmt_info
, &DR_INNERMOST (dr
));
981 /* If DR is nested in the loop that is being vectorized, we can also
982 record the alignment of the base wrt the outer loop. */
983 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
984 vect_record_base_alignment
985 (vinfo
, stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
990 /* Function vect_compute_data_ref_alignment
992 Compute the misalignment of the data reference DR_INFO when vectorizing
996 1. initialized misalignment info for DR_INFO
998 FOR NOW: No analysis is actually performed. Misalignment is calculated
999 only for trivial cases. TODO. */
1002 vect_compute_data_ref_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
/* Cache the statement, VINFO's table of pooled base alignments and the
   accessed reference.  LOOP_VINFO is VINFO converted with dyn_cast; LOOP
   starts out NULL and is assigned from LOOP_VINFO further down.  */
1005 stmt_vec_info stmt_info
= dr_info
->stmt
;
1006 vec_base_alignments
*base_alignments
= &vinfo
->base_alignments
;
1007 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1008 class loop
*loop
= NULL
;
1009 tree ref
= DR_REF (dr_info
->dr
);
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_NOTE
, vect_location
,
1013 "vect_compute_data_ref_alignment:\n");
1016 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1018 /* Initialize misalignment to unknown. */
1019 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1021 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1024 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
1025 bool step_preserves_misalignment_p
;
/* The target alignment recorded on DR_INFO is derived from the target's
   preferred vector alignment for VECTYPE through exact_div.  */
1027 poly_uint64 vector_alignment
1028 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
1030 SET_DR_TARGET_ALIGNMENT (dr_info
, vector_alignment
);
1032 /* If the main loop has peeled for alignment we have no way of knowing
1033 whether the data accesses in the epilogues are aligned. We can't at
1034 compile time answer the question whether we have entered the main loop or
1035 not. Fixes PR 92351. */
1038 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1040 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
) != 0)
/* The remaining checks work on VECT_ALIGN_C, the vector alignment as a
   compile-time constant obtained through is_constant.  */
1044 unsigned HOST_WIDE_INT vect_align_c
;
1045 if (!vector_alignment
.is_constant (&vect_align_c
))
1048 /* No step for BB vectorization. */
1051 gcc_assert (integer_zerop (drb
->step
));
1052 step_preserves_misalignment_p
= true;
1055 /* In case the dataref is in an inner-loop of the loop that is being
1056 vectorized (LOOP), we use the base and misalignment information
1057 relative to the outer-loop (LOOP). This is ok only if the misalignment
1058 stays the same throughout the execution of the inner-loop, which is why
1059 we have to check that the stride of the dataref in the inner-loop evenly
1060 divides by the vector alignment. */
1061 else if (nested_in_vect_loop_p (loop
, stmt_info
))
1063 step_preserves_misalignment_p
1064 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
1066 if (dump_enabled_p ())
1068 if (step_preserves_misalignment_p
)
1069 dump_printf_loc (MSG_NOTE
, vect_location
,
1070 "inner step divides the vector alignment.\n");
1072 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1073 "inner step doesn't divide the vector"
1078 /* Similarly we can only use base and misalignment information relative to
1079 an innermost loop if the misalignment stays the same throughout the
1080 execution of the loop. As above, this is the case if the stride of
1081 the dataref evenly divides by the alignment. */
1084 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1085 step_preserves_misalignment_p
1086 = multiple_p (DR_STEP_ALIGNMENT (dr_info
->dr
) * vf
, vect_align_c
);
1088 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1090 "step doesn't divide the vector alignment.\n");
/* Start from the base alignment and base misalignment that DRB itself
   records; the pooled entry looked up below may replace both.  */
1093 unsigned int base_alignment
= drb
->base_alignment
;
1094 unsigned int base_misalignment
= drb
->base_misalignment
;
1096 /* Calculate the maximum of the pooled base address alignment and the
1097 alignment that we can compute for DR itself. */
1098 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> *entry
1099 = base_alignments
->get (drb
->base_address
);
1101 && base_alignment
< (*entry
).second
->base_alignment
1103 || (dominated_by_p (CDI_DOMINATORS
, gimple_bb (stmt_info
->stmt
),
1104 gimple_bb (entry
->first
->stmt
))
1105 && (gimple_bb (stmt_info
->stmt
) != gimple_bb (entry
->first
->stmt
)
1106 || (entry
->first
->dr_aux
.group
<= dr_info
->group
)))))
1108 base_alignment
= entry
->second
->base_alignment
;
1109 base_misalignment
= entry
->second
->base_misalignment
;
/* The access is reported as having unknown alignment when the offset
   alignment is below VECT_ALIGN_C, the step does not preserve the
   misalignment, or the step is not an INTEGER_CST.  */
1112 if (drb
->offset_alignment
< vect_align_c
1113 || !step_preserves_misalignment_p
1114 /* We need to know whether the step wrt the vectorized loop is
1115 negative when computing the starting misalignment below. */
1116 || TREE_CODE (drb
->step
) != INTEGER_CST
)
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1120 "Unknown alignment for access: %T\n", ref
);
/* The base is less aligned than VECT_ALIGN_C: query the base object with
   get_base_for_alignment and test whether its alignment can be forced to
   VECT_ALIGN_C * BITS_PER_UNIT.  */
1124 if (base_alignment
< vect_align_c
)
1126 unsigned int max_alignment
;
1127 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
1128 if (max_alignment
< vect_align_c
1129 || !vect_can_force_dr_alignment_p (base
,
1130 vect_align_c
* BITS_PER_UNIT
))
1132 if (dump_enabled_p ())
1133 dump_printf_loc (MSG_NOTE
, vect_location
,
1134 "can't force alignment of ref: %T\n", ref
);
1138 /* Force the alignment of the decl.
1139 NOTE: This is the only change to the code we make during
1140 the analysis phase, before deciding to vectorize the loop. */
1141 if (dump_enabled_p ())
1142 dump_printf_loc (MSG_NOTE
, vect_location
,
1143 "force alignment of %T\n", ref
);
1145 dr_info
->base_decl
= base
;
1146 dr_info
->base_misaligned
= true;
1147 base_misalignment
= 0;
/* The candidate misalignment is the base misalignment plus DRB's constant
   init; known_misalignment is then asked for its compile-time constant
   value with respect to VECT_ALIGN_C.  */
1149 poly_int64 misalignment
1150 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
1152 unsigned int const_misalignment
;
1153 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1157 "Non-constant misalignment for access: %T\n", ref
);
/* Record the constant misalignment on DR_INFO and dump it.  */
1161 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1165 "misalign = %d bytes of ref %T\n",
1166 const_misalignment
, ref
);
1171 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1172 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1173 is made aligned via peeling. */
1176 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info
*dr_info
,
1177 dr_vec_info
*dr_peel_info
)
/* First test: the peeled reference's target alignment has to be a
   multiple of DR_INFO's target alignment.  */
1179 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info
),
1180 DR_TARGET_ALIGNMENT (dr_info
)))
/* DIFF is DR_PEEL_INFO's DR_INIT minus DR_INFO's DR_INIT.  It is tested
   for being zero or a multiple of DR_INFO's target alignment.  */
1182 poly_offset_int diff
1183 = (wi::to_poly_offset (DR_INIT (dr_peel_info
->dr
))
1184 - wi::to_poly_offset (DR_INIT (dr_info
->dr
)));
1185 if (known_eq (diff
, 0)
1186 || multiple_p (diff
, DR_TARGET_ALIGNMENT (dr_info
)))
1192 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1193 aligned via peeling. */
1196 vect_dr_aligned_if_peeled_dr_is (dr_vec_info
*dr_info
,
1197 dr_vec_info
*dr_peel_info
)
/* Compare base address, offset and step of the two references with
   operand_equal_p before deferring to the related-DR test, which only
   looks at target alignment and DR_INIT.  */
1199 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info
->dr
),
1200 DR_BASE_ADDRESS (dr_peel_info
->dr
), 0)
1201 || !operand_equal_p (DR_OFFSET (dr_info
->dr
),
1202 DR_OFFSET (dr_peel_info
->dr
), 0)
1203 || !operand_equal_p (DR_STEP (dr_info
->dr
),
1204 DR_STEP (dr_peel_info
->dr
), 0))
1207 return vect_dr_aligned_if_related_peeled_dr_is (dr_info
, dr_peel_info
);
1210 /* Compute the value for dr_info->misalign so that the access appears
1211 aligned. This is used by peeling to compensate for dr_misalignment
1212 applying the offset for negative step. */
1215 vect_dr_misalign_for_aligned_access (dr_vec_info
*dr_info
)
/* References whose step is zero or positive are tested for first.  */
1217 if (tree_int_cst_sgn (DR_STEP (dr_info
->dr
)) >= 0)
/* Compute (number of vector elements - 1) * element size in bytes for the
   statement's vector type.  */
1220 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1221 poly_int64 misalignment
1222 = ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1223 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
/* DR_MISALIGNMENT_UNKNOWN is returned when the target alignment is not a
   compile-time constant or known_misalignment fails for that value.  */
1225 unsigned HOST_WIDE_INT target_alignment_c
;
1227 if (!dr_info
->target_alignment
.is_constant (&target_alignment_c
)
1228 || !known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1229 return DR_MISALIGNMENT_UNKNOWN
;
1233 /* Function vect_update_misalignment_for_peel.
1234 Sets DR_INFO's misalignment
1235 - to 0 if it has the same alignment as DR_PEEL_INFO,
1236 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1237 - to -1 (unknown) otherwise.
1239 DR_INFO - the data reference whose misalignment is to be adjusted.
1240 DR_PEEL_INFO - the data reference whose misalignment is being made
1241 zero in the vector loop by the peel.
1242 NPEEL - the number of iterations in the peel loop if the misalignment
1243 of DR_PEEL_INFO is known at compile time. */
1246 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1247 dr_vec_info
*dr_peel_info
, int npeel
)
1249 /* If dr_info is aligned if dr_peel_info is, then mark it so. */
1250 if (vect_dr_aligned_if_peeled_dr_is (dr_info
, dr_peel_info
))
1252 SET_DR_MISALIGNMENT (dr_info
,
1253 vect_dr_misalign_for_aligned_access (dr_peel_info
));
/* When DR_INFO's target alignment is a compile-time constant and the
   misalignment of both references is known, advance DR_INFO's recorded
   misalignment by NPEEL steps and mask it with ALIGNMENT - 1.  */
1257 unsigned HOST_WIDE_INT alignment
;
1258 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1259 && known_alignment_for_access_p (dr_info
,
1260 STMT_VINFO_VECTYPE (dr_info
->stmt
))
1261 && known_alignment_for_access_p (dr_peel_info
,
1262 STMT_VINFO_VECTYPE (dr_peel_info
->stmt
)))
1264 int misal
= dr_info
->misalignment
;
1265 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1266 misal
&= alignment
- 1;
1267 set_dr_misalignment (dr_info
, misal
);
/* Record the misalignment as unknown and say so in the dump.  */
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1273 "to unknown (-1).\n");
1274 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1277 /* Return true if alignment is relevant for DR_INFO. */
1280 vect_relevant_for_alignment_p (dr_vec_info
*dr_info
)
/* The tests below are applied in order: statement relevance, grouped
   access that is not the first element of its group, gather/scatter or
   zero step, and strided access that is not grouped.  */
1282 stmt_vec_info stmt_info
= dr_info
->stmt
;
1284 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1287 /* For interleaving, only the alignment of the first access matters. */
1288 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1289 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1292 /* Scatter-gather and invariant accesses continue to address individual
1293 scalars, so vector-level alignment is irrelevant. */
1294 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1295 || integer_zerop (DR_STEP (dr_info
->dr
)))
1298 /* Strided accesses perform only component accesses, alignment is
1299 irrelevant for them. */
1300 if (STMT_VINFO_STRIDED_P (stmt_info
)
1301 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1307 /* Given a memory reference EXP return whether its alignment is less
   than its size.  */
1311 not_size_aligned (tree exp
)
1313 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1316 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1317 > get_object_alignment (exp
));
1320 /* Function vector_alignment_reachable_p
1322 Return true if vector alignment for DR_INFO is reachable by peeling
1323 a few loop iterations. Return false otherwise. */
1326 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
/* Three cases are examined in turn: grouped (interleaved) accesses,
   accesses whose misalignment is known but non-zero, and accesses whose
   misalignment is unknown.  */
1328 stmt_vec_info stmt_info
= dr_info
->stmt
;
1329 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1331 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1333 /* For interleaved access we peel only if number of iterations in
1334 the prolog loop ({VF - misalignment}), is a multiple of the
1335 number of the interleaved accesses. */
1336 int elem_size
, mis_in_elements
;
1338 /* FORNOW: handle only known alignment. */
1339 if (!known_alignment_for_access_p (dr_info
, vectype
))
/* Express the misalignment in elements: the element size comes from the
   vector mode size and the number of vector elements.  */
1342 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1343 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1344 elem_size
= vector_element_size (vector_size
, nelements
);
1345 mis_in_elements
= dr_misalignment (dr_info
, vectype
) / elem_size
;
1347 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1351 /* If misalignment is known at the compile time then allow peeling
1352 only if natural alignment is reachable through peeling. */
1353 if (known_alignment_for_access_p (dr_info
, vectype
)
1354 && !aligned_access_p (dr_info
, vectype
))
1356 HOST_WIDE_INT elmsize
=
1357 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1358 if (dump_enabled_p ())
1360 dump_printf_loc (MSG_NOTE
, vect_location
,
1361 "data size = %wd. misalignment = %d.\n", elmsize
,
1362 dr_misalignment (dr_info
, vectype
));
/* A misalignment that is not a whole number of elements is reported as
   a missed optimization.  */
1364 if (dr_misalignment (dr_info
, vectype
) % elmsize
)
1366 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1368 "data size does not divide the misalignment.\n");
/* With unknown misalignment the decision is left to the target hook,
   which is given the reference's type and whether not_size_aligned holds
   for the reference.  */
1373 if (!known_alignment_for_access_p (dr_info
, vectype
))
1375 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1376 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1377 if (dump_enabled_p ())
1378 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1379 "Unknown misalignment, %snaturally aligned\n",
1380 is_packed
? "not " : "");
1381 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1388 /* Calculate the cost of the memory access represented by DR_INFO. */
1391 vect_get_data_access_cost (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1392 dr_alignment_support alignment_support_scheme
,
1394 unsigned int *inside_cost
,
1395 unsigned int *outside_cost
,
1396 stmt_vector_for_cost
*body_cost_vec
,
1397 stmt_vector_for_cost
*prologue_cost_vec
)
1399 stmt_vec_info stmt_info
= dr_info
->stmt
;
1400 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
/* NCOPIES is obtained from vect_get_num_copies for the statement's vector
   type; PURE_SLP_STMT is tested just before that assignment.  */
1403 if (PURE_SLP_STMT (stmt_info
))
1406 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
/* Reads and writes are costed by separate helpers.  Only the load helper
   is passed OUTSIDE_COST and PROLOGUE_COST_VEC.  */
1408 if (DR_IS_READ (dr_info
->dr
))
1409 vect_get_load_cost (vinfo
, stmt_info
, ncopies
, alignment_support_scheme
,
1410 misalignment
, true, inside_cost
,
1411 outside_cost
, prologue_cost_vec
, body_cost_vec
, false);
1413 vect_get_store_cost (vinfo
,stmt_info
, ncopies
, alignment_support_scheme
,
1414 misalignment
, inside_cost
, body_cost_vec
);
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE
, vect_location
,
1418 "vect_get_data_access_cost: inside_cost = %d, "
1419 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
/* A peeling candidate.  The peeling hash table keys entries on their NPEEL
   member (see peel_info_hasher); DR_INFO is the data reference stored
   with the entry.  */
1423 typedef struct _vect_peel_info
1425 dr_vec_info
*dr_info
;
/* A peeling candidate (PEEL_INFO) together with the inside and outside
   costs associated with it.  */
1430 typedef struct _vect_peel_extended_info
1433 struct _vect_peel_info peel_info
;
1434 unsigned int inside_cost
;
1435 unsigned int outside_cost
;
1436 } *vect_peel_extended_info
;
1439 /* Peeling hashtable helpers. */
/* Hash traits for tables of _vect_peel_info entries; hashing and equality
   are provided by the two static members declared here.  */
1441 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1443 static inline hashval_t
hash (const _vect_peel_info
*);
1444 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1448 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
/* The hash value is PEEL_INFO's NPEEL converted to hashval_t.  */
1450 return (hashval_t
) peel_info
->npeel
;
1454 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
/* Entries A and B compare equal exactly when their NPEEL values match.  */
1456 return (a
->npeel
== b
->npeel
);
1460 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1463 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1464 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1465 int npeel
, bool supportable_if_not_aligned
)
1467 struct _vect_peel_info elem
, *slot
;
1468 _vect_peel_info
**new_slot
;
/* Probe the table using the stack entry ELEM.  */
1471 slot
= peeling_htab
->find (&elem
);
/* Allocate an entry for NPEEL and DR_INFO and register it in the table
   with find_slot (INSERT).  */
1476 slot
= XNEW (struct _vect_peel_info
);
1477 slot
->npeel
= npeel
;
1478 slot
->dr_info
= dr_info
;
1480 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1484 /* If this DR is not supported with unknown misalignment then bias
1485 this slot when the cost model is disabled. */
1486 if (!supportable_if_not_aligned
1487 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1488 slot
->count
+= VECT_MAX_COST
;
1492 /* Traverse peeling hash table to find peeling option that aligns maximum
1493 number of data accesses. */
1496 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1497 _vect_peel_extended_info
*max
)
1499 vect_peel_info elem
= *slot
;
/* ELEM replaces the current maximum when its count is larger, or when the
   counts are equal and ELEM needs fewer peeled iterations.  */
1501 if (elem
->count
> max
->peel_info
.count
1502 || (elem
->count
== max
->peel_info
.count
1503 && max
->peel_info
.npeel
> elem
->npeel
))
1505 max
->peel_info
.npeel
= elem
->npeel
;
1506 max
->peel_info
.count
= elem
->count
;
1507 max
->peel_info
.dr_info
= elem
->dr_info
;
1513 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1514 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1515 npeel is computed at runtime but DR0_INFO's misalignment will be zero.  */
1519 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1520 dr_vec_info
*dr0_info
,
1521 unsigned int *inside_cost
,
1522 unsigned int *outside_cost
,
1523 stmt_vector_for_cost
*body_cost_vec
,
1524 stmt_vector_for_cost
*prologue_cost_vec
,
1527 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1529 bool dr0_alignment_known_p
1531 && known_alignment_for_access_p (dr0_info
,
1532 STMT_VINFO_VECTYPE (dr0_info
->stmt
)));
1534 for (data_reference
*dr
: datarefs
)
1536 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1537 if (!vect_relevant_for_alignment_p (dr_info
))
1540 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1541 dr_alignment_support alignment_support_scheme
;
1543 unsigned HOST_WIDE_INT alignment
;
1545 bool negative
= tree_int_cst_compare (DR_STEP (dr_info
->dr
),
1546 size_zero_node
) < 0;
1549 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1550 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1553 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1554 else if (dr_info
== dr0_info
1555 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
1557 else if (!dr0_alignment_known_p
1558 || !known_alignment_for_access_p (dr_info
, vectype
)
1559 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
1560 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
1563 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1564 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1565 misalignment
&= alignment
- 1;
1567 alignment_support_scheme
1568 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
1571 vect_get_data_access_cost (loop_vinfo
, dr_info
,
1572 alignment_support_scheme
, misalignment
,
1573 inside_cost
, outside_cost
,
1574 body_cost_vec
, prologue_cost_vec
);
1578 /* Traverse peeling hash table and calculate cost for each peeling option.
1579 Find the one with the lowest cost. */
1582 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1583 _vect_peel_extended_info
*min
)
1585 vect_peel_info elem
= *slot
;
1587 unsigned int inside_cost
= 0, outside_cost
= 0;
1588 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (min
->vinfo
);
1589 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1592 prologue_cost_vec
.create (2);
1593 body_cost_vec
.create (2);
1594 epilogue_cost_vec
.create (2);
/* Accumulate the access costs of all data references for peeling
   ELEM->NPEEL iterations on behalf of ELEM->DR_INFO.  */
1596 vect_get_peeling_costs_all_drs (loop_vinfo
, elem
->dr_info
, &inside_cost
,
1597 &outside_cost
, &body_cost_vec
,
1598 &prologue_cost_vec
, elem
->npeel
);
/* The body cost vector is released immediately after the call.  */
1600 body_cost_vec
.release ();
1602 outside_cost
+= vect_get_known_peeling_cost
1603 (loop_vinfo
, elem
->npeel
, &dummy
,
1604 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1605 &prologue_cost_vec
, &epilogue_cost_vec
);
1607 /* Prologue and epilogue costs are added to the target model later.
1608 These costs depend only on the scalar iteration cost, the
1609 number of peeling iterations finally chosen, and the number of
1610 misaligned statements. So discard the information found here. */
1611 prologue_cost_vec
.release ();
1612 epilogue_cost_vec
.release ();
/* ELEM becomes the new minimum when its inside cost is lower, or when the
   inside costs are equal and its outside cost is lower.  */
1614 if (inside_cost
< min
->inside_cost
1615 || (inside_cost
== min
->inside_cost
1616 && outside_cost
< min
->outside_cost
))
1618 min
->inside_cost
= inside_cost
;
1619 min
->outside_cost
= outside_cost
;
1620 min
->peel_info
.dr_info
= elem
->dr_info
;
1621 min
->peel_info
.npeel
= elem
->npeel
;
1622 min
->peel_info
.count
= elem
->count
;
1629 /* Choose best peeling option by traversing peeling hash table and either
1630 choosing an option with the lowest cost (if cost model is enabled) or the
1631 option that aligns as many accesses as possible. */
1633 static struct _vect_peel_extended_info
1634 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1635 loop_vec_info loop_vinfo
)
1637 struct _vect_peel_extended_info res
;
1639 res
.peel_info
.dr_info
= NULL
;
1640 res
.vinfo
= loop_vinfo
;
/* When the cost model is not unlimited, start from INT_MAX costs and let
   vect_peeling_hash_get_lowest_cost visit every entry.  */
1642 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1644 res
.inside_cost
= INT_MAX
;
1645 res
.outside_cost
= INT_MAX
;
1646 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1647 vect_peeling_hash_get_lowest_cost
> (&res
);
/* Start from a zero count, let vect_peeling_hash_get_most_frequent visit
   every entry, and report zero inside and outside costs.  */
1651 res
.peel_info
.count
= 0;
1652 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1653 vect_peeling_hash_get_most_frequent
> (&res
);
1654 res
.inside_cost
= 0;
1655 res
.outside_cost
= 0;
1661 /* Return true if the new peeling NPEEL is supported. */
1664 vect_peeling_supportable (loop_vec_info loop_vinfo
, dr_vec_info
*dr0_info
,
1667 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1668 enum dr_alignment_support supportable_dr_alignment
;
/* Whether DR0_INFO's own misalignment is known for its vector type; it is
   consulted for every other reference below.  */
1670 bool dr0_alignment_known_p
1671 = known_alignment_for_access_p (dr0_info
,
1672 STMT_VINFO_VECTYPE (dr0_info
->stmt
));
1674 /* Ensure that all data refs can be vectorized after the peel. */
1675 for (data_reference
*dr
: datarefs
)
/* The reference being peeled for is singled out by pointer comparison.  */
1677 if (dr
== dr0_info
->dr
)
1680 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1681 if (!vect_relevant_for_alignment_p (dr_info
)
1682 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
1685 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
/* Misalignment of DR_INFO after the peel: unknown unless both
   misalignments are known and DR_INFO's target alignment is constant.
   The known value is advanced by NPEEL steps and masked with
   ALIGNMENT - 1.  */
1687 unsigned HOST_WIDE_INT alignment
;
1688 if (!dr0_alignment_known_p
1689 || !known_alignment_for_access_p (dr_info
, vectype
)
1690 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
1691 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
1694 misalignment
= dr_misalignment (dr_info
, vectype
);
1695 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1696 misalignment
&= alignment
- 1;
1698 supportable_dr_alignment
1699 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
1701 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
1708 /* Compare two data-references DRA and DRB to group them into chunks
1709 with related alignment. */
1712 dr_align_group_sort_cmp (const void *dra_
, const void *drb_
)
/* DRA_ and DRB_ point at data_reference_p elements; both are dereferenced
   after casting away const.  */
1714 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
1715 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
1718 /* Stabilize sort. */
1722 /* Ordering of DRs according to base. */
1723 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
1724 DR_BASE_ADDRESS (drb
));
1728 /* And according to DR_OFFSET. */
1729 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
1733 /* And after step. */
1734 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
1738 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1739 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
/* Final tie-break: the reference whose statement has the smaller UID
   sorts first.  */
1741 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
1745 /* Function vect_enhance_data_refs_alignment
1747 This pass will use loop versioning and loop peeling in order to enhance
1748 the alignment of data references in the loop.
1750 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1751 original loop is to be vectorized. Any other loops that are created by
1752 the transformations performed in this pass - are not supposed to be
1753 vectorized. This restriction will be relaxed.
1755 This pass will require a cost model to guide it whether to apply peeling
1756 or versioning or a combination of the two. For example, the scheme that
1757 intel uses when given a loop with several memory accesses, is as follows:
1758 choose one memory access ('p') which alignment you want to force by doing
1759 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1760 other accesses are not necessarily aligned, or (2) use loop versioning to
1761 generate one loop in which all accesses are aligned, and another loop in
1762 which only 'p' is necessarily aligned.
1764 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1765 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1766 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1768 Devising a cost model is the most critical aspect of this work. It will
1769 guide us on which access to peel for, whether to use loop versioning, how
1770 many versions to create, etc. The cost model will probably consist of
1771 generic considerations as well as target specific considerations (on
1772 powerpc for example, misaligned stores are more painful than misaligned
1775 Here are the general steps involved in alignment enhancements:
1777 -- original loop, before alignment analysis:
1778 for (i=0; i<N; i++){
1779 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1780 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1783 -- After vect_compute_data_refs_alignment:
1784 for (i=0; i<N; i++){
1785 x = q[i]; # DR_MISALIGNMENT(q) = 3
1786 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1789 -- Possibility 1: we do loop versioning:
1791 for (i=0; i<N; i++){ # loop 1A
1792 x = q[i]; # DR_MISALIGNMENT(q) = 3
1793 p[i] = y; # DR_MISALIGNMENT(p) = 0
1797 for (i=0; i<N; i++){ # loop 1B
1798 x = q[i]; # DR_MISALIGNMENT(q) = 3
1799 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1803 -- Possibility 2: we do loop peeling:
1804 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1808 for (i = 3; i < N; i++){ # loop 2A
1809 x = q[i]; # DR_MISALIGNMENT(q) = 0
1810 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1813 -- Possibility 3: combination of loop peeling and versioning:
1814 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1819 for (i = 3; i<N; i++){ # loop 3A
1820 x = q[i]; # DR_MISALIGNMENT(q) = 0
1821 p[i] = y; # DR_MISALIGNMENT(p) = 0
1825 for (i = 3; i<N; i++){ # loop 3B
1826 x = q[i]; # DR_MISALIGNMENT(q) = 0
1827 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1831 These loops are later passed to loop_transform to be vectorized. The
1832 vectorizer will use the alignment information to guide the transformation
1833 (whether to generate regular loads/stores, or with special handling for
   misalignment).  */
1837 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1839 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1840 dr_vec_info
*first_store
= NULL
;
1841 dr_vec_info
*dr0_info
= NULL
;
1842 struct data_reference
*dr
;
1844 bool do_peeling
= false;
1845 bool do_versioning
= false;
1846 unsigned int npeel
= 0;
1847 bool one_misalignment_known
= false;
1848 bool one_misalignment_unknown
= false;
1849 bool one_dr_unsupportable
= false;
1850 dr_vec_info
*unsupportable_dr_info
= NULL
;
1851 unsigned int dr0_same_align_drs
= 0, first_store_same_align_drs
= 0;
1852 hash_table
<peel_info_hasher
> peeling_htab (1);
1854 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1856 /* Reset data so we can safely be called multiple times. */
1857 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1858 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1860 if (LOOP_VINFO_DATAREFS (loop_vinfo
).is_empty ())
1861 return opt_result::success ();
1863 /* Sort the vector of datarefs so DRs that have the same or dependent
1864 alignment are next to each other. */
1865 auto_vec
<data_reference_p
> datarefs
1866 = LOOP_VINFO_DATAREFS (loop_vinfo
).copy ();
1867 datarefs
.qsort (dr_align_group_sort_cmp
);
1869 /* Compute the number of DRs that become aligned when we peel
1870 a dataref so it becomes aligned. */
1871 auto_vec
<unsigned> n_same_align_refs (datarefs
.length ());
1872 n_same_align_refs
.quick_grow_cleared (datarefs
.length ());
1874 for (i0
= 0; i0
< datarefs
.length (); ++i0
)
1875 if (DR_BASE_ADDRESS (datarefs
[i0
]))
1877 for (i
= i0
+ 1; i
<= datarefs
.length (); ++i
)
1879 if (i
== datarefs
.length ()
1880 || !operand_equal_p (DR_BASE_ADDRESS (datarefs
[i0
]),
1881 DR_BASE_ADDRESS (datarefs
[i
]), 0)
1882 || !operand_equal_p (DR_OFFSET (datarefs
[i0
]),
1883 DR_OFFSET (datarefs
[i
]), 0)
1884 || !operand_equal_p (DR_STEP (datarefs
[i0
]),
1885 DR_STEP (datarefs
[i
]), 0))
1887 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1888 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1889 will get known misalignment if we align one of the refs
1890 with the largest DR_TARGET_ALIGNMENT. */
1891 for (unsigned j
= i0
; j
< i
; ++j
)
1893 dr_vec_info
*dr_infoj
= loop_vinfo
->lookup_dr (datarefs
[j
]);
1894 for (unsigned k
= i0
; k
< i
; ++k
)
1898 dr_vec_info
*dr_infok
= loop_vinfo
->lookup_dr (datarefs
[k
]);
1899 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok
,
1901 n_same_align_refs
[j
]++;
1908 /* While cost model enhancements are expected in the future, the high level
1909 view of the code at this time is as follows:
1911 A) If there is a misaligned access then see if peeling to align
1912 this access can make all data references satisfy
1913 vect_supportable_dr_alignment. If so, update data structures
1914 as needed and return true.
1916 B) If peeling wasn't possible and there is a data reference with an
1917 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1918 then see if loop versioning checks can be used to make all data
1919 references satisfy vect_supportable_dr_alignment. If so, update
1920 data structures as needed and return true.
1922 C) If neither peeling nor versioning were successful then return false if
1923 any data reference does not satisfy vect_supportable_dr_alignment.
1925 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1927 Note, Possibility 3 above (which is peeling and versioning together) is not
1928 being done at this time. */
1930 /* (1) Peeling to force alignment. */
1932 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1934 + How many accesses will become aligned due to the peeling
1935 - How many accesses will become unaligned due to the peeling,
1936 and the cost of misaligned accesses.
1937 - The cost of peeling (the extra runtime checks, the increase
1940 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1942 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1943 if (!vect_relevant_for_alignment_p (dr_info
))
1946 stmt_vec_info stmt_info
= dr_info
->stmt
;
1947 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1948 do_peeling
= vector_alignment_reachable_p (dr_info
);
1951 if (known_alignment_for_access_p (dr_info
, vectype
))
1953 unsigned int npeel_tmp
= 0;
1954 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1955 size_zero_node
) < 0;
1957 /* If known_alignment_for_access_p then we have set
1958 DR_MISALIGNMENT which is only done if we know it at compiler
1959 time, so it is safe to assume target alignment is constant.  */
1961 unsigned int target_align
=
1962 DR_TARGET_ALIGNMENT (dr_info
).to_constant ();
1963 unsigned HOST_WIDE_INT dr_size
= vect_get_scalar_dr_size (dr_info
);
1966 off
= (TYPE_VECTOR_SUBPARTS (vectype
) - 1) * -dr_size
;
1967 unsigned int mis
= dr_misalignment (dr_info
, vectype
, off
);
1968 mis
= negative
? mis
: -mis
;
1970 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1972 /* For multiple types, it is possible that the bigger type access
1973 will have more than one peeling option. E.g., a loop with two
1974 types: one of size (vector size / 4), and the other one of
1975 size (vector size / 8). Vectorization factor will be 8. If both
1976 accesses are misaligned by 3, the first one needs one scalar
1977 iteration to be aligned, and the second one needs 5. But the
1978 first one will be aligned also by peeling 5 scalar
1979 iterations, and in that case both accesses will be aligned.
1980 Hence, except for the immediate peeling amount, we also want
1981 to try to add full vector size, while we don't exceed
1982 vectorization factor.
1983 We do this automatically for cost model, since we calculate
1984 cost for every peeling option. */
1985 poly_uint64 nscalars
= npeel_tmp
;
1986 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1988 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1989 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1990 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1993 /* Save info about DR in the hash table. Also include peeling
1994 amounts according to the explanation above. Indicate
1995 the alignment status when the ref is not aligned.
1996 ??? Rather than using unknown alignment here we should
1997 prune all entries from the peeling hashtable which cause
1998 DRs to be not supported. */
1999 bool supportable_if_not_aligned
2000 = vect_supportable_dr_alignment
2001 (loop_vinfo
, dr_info
, vectype
, DR_MISALIGNMENT_UNKNOWN
);
2002 while (known_le (npeel_tmp
, nscalars
))
2004 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
2006 supportable_if_not_aligned
);
2007 npeel_tmp
+= MAX (1, target_align
/ dr_size
);
2010 one_misalignment_known
= true;
2014 /* If we don't know any misalignment values, we prefer
2015 peeling for data-ref that has the maximum number of data-refs
2016 with the same alignment, unless the target prefers to align
2017 stores over load. */
2018 unsigned same_align_drs
= n_same_align_refs
[i
];
2020 || dr0_same_align_drs
< same_align_drs
)
2022 dr0_same_align_drs
= same_align_drs
;
2025 /* For data-refs with the same number of related
2026 accesses prefer the one where the misalign
2027 computation will be invariant in the outermost loop. */
2028 else if (dr0_same_align_drs
== same_align_drs
)
2030 class loop
*ivloop0
, *ivloop
;
2031 ivloop0
= outermost_invariant_loop_for_expr
2032 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
2033 ivloop
= outermost_invariant_loop_for_expr
2034 (loop
, DR_BASE_ADDRESS (dr
));
2035 if ((ivloop
&& !ivloop0
)
2036 || (ivloop
&& ivloop0
2037 && flow_loop_nested_p (ivloop
, ivloop0
)))
2041 one_misalignment_unknown
= true;
2043 /* Check for data refs with unsupportable alignment that
2045 enum dr_alignment_support supportable_dr_alignment
2046 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2047 DR_MISALIGNMENT_UNKNOWN
);
2048 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2050 one_dr_unsupportable
= true;
2051 unsupportable_dr_info
= dr_info
;
2054 if (!first_store
&& DR_IS_WRITE (dr
))
2056 first_store
= dr_info
;
2057 first_store_same_align_drs
= same_align_drs
;
2063 if (!aligned_access_p (dr_info
, vectype
))
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2067 "vector alignment may not be reachable\n");
2073 /* Check if we can possibly peel the loop. */
2074 if (!vect_can_advance_ivs_p (loop_vinfo
)
2075 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
2079 struct _vect_peel_extended_info peel_for_known_alignment
;
2080 struct _vect_peel_extended_info peel_for_unknown_alignment
;
2081 struct _vect_peel_extended_info best_peel
;
2083 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
2084 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
2085 peel_for_unknown_alignment
.peel_info
.count
= 0;
2088 && one_misalignment_unknown
)
2090 /* Check if the target requires to prefer stores over loads, i.e., if
2091 misaligned stores are more expensive than misaligned loads (taking
2092 drs with same alignment into account). */
2093 unsigned int load_inside_cost
= 0;
2094 unsigned int load_outside_cost
= 0;
2095 unsigned int store_inside_cost
= 0;
2096 unsigned int store_outside_cost
= 0;
2097 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
2099 stmt_vector_for_cost dummy
;
2101 vect_get_peeling_costs_all_drs (loop_vinfo
, dr0_info
,
2104 &dummy
, &dummy
, estimated_npeels
);
2110 vect_get_peeling_costs_all_drs (loop_vinfo
, first_store
,
2112 &store_outside_cost
,
2119 store_inside_cost
= INT_MAX
;
2120 store_outside_cost
= INT_MAX
;
2123 if (load_inside_cost
> store_inside_cost
2124 || (load_inside_cost
== store_inside_cost
2125 && load_outside_cost
> store_outside_cost
))
2127 dr0_info
= first_store
;
2128 dr0_same_align_drs
= first_store_same_align_drs
;
2129 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
2130 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
2134 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
2135 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
2138 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2139 prologue_cost_vec
.create (2);
2140 epilogue_cost_vec
.create (2);
2143 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
2144 (loop_vinfo
, estimated_npeels
, &dummy2
,
2145 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2146 &prologue_cost_vec
, &epilogue_cost_vec
);
2148 prologue_cost_vec
.release ();
2149 epilogue_cost_vec
.release ();
2151 peel_for_unknown_alignment
.peel_info
.count
= dr0_same_align_drs
+ 1;
2154 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
2155 peel_for_unknown_alignment
.peel_info
.dr_info
= dr0_info
;
2157 best_peel
= peel_for_unknown_alignment
;
2159 peel_for_known_alignment
.inside_cost
= INT_MAX
;
2160 peel_for_known_alignment
.outside_cost
= INT_MAX
;
2161 peel_for_known_alignment
.peel_info
.count
= 0;
2162 peel_for_known_alignment
.peel_info
.dr_info
= NULL
;
2164 if (do_peeling
&& one_misalignment_known
)
2166 /* Peeling is possible, but there is no data access that is not supported
2167 unless aligned. So we try to choose the best possible peeling from
2169 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
2170 (&peeling_htab
, loop_vinfo
);
2173 /* Compare costs of peeling for known and unknown alignment. */
2174 if (peel_for_known_alignment
.peel_info
.dr_info
!= NULL
2175 && peel_for_unknown_alignment
.inside_cost
2176 >= peel_for_known_alignment
.inside_cost
)
2178 best_peel
= peel_for_known_alignment
;
2180 /* If the best peeling for known alignment has NPEEL == 0, perform no
2181 peeling at all except if there is an unsupportable dr that we can
2183 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
2187 /* If there is an unsupportable data ref, prefer this over all choices so far
2188 since we'd have to discard a chosen peeling except when it accidentally
2189 aligned the unsupportable data ref. */
2190 if (one_dr_unsupportable
)
2191 dr0_info
= unsupportable_dr_info
;
2192 else if (do_peeling
)
2194 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2195 TODO: Use nopeel_outside_cost or get rid of it? */
2196 unsigned nopeel_inside_cost
= 0;
2197 unsigned nopeel_outside_cost
= 0;
2199 stmt_vector_for_cost dummy
;
2201 vect_get_peeling_costs_all_drs (loop_vinfo
, NULL
, &nopeel_inside_cost
,
2202 &nopeel_outside_cost
, &dummy
, &dummy
, 0);
2205 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2206 costs will be recorded. */
2207 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2208 prologue_cost_vec
.create (2);
2209 epilogue_cost_vec
.create (2);
2212 nopeel_outside_cost
+= vect_get_known_peeling_cost
2213 (loop_vinfo
, 0, &dummy2
,
2214 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2215 &prologue_cost_vec
, &epilogue_cost_vec
);
2217 prologue_cost_vec
.release ();
2218 epilogue_cost_vec
.release ();
2220 npeel
= best_peel
.peel_info
.npeel
;
2221 dr0_info
= best_peel
.peel_info
.dr_info
;
2223 /* If no peeling is not more expensive than the best peeling we
2224 have so far, don't perform any peeling. */
2225 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
2231 stmt_vec_info stmt_info
= dr0_info
->stmt
;
2232 if (known_alignment_for_access_p (dr0_info
,
2233 STMT_VINFO_VECTYPE (stmt_info
)))
2235 bool negative
= tree_int_cst_compare (DR_STEP (dr0_info
->dr
),
2236 size_zero_node
) < 0;
2239 /* Since it's known at compile time, compute the number of
2240 iterations in the peeled loop (the peeling factor) for use in
2241 updating DR_MISALIGNMENT values. The peeling factor is the
2242 vectorization factor minus the misalignment as an element
2244 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2247 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2248 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2250 = dr_misalignment (dr0_info
, vectype
, off
);
2251 mis
= negative
? mis
: -mis
;
2252 /* If known_alignment_for_access_p then we have set
2253 DR_MISALIGNMENT which is only done if we know it at compile
2254 time, so it is safe to assume target alignment is constant.
2256 unsigned int target_align
=
2257 DR_TARGET_ALIGNMENT (dr0_info
).to_constant ();
2258 npeel
= ((mis
& (target_align
- 1))
2259 / vect_get_scalar_dr_size (dr0_info
));
2262 /* For interleaved data access every iteration accesses all the
2263 members of the group, therefore we divide the number of iterations
2264 by the group size. */
2265 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2266 npeel
/= DR_GROUP_SIZE (stmt_info
);
2268 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_NOTE
, vect_location
,
2270 "Try peeling by %d\n", npeel
);
2273 /* Ensure that all datarefs can be vectorized after the peel. */
2274 if (!vect_peeling_supportable (loop_vinfo
, dr0_info
, npeel
))
2277 /* Check if all datarefs are supportable and log. */
2280 && known_alignment_for_access_p (dr0_info
,
2281 STMT_VINFO_VECTYPE (stmt_info
)))
2282 return opt_result::success ();
2284 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2287 unsigned max_allowed_peel
2288 = param_vect_max_peeling_for_alignment
;
2289 if (loop_cost_model (loop
) <= VECT_COST_MODEL_CHEAP
)
2290 max_allowed_peel
= 0;
2291 if (max_allowed_peel
!= (unsigned)-1)
2293 unsigned max_peel
= npeel
;
2296 poly_uint64 target_align
= DR_TARGET_ALIGNMENT (dr0_info
);
2297 unsigned HOST_WIDE_INT target_align_c
;
2298 if (target_align
.is_constant (&target_align_c
))
2300 target_align_c
/ vect_get_scalar_dr_size (dr0_info
) - 1;
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE
, vect_location
,
2306 "Disable peeling, max peels set and vector"
2307 " alignment unknown\n");
2310 if (max_peel
> max_allowed_peel
)
2313 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_NOTE
, vect_location
,
2315 "Disable peeling, max peels reached: %d\n", max_peel
);
2320 /* Cost model #2 - if peeling may result in a remaining loop not
2321 iterating enough to be vectorized then do not peel. Since this
2322 is a cost heuristic rather than a correctness decision, use the
2323 most likely runtime value for variable vectorization factors. */
2325 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2327 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2328 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2329 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2330 < assumed_vf
+ max_peel
)
2336 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2337 If the misalignment of DR_i is identical to that of dr0 then set
2338 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2339 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2340 by the peeling factor times the element size of DR_i (MOD the
2341 vectorization factor times the size). Otherwise, the
2342 misalignment of DR_i must be set to unknown. */
2343 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2344 if (dr
!= dr0_info
->dr
)
2346 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2347 if (!vect_relevant_for_alignment_p (dr_info
))
2350 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
2353 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0_info
;
2355 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2357 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = -1;
2358 SET_DR_MISALIGNMENT (dr0_info
,
2359 vect_dr_misalign_for_aligned_access (dr0_info
));
2360 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_NOTE
, vect_location
,
2363 "Alignment of access forced using peeling.\n");
2364 dump_printf_loc (MSG_NOTE
, vect_location
,
2365 "Peeling for alignment will be applied.\n");
2368 /* The inside-loop cost will be accounted for in vectorizable_load
2369 and vectorizable_store correctly with adjusted alignments.
2370 Drop the body_cst_vec on the floor here. */
2371 return opt_result::success ();
2375 /* (2) Versioning to force alignment. */
2377 /* Try versioning if:
2378 1) optimize loop for speed and the cost-model is not cheap
2379 2) there is at least one unsupported misaligned data ref with an unknown
2381 3) all misaligned data refs with a known misalignment are supported, and
2382 4) the number of runtime alignment checks is within reason. */
2385 = (optimize_loop_nest_for_speed_p (loop
)
2386 && !loop
->inner
/* FORNOW */
2387 && loop_cost_model (loop
) > VECT_COST_MODEL_CHEAP
);
2391 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2393 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2394 if (!vect_relevant_for_alignment_p (dr_info
))
2397 stmt_vec_info stmt_info
= dr_info
->stmt
;
2398 if (STMT_VINFO_STRIDED_P (stmt_info
))
2400 do_versioning
= false;
2404 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2405 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
2406 size_zero_node
) < 0;
2409 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2410 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2412 if ((misalignment
= dr_misalignment (dr_info
, vectype
, off
)) == 0)
2415 enum dr_alignment_support supportable_dr_alignment
2416 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2418 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2420 if (misalignment
!= DR_MISALIGNMENT_UNKNOWN
2421 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2422 >= (unsigned) param_vect_max_version_for_alignment_checks
))
2424 do_versioning
= false;
2428 /* At present we don't support versioning for alignment
2429 with variable VF, since there's no guarantee that the
2430 VF is a power of two. We could relax this if we added
2431 a way of enforcing a power-of-two size. */
2432 unsigned HOST_WIDE_INT size
;
2433 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2435 do_versioning
= false;
2439 /* Forcing alignment in the first iteration is no good if
2440 we don't keep it across iterations. For now, just disable
2441 versioning in this case.
2442 ?? We could actually unroll the loop to achieve the required
2443 overall step alignment, and forcing the alignment could be
2444 done by doing some iterations of the non-vectorized loop. */
2445 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
2446 * DR_STEP_ALIGNMENT (dr
),
2447 DR_TARGET_ALIGNMENT (dr_info
)))
2449 do_versioning
= false;
2453 /* The rightmost bits of an aligned address must be zeros.
2454 Construct the mask needed for this test. For example,
2455 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2456 mask must be 15 = 0xf. */
2457 int mask
= size
- 1;
2459 /* FORNOW: use the same mask to test all potentially unaligned
2460 references in the loop. */
2461 if (LOOP_VINFO_PTR_MASK (loop_vinfo
)
2462 && LOOP_VINFO_PTR_MASK (loop_vinfo
) != mask
)
2464 do_versioning
= false;
2468 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2469 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (stmt_info
);
2473 /* Versioning requires at least one misaligned data reference. */
2474 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2475 do_versioning
= false;
2476 else if (!do_versioning
)
2477 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2482 const vec
<stmt_vec_info
> &may_misalign_stmts
2483 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2484 stmt_vec_info stmt_info
;
2486 /* It can now be assumed that the data references in the statements
2487 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2488 of the loop being vectorized. */
2489 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt_info
)
2491 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
2492 SET_DR_MISALIGNMENT (dr_info
,
2493 vect_dr_misalign_for_aligned_access (dr_info
));
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_NOTE
, vect_location
,
2496 "Alignment of access forced using versioning.\n");
2499 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_NOTE
, vect_location
,
2501 "Versioning for alignment will be applied.\n");
2503 /* Peeling and versioning can't be done together at this time. */
2504 gcc_assert (! (do_peeling
&& do_versioning
));
2506 return opt_result::success ();
2509 /* This point is reached if neither peeling nor versioning is being done. */
2510 gcc_assert (! (do_peeling
|| do_versioning
));
2512 return opt_result::success ();
2516 /* Function vect_analyze_data_refs_alignment
2518 Analyze the alignment of the data-references in the loop.
2519 Return FALSE if a data reference is found that cannot be vectorized. */
2522 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
)
2524 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2526 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2527 struct data_reference
*dr
;
2530 vect_record_base_alignments (loop_vinfo
);
2531 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2533 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2534 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
))
2536 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
)
2537 && DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
) != dr_info
->stmt
)
2539 vect_compute_data_ref_alignment (loop_vinfo
, dr_info
,
2540 STMT_VINFO_VECTYPE (dr_info
->stmt
));
2544 return opt_result::success ();
2548 /* Analyze alignment of DRs of stmts in NODE. */
2551 vect_slp_analyze_node_alignment (vec_info
*vinfo
, slp_tree node
)
2553 /* Alignment is maintained in the first element of the group. */
2554 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2555 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
2556 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2557 tree vectype
= SLP_TREE_VECTYPE (node
);
2558 poly_uint64 vector_alignment
2559 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
2561 if (dr_info
->misalignment
== DR_MISALIGNMENT_UNINITIALIZED
)
2562 vect_compute_data_ref_alignment (vinfo
, dr_info
, SLP_TREE_VECTYPE (node
));
2563 /* Re-analyze alignment when we're facing a vectorization with a bigger
2564 alignment requirement. */
2565 else if (known_lt (dr_info
->target_alignment
, vector_alignment
))
2567 poly_uint64 old_target_alignment
= dr_info
->target_alignment
;
2568 int old_misalignment
= dr_info
->misalignment
;
2569 vect_compute_data_ref_alignment (vinfo
, dr_info
, SLP_TREE_VECTYPE (node
));
2570 /* But keep knowledge about a smaller alignment. */
2571 if (old_misalignment
!= DR_MISALIGNMENT_UNKNOWN
2572 && dr_info
->misalignment
== DR_MISALIGNMENT_UNKNOWN
)
2574 dr_info
->target_alignment
= old_target_alignment
;
2575 dr_info
->misalignment
= old_misalignment
;
2578 /* When we ever face unordered target alignments the first one wins in terms
2579 of analyzing and the other will become unknown in dr_misalignment. */
2583 /* Function vect_slp_analyze_instance_alignment
2585 Analyze the alignment of the data-references in the SLP instance.
2586 Return FALSE if a data reference is found that cannot be vectorized. */
2589 vect_slp_analyze_instance_alignment (vec_info
*vinfo
,
2590 slp_instance instance
)
2592 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2596 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2597 if (! vect_slp_analyze_node_alignment (vinfo
, node
))
2600 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2601 && ! vect_slp_analyze_node_alignment
2602 (vinfo
, SLP_INSTANCE_TREE (instance
)))
2609 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2610 accesses of legal size, step, etc. Detect gaps, single element
2611 interleaving, and other special cases. Set grouped access info.
2612 Collect groups of strided stores for further use in SLP analysis.
2613 Worker for vect_analyze_group_access. */
2616 vect_analyze_group_access_1 (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2618 data_reference
*dr
= dr_info
->dr
;
2619 tree step
= DR_STEP (dr
);
2620 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2621 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2622 stmt_vec_info stmt_info
= dr_info
->stmt
;
2623 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2624 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
2625 HOST_WIDE_INT dr_step
= -1;
2626 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2627 bool slp_impossible
= false;
2629 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2630 size of the interleaving group (including gaps). */
2631 if (tree_fits_shwi_p (step
))
2633 dr_step
= tree_to_shwi (step
);
2634 /* Check that STEP is a multiple of type size. Otherwise there is
2635 a non-element-sized gap at the end of the group which we
2636 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2637 ??? As we can handle non-constant step fine here we should
2638 simply remove uses of DR_GROUP_GAP between the last and first
2639 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2640 simply not include that gap. */
2641 if ((dr_step
% type_size
) != 0)
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE
, vect_location
,
2645 "Step %T is not a multiple of the element size"
2650 groupsize
= absu_hwi (dr_step
) / type_size
;
2655 /* Not consecutive access is possible only if it is a part of interleaving. */
2656 if (!DR_GROUP_FIRST_ELEMENT (stmt_info
))
2658 /* Check if it this DR is a part of interleaving, and is a single
2659 element of the group that is accessed in the loop. */
2661 /* Gaps are supported only for loads. STEP must be a multiple of the type
2664 && (dr_step
% type_size
) == 0
2666 /* This could be UINT_MAX but as we are generating code in a very
2667 inefficient way we have to cap earlier.
2668 See PR91403 for example. */
2669 && groupsize
<= 4096)
2671 DR_GROUP_FIRST_ELEMENT (stmt_info
) = stmt_info
;
2672 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2673 DR_GROUP_GAP (stmt_info
) = groupsize
- 1;
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_NOTE
, vect_location
,
2676 "Detected single element interleaving %T"
2683 if (dump_enabled_p ())
2684 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2685 "not consecutive access %G", stmt_info
->stmt
);
2689 /* Mark the statement as unvectorizable. */
2690 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
2694 if (dump_enabled_p ())
2695 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2696 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2700 if (DR_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
)
2702 /* First stmt in the interleaving chain. Check the chain. */
2703 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2704 struct data_reference
*data_ref
= dr
;
2705 unsigned int count
= 1;
2706 tree prev_init
= DR_INIT (data_ref
);
2707 HOST_WIDE_INT diff
, gaps
= 0;
2709 /* By construction, all group members have INTEGER_CST DR_INITs. */
2712 /* We never have the same DR multiple times. */
2713 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref
),
2714 DR_INIT (STMT_VINFO_DATA_REF (next
))) != 0);
2716 data_ref
= STMT_VINFO_DATA_REF (next
);
2718 /* All group members have the same STEP by construction. */
2719 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2721 /* Check that the distance between two accesses is equal to the type
2722 size. Otherwise, we have gaps. */
2723 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2724 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2725 if (diff
< 1 || diff
> UINT_MAX
)
2727 /* For artificial testcases with array accesses with large
2728 constant indices we can run into overflow issues which
2729 can end up fooling the groupsize constraint below so
2730 check the individual gaps (which are represented as
2731 unsigned int) as well. */
2732 if (dump_enabled_p ())
2733 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2734 "interleaved access with gap larger "
2735 "than representable\n");
2740 /* FORNOW: SLP of accesses with gaps is not supported. */
2741 slp_impossible
= true;
2742 if (DR_IS_WRITE (data_ref
))
2744 if (dump_enabled_p ())
2745 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2746 "interleaved store with gaps\n");
2753 last_accessed_element
+= diff
;
2755 /* Store the gap from the previous member of the group. If there is no
2756 gap in the access, DR_GROUP_GAP is always 1. */
2757 DR_GROUP_GAP (next
) = diff
;
2759 prev_init
= DR_INIT (data_ref
);
2760 next
= DR_GROUP_NEXT_ELEMENT (next
);
2761 /* Count the number of data-refs in the chain. */
2766 groupsize
= count
+ gaps
;
2768 /* This could be UINT_MAX but as we are generating code in a very
2769 inefficient way we have to cap earlier. See PR78699 for example. */
2770 if (groupsize
> 4096)
2772 if (dump_enabled_p ())
2773 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2774 "group is too large\n");
2778 /* Check that the size of the interleaving is equal to count for stores,
2779 i.e., that there are no gaps. */
2780 if (groupsize
!= count
2781 && !DR_IS_READ (dr
))
2784 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2787 /* If there is a gap after the last load in the group it is the
2788 difference between the groupsize and the last accessed
2790 When there is no gap, this difference should be 0. */
2791 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
2793 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2794 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_NOTE
, vect_location
,
2797 "Detected interleaving ");
2798 if (DR_IS_READ (dr
))
2799 dump_printf (MSG_NOTE
, "load ");
2800 else if (STMT_VINFO_STRIDED_P (stmt_info
))
2801 dump_printf (MSG_NOTE
, "strided store ");
2803 dump_printf (MSG_NOTE
, "store ");
2804 dump_printf (MSG_NOTE
, "of size %u\n",
2805 (unsigned)groupsize
);
2806 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
2807 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2810 if (DR_GROUP_GAP (next
) != 1)
2811 dump_printf_loc (MSG_NOTE
, vect_location
,
2812 "\t<gap of %d elements>\n",
2813 DR_GROUP_GAP (next
) - 1);
2814 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
2815 next
= DR_GROUP_NEXT_ELEMENT (next
);
2817 if (DR_GROUP_GAP (stmt_info
) != 0)
2818 dump_printf_loc (MSG_NOTE
, vect_location
,
2819 "\t<gap of %d elements>\n",
2820 DR_GROUP_GAP (stmt_info
));
2823 /* SLP: create an SLP data structure for every interleaving group of
2824 stores for further analysis in vect_analyse_slp. */
2825 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2828 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
2830 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
2837 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2838 accesses of legal size, step, etc. Detect gaps, single element
2839 interleaving, and other special cases. Set grouped access info.
2840 Collect groups of strided stores for further use in SLP analysis. */
2843 vect_analyze_group_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2845 if (!vect_analyze_group_access_1 (vinfo
, dr_info
))
2847 /* Dissolve the group if present. */
2848 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
2851 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2852 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2853 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
2861 /* Analyze the access pattern of the data-reference DR_INFO.
2862 In case of non-consecutive accesses call vect_analyze_group_access() to
2863 analyze groups of accesses. */
2866 vect_analyze_data_ref_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2868 data_reference
*dr
= dr_info
->dr
;
2869 tree step
= DR_STEP (dr
);
2870 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2871 stmt_vec_info stmt_info
= dr_info
->stmt
;
2872 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2873 class loop
*loop
= NULL
;
2875 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2879 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2881 if (loop_vinfo
&& !step
)
2883 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2885 "bad data-ref access in loop\n");
2889 /* Allow loads with zero step in inner-loop vectorization. */
2890 if (loop_vinfo
&& integer_zerop (step
))
2892 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2893 if (!nested_in_vect_loop_p (loop
, stmt_info
))
2894 return DR_IS_READ (dr
);
2895 /* Allow references with zero step for outer loops marked
2896 with pragma omp simd only - it guarantees absence of
2897 loop-carried dependencies between inner loop iterations. */
2898 if (loop
->safelen
< 2)
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_NOTE
, vect_location
,
2902 "zero step in inner loop of nest\n");
2907 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2909 /* Interleaved accesses are not yet supported within outer-loop
2910 vectorization for references in the inner-loop. */
2911 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2913 /* For the rest of the analysis we use the outer-loop step. */
2914 step
= STMT_VINFO_DR_STEP (stmt_info
);
2915 if (integer_zerop (step
))
2917 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE
, vect_location
,
2919 "zero step in outer loop.\n");
2920 return DR_IS_READ (dr
);
2925 if (TREE_CODE (step
) == INTEGER_CST
)
2927 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2928 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2930 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2932 /* Mark that it is not interleaving. */
2933 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2938 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_NOTE
, vect_location
,
2942 "grouped access in outer loop.\n");
2947 /* Assume this is a DR handled by non-constant strided load case. */
2948 if (TREE_CODE (step
) != INTEGER_CST
)
2949 return (STMT_VINFO_STRIDED_P (stmt_info
)
2950 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2951 || vect_analyze_group_access (vinfo
, dr_info
)));
2953 /* Not consecutive access - check if it's a part of interleaving group. */
2954 return vect_analyze_group_access (vinfo
, dr_info
);
2957 /* Compare two data-references DRA and DRB to group them into chunks
2958 suitable for grouping. */
2961 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2963 dr_vec_info
*dra_info
= *(dr_vec_info
**)const_cast<void *>(dra_
);
2964 dr_vec_info
*drb_info
= *(dr_vec_info
**)const_cast<void *>(drb_
);
2965 data_reference_p dra
= dra_info
->dr
;
2966 data_reference_p drb
= drb_info
->dr
;
2969 /* Stabilize sort. */
2973 /* Different group IDs lead never belong to the same group. */
2974 if (dra_info
->group
!= drb_info
->group
)
2975 return dra_info
->group
< drb_info
->group
? -1 : 1;
2977 /* Ordering of DRs according to base. */
2978 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2979 DR_BASE_ADDRESS (drb
));
2983 /* And according to DR_OFFSET. */
2984 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2988 /* Put reads before writes. */
2989 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2990 return DR_IS_READ (dra
) ? -1 : 1;
2992 /* Then sort after access size. */
2993 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2994 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2998 /* And after step. */
2999 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
3003 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3004 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
3006 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
3010 /* If OP is the result of a conversion, return the unconverted value,
3011 otherwise return null. */
3014 strip_conversion (tree op
)
3016 if (TREE_CODE (op
) != SSA_NAME
)
3018 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
3019 if (!is_gimple_assign (stmt
)
3020 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
3022 return gimple_assign_rhs1 (stmt
);
3025 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3026 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3027 be grouped in SLP mode. */
3030 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
,
3033 if (gimple_assign_single_p (stmt1_info
->stmt
))
3034 return gimple_assign_single_p (stmt2_info
->stmt
);
3036 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
3037 if (call1
&& gimple_call_internal_p (call1
))
3039 /* Check for two masked loads or two masked stores. */
3040 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
3041 if (!call2
|| !gimple_call_internal_p (call2
))
3043 internal_fn ifn
= gimple_call_internal_fn (call1
);
3044 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
3046 if (ifn
!= gimple_call_internal_fn (call2
))
3049 /* Check that the masks are the same. Cope with casts of masks,
3050 like those created by build_mask_conversion. */
3051 tree mask1
= gimple_call_arg (call1
, 2);
3052 tree mask2
= gimple_call_arg (call2
, 2);
3053 if (!operand_equal_p (mask1
, mask2
, 0)
3054 && (ifn
== IFN_MASK_STORE
|| !allow_slp_p
))
3056 mask1
= strip_conversion (mask1
);
3059 mask2
= strip_conversion (mask2
);
3062 if (!operand_equal_p (mask1
, mask2
, 0))
3071 /* Function vect_analyze_data_ref_accesses.
3073 Analyze the access pattern of all the data references in the loop.
3075 FORNOW: the only access pattern that is considered vectorizable is a
3076 simple step 1 (consecutive) access.
3078 FORNOW: handle only arrays and pointer accesses. */
3081 vect_analyze_data_ref_accesses (vec_info
*vinfo
,
3082 vec
<int> *dataref_groups
)
3085 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
3087 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3089 if (datarefs
.is_empty ())
3090 return opt_result::success ();
3092 /* Sort the array of datarefs to make building the interleaving chains
3093 linear. Don't modify the original vector's order, it is needed for
3094 determining what dependencies are reversed. */
3095 vec
<dr_vec_info
*> datarefs_copy
;
3096 datarefs_copy
.create (datarefs
.length ());
3097 for (unsigned i
= 0; i
< datarefs
.length (); i
++)
3099 dr_vec_info
*dr_info
= vinfo
->lookup_dr (datarefs
[i
]);
3100 /* If the caller computed DR grouping use that, otherwise group by
3103 dr_info
->group
= (*dataref_groups
)[i
];
3105 dr_info
->group
= gimple_bb (DR_STMT (datarefs
[i
]))->index
;
3106 datarefs_copy
.quick_push (dr_info
);
3108 datarefs_copy
.qsort (dr_group_sort_cmp
);
3109 hash_set
<stmt_vec_info
> to_fixup
;
3111 /* Build the interleaving chains. */
3112 for (i
= 0; i
< datarefs_copy
.length () - 1;)
3114 dr_vec_info
*dr_info_a
= datarefs_copy
[i
];
3115 data_reference_p dra
= dr_info_a
->dr
;
3116 int dra_group_id
= dr_info_a
->group
;
3117 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
3118 stmt_vec_info lastinfo
= NULL
;
3119 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
3120 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
3125 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
3127 dr_vec_info
*dr_info_b
= datarefs_copy
[i
];
3128 data_reference_p drb
= dr_info_b
->dr
;
3129 int drb_group_id
= dr_info_b
->group
;
3130 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
3131 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
3132 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
3135 /* ??? Imperfect sorting (non-compatible types, non-modulo
3136 accesses, same accesses) can lead to a group to be artificially
3137 split here as we don't just skip over those. If it really
3138 matters we can push those to a worklist and re-iterate
3139 over them. The we can just skip ahead to the next DR here. */
3141 /* DRs in a different DR group should not be put into the same
3142 interleaving group. */
3143 if (dra_group_id
!= drb_group_id
)
3146 /* Check that the data-refs have same first location (except init)
3147 and they are both either store or load (not load and store,
3148 not masked loads or stores). */
3149 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
3150 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
3151 DR_BASE_ADDRESS (drb
)) != 0
3152 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
3153 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3156 /* Check that the data-refs have the same constant size. */
3157 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3158 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3159 if (!tree_fits_uhwi_p (sza
)
3160 || !tree_fits_uhwi_p (szb
)
3161 || !tree_int_cst_equal (sza
, szb
))
3164 /* Check that the data-refs have the same step. */
3165 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3168 /* Check the types are compatible.
3169 ??? We don't distinguish this during sorting. */
3170 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3171 TREE_TYPE (DR_REF (drb
))))
3174 /* Check that the DR_INITs are compile-time constants. */
3175 if (!tree_fits_shwi_p (DR_INIT (dra
))
3176 || !tree_fits_shwi_p (DR_INIT (drb
)))
3179 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3180 just hold extra information. */
3181 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3182 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3183 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3186 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3187 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3188 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3189 HOST_WIDE_INT init_prev
3190 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]->dr
));
3191 gcc_assert (init_a
<= init_b
3192 && init_a
<= init_prev
3193 && init_prev
<= init_b
);
3195 /* Do not place the same access in the interleaving chain twice. */
3196 if (init_b
== init_prev
)
3198 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]->dr
))
3199 < gimple_uid (DR_STMT (drb
)));
3200 /* Simply link in duplicates and fix up the chain below. */
3204 /* If init_b == init_a + the size of the type * k, we have an
3205 interleaving, and DRA is accessed before DRB. */
3206 unsigned HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3207 if (type_size_a
== 0
3208 || (((unsigned HOST_WIDE_INT
)init_b
- init_a
)
3209 % type_size_a
!= 0))
3212 /* If we have a store, the accesses are adjacent. This splits
3213 groups into chunks we support (we don't support vectorization
3214 of stores with gaps). */
3215 if (!DR_IS_READ (dra
)
3216 && (((unsigned HOST_WIDE_INT
)init_b
- init_prev
)
3220 /* If the step (if not zero or non-constant) is smaller than the
3221 difference between data-refs' inits this splits groups into
3223 if (tree_fits_shwi_p (DR_STEP (dra
)))
3225 unsigned HOST_WIDE_INT step
3226 = absu_hwi (tree_to_shwi (DR_STEP (dra
)));
3228 && step
<= ((unsigned HOST_WIDE_INT
)init_b
- init_a
))
3233 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_NOTE
, vect_location
,
3236 ? "Detected interleaving load %T and %T\n"
3237 : "Detected interleaving store %T and %T\n",
3238 DR_REF (dra
), DR_REF (drb
));
3240 /* Link the found element into the group list. */
3241 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3243 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3244 lastinfo
= stmtinfo_a
;
3246 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3247 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3248 lastinfo
= stmtinfo_b
;
3250 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3251 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3253 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3254 dump_printf_loc (MSG_NOTE
, vect_location
,
3255 "Load suitable for SLP vectorization only.\n");
3257 if (init_b
== init_prev
3258 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3259 && dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE
, vect_location
,
3261 "Queuing group with duplicate access for fixup\n");
3265 /* Fixup groups with duplicate entries by splitting it. */
3268 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3269 if (!(it
!= to_fixup
.end ()))
3271 stmt_vec_info grp
= *it
;
3272 to_fixup
.remove (grp
);
3274 /* Find the earliest duplicate group member. */
3275 unsigned first_duplicate
= -1u;
3276 stmt_vec_info next
, g
= grp
;
3277 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3279 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3280 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3281 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3282 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3285 if (first_duplicate
== -1U)
3288 /* Then move all stmts after the first duplicate to a new group.
3289 Note this is a heuristic but one with the property that *it
3290 is fixed up completely. */
3292 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3293 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3295 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3297 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3301 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3303 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3306 g
= DR_GROUP_NEXT_ELEMENT (g
);
3308 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3310 /* Fixup the new group which still may contain duplicates. */
3311 to_fixup
.add (newgroup
);
3314 dr_vec_info
*dr_info
;
3315 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr_info
)
3317 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3318 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3322 "not vectorized: complicated access pattern.\n");
3324 if (is_a
<bb_vec_info
> (vinfo
))
3326 /* Mark the statement as not vectorizable. */
3327 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3332 datarefs_copy
.release ();
3333 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3335 " complicated access pattern.\n");
3340 datarefs_copy
.release ();
3341 return opt_result::success ();
3344 /* Function vect_vfa_segment_size.
3347 DR_INFO: The data reference.
3348 LENGTH_FACTOR: segment length to consider.
3350 Return a value suitable for the dr_with_seg_len::seg_len field.
3351 This is the "distance travelled" by the pointer from the first
3352 iteration in the segment to the last. Note that it does not include
3353 the size of the access; in effect it only describes the first byte. */
3356 vect_vfa_segment_size (dr_vec_info
*dr_info
, tree length_factor
)
3358 length_factor
= size_binop (MINUS_EXPR
,
3359 fold_convert (sizetype
, length_factor
),
3361 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr_info
->dr
)),
3365 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3366 gives the worst-case number of bytes covered by the segment. */
3368 static unsigned HOST_WIDE_INT
3369 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3371 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3372 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3373 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3374 unsigned HOST_WIDE_INT access_size
= ref_size
;
3375 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3377 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3378 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3380 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3382 if (STMT_VINFO_VEC_STMTS (stmt_vinfo
).exists ()
3383 && ((misalignment
= dr_misalignment (dr_info
, vectype
)), true)
3384 && (vect_supportable_dr_alignment (vinfo
, dr_info
, vectype
, misalignment
)
3385 == dr_explicit_realign_optimized
))
3387 /* We might access a full vector's worth. */
3388 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3393 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3397 vect_vfa_align (dr_vec_info
*dr_info
)
3399 return dr_alignment (dr_info
->dr
);
3402 /* Function vect_no_alias_p.
3404 Given data references A and B with equal base and offset, see whether
3405 the alias relation can be decided at compilation time. Return 1 if
3406 it can and the references alias, 0 if it can and the references do
3407 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3408 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3409 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3412 vect_compile_time_alias (dr_vec_info
*a
, dr_vec_info
*b
,
3413 tree segment_length_a
, tree segment_length_b
,
3414 unsigned HOST_WIDE_INT access_size_a
,
3415 unsigned HOST_WIDE_INT access_size_b
)
3417 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
->dr
));
3418 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
->dr
));
3419 poly_uint64 const_length_a
;
3420 poly_uint64 const_length_b
;
3422 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3423 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3425 if (tree_int_cst_compare (DR_STEP (a
->dr
), size_zero_node
) < 0)
3427 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3428 offset_a
-= const_length_a
;
3431 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3432 if (tree_int_cst_compare (DR_STEP (b
->dr
), size_zero_node
) < 0)
3434 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3435 offset_b
-= const_length_b
;
3438 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3440 const_length_a
+= access_size_a
;
3441 const_length_b
+= access_size_b
;
3443 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3444 offset_b
, const_length_b
))
3447 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3448 offset_b
, const_length_b
))
3454 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3458 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3459 unsigned int loop_depth
, poly_uint64 vf
)
3461 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3462 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3465 /* If the dependence is exact, we should have limited the VF instead. */
3466 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3469 lambda_vector dist_v
;
3470 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3472 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3474 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3475 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3479 if (dump_enabled_p ())
3480 dump_printf_loc (MSG_NOTE
, vect_location
,
3481 "dependence distance between %T and %T is >= VF\n",
3482 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3487 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3490 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3492 dump_printf (dump_kind
, "%s (%T) >= ",
3493 lower_bound
.unsigned_p
? "unsigned" : "abs",
3495 dump_dec (dump_kind
, lower_bound
.min_value
);
3498 /* Record that the vectorized loop requires the vec_lower_bound described
3499 by EXPR, UNSIGNED_P and MIN_VALUE. */
3502 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3503 poly_uint64 min_value
)
3505 vec
<vec_lower_bound
> &lower_bounds
3506 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3507 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3508 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3510 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3511 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3512 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3513 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3515 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3516 lower_bounds
[i
].min_value
= min_value
;
3517 if (dump_enabled_p ())
3519 dump_printf_loc (MSG_NOTE
, vect_location
,
3520 "updating run-time check to ");
3521 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3522 dump_printf (MSG_NOTE
, "\n");
3528 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3529 if (dump_enabled_p ())
3531 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3532 dump_lower_bound (MSG_NOTE
, lower_bound
);
3533 dump_printf (MSG_NOTE
, "\n");
3535 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3538 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3539 will span fewer than GAP bytes. */
3542 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3545 stmt_vec_info stmt_info
= dr_info
->stmt
;
3547 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3548 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3549 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3550 return (estimated_poly_value (gap
)
3551 <= count
* vect_get_scalar_dr_size (dr_info
));
3554 /* Return true if we know that there is no alias between DR_INFO_A and
3555 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3556 When returning true, set *LOWER_BOUND_OUT to this N. */
3559 vectorizable_with_step_bound_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
,
3560 poly_uint64
*lower_bound_out
)
3562 /* Check that there is a constant gap of known sign between DR_A
3564 data_reference
*dr_a
= dr_info_a
->dr
;
3565 data_reference
*dr_b
= dr_info_b
->dr
;
3566 poly_int64 init_a
, init_b
;
3567 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3568 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3569 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3570 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3571 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3572 || !ordered_p (init_a
, init_b
))
3575 /* Sort DR_A and DR_B by the address they access. */
3576 if (maybe_lt (init_b
, init_a
))
3578 std::swap (init_a
, init_b
);
3579 std::swap (dr_info_a
, dr_info_b
);
3580 std::swap (dr_a
, dr_b
);
3583 /* If the two accesses could be dependent within a scalar iteration,
3584 make sure that we'd retain their order. */
3585 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_info_a
), init_b
)
3586 && !vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
3589 /* There is no alias if abs (DR_STEP) is greater than or equal to
3590 the bytes spanned by the combination of the two accesses. */
3591 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_info_b
) - init_a
;
3595 /* Function vect_prune_runtime_alias_test_list.
3597 Prune a list of ddrs to be tested at run-time by versioning for alias.
3598 Merge several alias checks into one if possible.
3599 Return FALSE if resulting list of ddrs is longer then allowed by
3600 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3603 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3605 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3606 hash_set
<tree_pair_hash
> compared_objects
;
3608 const vec
<ddr_p
> &may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3609 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3610 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3611 const vec
<vec_object_pair
> &check_unequal_addrs
3612 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3613 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3614 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3620 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3622 /* Step values are irrelevant for aliasing if the number of vector
3623 iterations is equal to the number of scalar iterations (which can
3624 happen for fully-SLP loops). */
3625 bool vf_one_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3629 /* Convert the checks for nonzero steps into bound tests. */
3631 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3632 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3635 if (may_alias_ddrs
.is_empty ())
3636 return opt_result::success ();
3638 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3640 unsigned int loop_depth
3641 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3642 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3644 /* First, we collect all data ref pairs for aliasing checks. */
3645 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3647 poly_uint64 lower_bound
;
3648 tree segment_length_a
, segment_length_b
;
3649 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3650 unsigned int align_a
, align_b
;
3652 /* Ignore the alias if the VF we chose ended up being no greater
3653 than the dependence distance. */
3654 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3657 if (DDR_OBJECT_A (ddr
))
3659 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3660 if (!compared_objects
.add (new_pair
))
3662 if (dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE
, vect_location
,
3664 "checking that %T and %T"
3665 " have different addresses\n",
3666 new_pair
.first
, new_pair
.second
);
3667 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3672 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3673 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3675 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3676 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3678 bool preserves_scalar_order_p
3679 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3682 && (preserves_scalar_order_p
3683 || operand_equal_p (DR_STEP (dr_info_a
->dr
),
3684 DR_STEP (dr_info_b
->dr
))));
3686 /* Skip the pair if inter-iteration dependencies are irrelevant
3687 and intra-iteration dependencies are guaranteed to be honored. */
3689 && (preserves_scalar_order_p
3690 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE
, vect_location
,
3695 "no need for alias check between "
3696 "%T and %T when VF is 1\n",
3697 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3701 /* See whether we can handle the alias using a bounds check on
3702 the step, and whether that's likely to be the best approach.
3703 (It might not be, for example, if the minimum step is much larger
3704 than the number of bytes handled by one vector iteration.) */
3706 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3707 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3709 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3710 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3712 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3713 if (dump_enabled_p ())
3715 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3716 "%T and %T when the step %T is outside ",
3717 DR_REF (dr_info_a
->dr
),
3718 DR_REF (dr_info_b
->dr
),
3719 DR_STEP (dr_info_a
->dr
));
3721 dump_printf (MSG_NOTE
, "[0");
3724 dump_printf (MSG_NOTE
, "(");
3725 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3727 dump_printf (MSG_NOTE
, ", ");
3728 dump_dec (MSG_NOTE
, lower_bound
);
3729 dump_printf (MSG_NOTE
, ")\n");
3731 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
3732 unsigned_p
, lower_bound
);
3736 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
3737 if (dr_group_first_a
)
3739 stmt_info_a
= dr_group_first_a
;
3740 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
3743 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
3744 if (dr_group_first_b
)
3746 stmt_info_b
= dr_group_first_b
;
3747 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
3752 segment_length_a
= size_zero_node
;
3753 segment_length_b
= size_zero_node
;
3757 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
3758 DR_STEP (dr_info_b
->dr
), 0))
3759 length_factor
= scalar_loop_iters
;
3761 length_factor
= size_int (vect_factor
);
3762 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
3763 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
3765 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
3766 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
3767 align_a
= vect_vfa_align (dr_info_a
);
3768 align_b
= vect_vfa_align (dr_info_b
);
3770 /* See whether the alias is known at compilation time. */
3771 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
3772 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
3773 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
3774 DR_OFFSET (dr_info_b
->dr
), 0)
3775 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
3776 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
3777 && poly_int_tree_p (segment_length_a
)
3778 && poly_int_tree_p (segment_length_b
))
3780 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
3785 if (res
>= 0 && dump_enabled_p ())
3787 dump_printf_loc (MSG_NOTE
, vect_location
,
3788 "can tell at compile time that %T and %T",
3789 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3791 dump_printf (MSG_NOTE
, " do not alias\n");
3793 dump_printf (MSG_NOTE
, " alias\n");
3800 return opt_result::failure_at (stmt_info_b
->stmt
,
3802 " compilation time alias: %G%G",
3807 dr_with_seg_len
dr_a (dr_info_a
->dr
, segment_length_a
,
3808 access_size_a
, align_a
);
3809 dr_with_seg_len
dr_b (dr_info_b
->dr
, segment_length_b
,
3810 access_size_b
, align_b
);
3811 /* Canonicalize the order to be the one that's needed for accurate
3812 RAW, WAR and WAW flags, in cases where the data references are
3813 well-ordered. The order doesn't really matter otherwise,
3814 but we might as well be consistent. */
3815 if (get_later_stmt (stmt_info_a
, stmt_info_b
) == stmt_info_a
)
3816 std::swap (dr_a
, dr_b
);
3818 dr_with_seg_len_pair_t dr_with_seg_len_pair
3819 (dr_a
, dr_b
, (preserves_scalar_order_p
3820 ? dr_with_seg_len_pair_t::WELL_ORDERED
3821 : dr_with_seg_len_pair_t::REORDERED
));
3823 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3826 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
3828 unsigned int count
= (comp_alias_ddrs
.length ()
3829 + check_unequal_addrs
.length ());
3832 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
))
3833 == VECT_COST_MODEL_VERY_CHEAP
))
3834 return opt_result::failure_at
3835 (vect_location
, "would need a runtime alias check\n");
3837 if (dump_enabled_p ())
3838 dump_printf_loc (MSG_NOTE
, vect_location
,
3839 "improved number of alias checks from %d to %d\n",
3840 may_alias_ddrs
.length (), count
);
3841 unsigned limit
= param_vect_max_version_for_alias_checks
;
3842 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)) == VECT_COST_MODEL_CHEAP
)
3843 limit
= param_vect_max_version_for_alias_checks
* 6 / 10;
3845 return opt_result::failure_at
3847 "number of versioning for alias run-time tests exceeds %d "
3848 "(--param vect-max-version-for-alias-checks)\n", limit
);
3850 return opt_result::success ();
3853 /* Check whether we can use an internal function for a gather load
3854 or scatter store. READ_P is true for loads and false for stores.
3855 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3856 the type of the memory elements being loaded or stored. OFFSET_TYPE
3857 is the type of the offset that is being applied to the invariant
3858 base address. SCALE is the amount by which the offset should
3859 be multiplied *after* it has been converted to address width.
3861 Return true if the function is supported, storing the function id in
3862 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3865 vect_gather_scatter_fn_p (vec_info
*vinfo
, bool read_p
, bool masked_p
,
3866 tree vectype
, tree memory_type
, tree offset_type
,
3867 int scale
, internal_fn
*ifn_out
,
3868 tree
*offset_vectype_out
)
3870 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
3871 unsigned int element_bits
= vector_element_bits (vectype
);
3872 if (element_bits
!= memory_bits
)
3873 /* For now the vector elements must be the same width as the
3877 /* Work out which function we need. */
3878 internal_fn ifn
, alt_ifn
;
3881 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
3882 alt_ifn
= IFN_MASK_GATHER_LOAD
;
3886 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
3887 alt_ifn
= IFN_MASK_SCATTER_STORE
;
3892 tree offset_vectype
= get_vectype_for_scalar_type (vinfo
, offset_type
);
3893 if (!offset_vectype
)
3896 /* Test whether the target supports this combination. */
3897 if (internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
3898 offset_vectype
, scale
))
3901 *offset_vectype_out
= offset_vectype
;
3905 && internal_gather_scatter_fn_supported_p (alt_ifn
, vectype
,
3911 *offset_vectype_out
= offset_vectype
;
3915 if (TYPE_PRECISION (offset_type
) >= POINTER_SIZE
3916 && TYPE_PRECISION (offset_type
) >= element_bits
)
3919 offset_type
= build_nonstandard_integer_type
3920 (TYPE_PRECISION (offset_type
) * 2, TYPE_UNSIGNED (offset_type
));
3924 /* STMT_INFO is a call to an internal gather load or scatter store function.
3925 Describe the operation in INFO. */
3928 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
3929 gather_scatter_info
*info
)
3931 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
3932 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3933 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3935 info
->ifn
= gimple_call_internal_fn (call
);
3936 info
->decl
= NULL_TREE
;
3937 info
->base
= gimple_call_arg (call
, 0);
3938 info
->offset
= gimple_call_arg (call
, 1);
3939 info
->offset_dt
= vect_unknown_def_type
;
3940 info
->offset_vectype
= NULL_TREE
;
3941 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
3942 info
->element_type
= TREE_TYPE (vectype
);
3943 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
3946 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3947 gather load or scatter store. Describe the operation in *INFO if so. */
3950 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
3951 gather_scatter_info
*info
)
3953 HOST_WIDE_INT scale
= 1;
3954 poly_int64 pbitpos
, pbitsize
;
3955 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3956 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3957 tree offtype
= NULL_TREE
;
3958 tree decl
= NULL_TREE
, base
, off
;
3959 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3960 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3962 int punsignedp
, reversep
, pvolatilep
= 0;
3964 tree offset_vectype
;
3965 bool masked_p
= false;
3967 /* See whether this is already a call to a gather/scatter internal function.
3968 If not, see whether it's a masked load or store. */
3969 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
3970 if (call
&& gimple_call_internal_p (call
))
3972 ifn
= gimple_call_internal_fn (call
);
3973 if (internal_gather_scatter_fn_p (ifn
))
3975 vect_describe_gather_scatter_call (stmt_info
, info
);
3978 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
3981 /* True if we should aim to use internal functions rather than
3982 built-in functions. */
3983 bool use_ifn_p
= (DR_IS_READ (dr
)
3984 ? supports_vec_gather_load_p (TYPE_MODE (vectype
))
3985 : supports_vec_scatter_store_p (TYPE_MODE (vectype
)));
3988 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3989 see if we can use the def stmt of the address. */
3991 && TREE_CODE (base
) == MEM_REF
3992 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3993 && integer_zerop (TREE_OPERAND (base
, 1))
3994 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3996 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3997 if (is_gimple_assign (def_stmt
)
3998 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3999 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
4002 /* The gather and scatter builtins need address of the form
4003 loop_invariant + vector * {1, 2, 4, 8}
4005 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4006 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4007 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4008 multiplications and additions in it. To get a vector, we need
4009 a single SSA_NAME that will be defined in the loop and will
4010 contain everything that is not loop invariant and that can be
4011 vectorized. The following code attempts to find such a preexistng
4012 SSA_NAME OFF and put the loop invariants into a tree BASE
4013 that can be gimplified before the loop. */
4014 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
4015 &punsignedp
, &reversep
, &pvolatilep
);
4019 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
4021 if (TREE_CODE (base
) == MEM_REF
)
4023 if (!integer_zerop (TREE_OPERAND (base
, 1)))
4025 if (off
== NULL_TREE
)
4026 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
4028 off
= size_binop (PLUS_EXPR
, off
,
4029 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
4031 base
= TREE_OPERAND (base
, 0);
4034 base
= build_fold_addr_expr (base
);
4036 if (off
== NULL_TREE
)
4037 off
= size_zero_node
;
4039 /* If base is not loop invariant, either off is 0, then we start with just
4040 the constant offset in the loop invariant BASE and continue with base
4041 as OFF, otherwise give up.
4042 We could handle that case by gimplifying the addition of base + off
4043 into some SSA_NAME and use that as off, but for now punt. */
4044 if (!expr_invariant_in_loop_p (loop
, base
))
4046 if (!integer_zerop (off
))
4049 base
= size_int (pbytepos
);
4051 /* Otherwise put base + constant offset into the loop invariant BASE
4052 and continue with OFF. */
4055 base
= fold_convert (sizetype
, base
);
4056 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
4059 /* OFF at this point may be either a SSA_NAME or some tree expression
4060 from get_inner_reference. Try to peel off loop invariants from it
4061 into BASE as long as possible. */
4063 while (offtype
== NULL_TREE
)
4065 enum tree_code code
;
4066 tree op0
, op1
, add
= NULL_TREE
;
4068 if (TREE_CODE (off
) == SSA_NAME
)
4070 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
4072 if (expr_invariant_in_loop_p (loop
, off
))
4075 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
4078 op0
= gimple_assign_rhs1 (def_stmt
);
4079 code
= gimple_assign_rhs_code (def_stmt
);
4080 op1
= gimple_assign_rhs2 (def_stmt
);
4084 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
4086 code
= TREE_CODE (off
);
4087 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
4091 case POINTER_PLUS_EXPR
:
4093 if (expr_invariant_in_loop_p (loop
, op0
))
4098 add
= fold_convert (sizetype
, add
);
4100 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
4101 base
= size_binop (PLUS_EXPR
, base
, add
);
4104 if (expr_invariant_in_loop_p (loop
, op1
))
4112 if (expr_invariant_in_loop_p (loop
, op1
))
4114 add
= fold_convert (sizetype
, op1
);
4115 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
4121 if (scale
== 1 && tree_fits_shwi_p (op1
))
4123 int new_scale
= tree_to_shwi (op1
);
4124 /* Only treat this as a scaling operation if the target
4125 supports it for at least some offset type. */
4127 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4128 masked_p
, vectype
, memory_type
,
4129 signed_char_type_node
,
4132 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4133 masked_p
, vectype
, memory_type
,
4134 unsigned_char_type_node
,
4147 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
4148 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
4151 /* Don't include the conversion if the target is happy with
4152 the current offset type. */
4154 && TREE_CODE (off
) == SSA_NAME
4155 && !POINTER_TYPE_P (TREE_TYPE (off
))
4156 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4157 masked_p
, vectype
, memory_type
,
4158 TREE_TYPE (off
), scale
, &ifn
,
4162 if (TYPE_PRECISION (TREE_TYPE (op0
))
4163 == TYPE_PRECISION (TREE_TYPE (off
)))
4169 /* Include the conversion if it is widening and we're using
4170 the IFN path or the target can handle the converted from
4171 offset or the current size is not already the same as the
4172 data vector element size. */
4173 if ((TYPE_PRECISION (TREE_TYPE (op0
))
4174 < TYPE_PRECISION (TREE_TYPE (off
)))
4177 ? (targetm
.vectorize
.builtin_gather
4178 && targetm
.vectorize
.builtin_gather (vectype
,
4181 : (targetm
.vectorize
.builtin_scatter
4182 && targetm
.vectorize
.builtin_scatter (vectype
,
4185 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off
)),
4186 TYPE_SIZE (TREE_TYPE (vectype
)), 0)))
4189 offtype
= TREE_TYPE (off
);
4200 /* If at the end OFF still isn't a SSA_NAME or isn't
4201 defined in the loop, punt. */
4202 if (TREE_CODE (off
) != SSA_NAME
4203 || expr_invariant_in_loop_p (loop
, off
))
4206 if (offtype
== NULL_TREE
)
4207 offtype
= TREE_TYPE (off
);
4211 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4212 vectype
, memory_type
, offtype
, scale
,
4213 &ifn
, &offset_vectype
))
4219 if (DR_IS_READ (dr
))
4221 if (targetm
.vectorize
.builtin_gather
)
4222 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4226 if (targetm
.vectorize
.builtin_scatter
)
4227 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4230 /* The offset vector type will be read from DECL when needed. */
4231 offset_vectype
= NULL_TREE
;
4238 info
->offset_dt
= vect_unknown_def_type
;
4239 info
->offset_vectype
= offset_vectype
;
4240 info
->scale
= scale
;
4241 info
->element_type
= TREE_TYPE (vectype
);
4242 info
->memory_type
= memory_type
;
4246 /* Find the data references in STMT, analyze them with respect to LOOP and
4247 append them to DATAREFS. Return false if datarefs in this stmt cannot
4251 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4252 vec
<data_reference_p
> *datarefs
,
4253 vec
<int> *dataref_groups
, int group_id
)
4255 /* We can ignore clobbers for dataref analysis - they are removed during
4256 loop vectorization and BB vectorization checks dependences with a
4258 if (gimple_clobber_p (stmt
))
4259 return opt_result::success ();
4261 if (gimple_has_volatile_ops (stmt
))
4262 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4265 if (stmt_can_throw_internal (cfun
, stmt
))
4266 return opt_result::failure_at (stmt
,
4268 " statement can throw an exception: %G",
4271 auto_vec
<data_reference_p
, 2> refs
;
4272 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4276 if (refs
.is_empty ())
4277 return opt_result::success ();
4279 if (refs
.length () > 1)
4281 while (!refs
.is_empty ())
4282 free_data_ref (refs
.pop ());
4283 return opt_result::failure_at (stmt
,
4284 "not vectorized: more than one "
4285 "data ref in stmt: %G", stmt
);
4288 data_reference_p dr
= refs
.pop ();
4289 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4290 if (!gimple_call_internal_p (call
)
4291 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4292 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4295 return opt_result::failure_at (stmt
,
4296 "not vectorized: dr in a call %G", stmt
);
4299 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4300 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4303 return opt_result::failure_at (stmt
,
4305 " statement is bitfield access %G", stmt
);
4308 if (DR_BASE_ADDRESS (dr
)
4309 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4312 return opt_result::failure_at (stmt
,
4314 " base addr of dr is a constant\n");
4317 /* Check whether this may be a SIMD lane access and adjust the
4318 DR to make it easier for us to handle it. */
4321 && (!DR_BASE_ADDRESS (dr
)
4326 struct data_reference
*newdr
4327 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4328 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4329 if (DR_BASE_ADDRESS (newdr
)
4330 && DR_OFFSET (newdr
)
4333 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4334 && integer_zerop (DR_STEP (newdr
)))
4336 tree base_address
= DR_BASE_ADDRESS (newdr
);
4337 tree off
= DR_OFFSET (newdr
);
4338 tree step
= ssize_int (1);
4339 if (integer_zerop (off
)
4340 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4342 off
= TREE_OPERAND (base_address
, 1);
4343 base_address
= TREE_OPERAND (base_address
, 0);
4346 if (TREE_CODE (off
) == MULT_EXPR
4347 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4349 step
= TREE_OPERAND (off
, 1);
4350 off
= TREE_OPERAND (off
, 0);
4353 if (CONVERT_EXPR_P (off
)
4354 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4355 < TYPE_PRECISION (TREE_TYPE (off
))))
4356 off
= TREE_OPERAND (off
, 0);
4357 if (TREE_CODE (off
) == SSA_NAME
)
4359 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4360 /* Look through widening conversion. */
4361 if (is_gimple_assign (def
)
4362 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4364 tree rhs1
= gimple_assign_rhs1 (def
);
4365 if (TREE_CODE (rhs1
) == SSA_NAME
4366 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4367 && (TYPE_PRECISION (TREE_TYPE (off
))
4368 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4369 def
= SSA_NAME_DEF_STMT (rhs1
);
4371 if (is_gimple_call (def
)
4372 && gimple_call_internal_p (def
)
4373 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4375 tree arg
= gimple_call_arg (def
, 0);
4376 tree reft
= TREE_TYPE (DR_REF (newdr
));
4377 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4378 arg
= SSA_NAME_VAR (arg
);
4379 if (arg
== loop
->simduid
4381 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4383 DR_BASE_ADDRESS (newdr
) = base_address
;
4384 DR_OFFSET (newdr
) = ssize_int (0);
4385 DR_STEP (newdr
) = step
;
4386 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4387 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4388 /* Mark as simd-lane access. */
4389 tree arg2
= gimple_call_arg (def
, 1);
4390 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4392 datarefs
->safe_push (newdr
);
4394 dataref_groups
->safe_push (group_id
);
4395 return opt_result::success ();
4400 free_data_ref (newdr
);
4403 datarefs
->safe_push (dr
);
4405 dataref_groups
->safe_push (group_id
);
4406 return opt_result::success ();
4409 /* Function vect_analyze_data_refs.
4411 Find all the data references in the loop or basic block.
4413 The general structure of the analysis of data refs in the vectorizer is as
4415 1- vect_analyze_data_refs(loop/bb): call
4416 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4417 in the loop/bb and their dependences.
4418 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4419 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4420 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4425 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4427 class loop
*loop
= NULL
;
4429 struct data_reference
*dr
;
4432 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4434 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4435 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4437 /* Go through the data-refs, check that the analysis succeeded. Update
4438 pointer from stmt_vec_info struct to DR and vectype. */
4440 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4441 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4443 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4446 gcc_assert (DR_REF (dr
));
4447 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4448 gcc_assert (!stmt_info
->dr_aux
.dr
);
4449 stmt_info
->dr_aux
.dr
= dr
;
4450 stmt_info
->dr_aux
.stmt
= stmt_info
;
4452 /* Check that analysis of the data-ref succeeded. */
4453 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4458 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4461 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4462 && (targetm
.vectorize
.builtin_scatter
!= NULL
4463 || supports_vec_scatter_store_p ());
4465 /* If target supports vector gather loads or scatter stores,
4466 see if they can't be used. */
4467 if (is_a
<loop_vec_info
> (vinfo
)
4468 && !nested_in_vect_loop_p (loop
, stmt_info
))
4470 if (maybe_gather
|| maybe_scatter
)
4473 gatherscatter
= GATHER
;
4475 gatherscatter
= SCATTER
;
4479 if (gatherscatter
== SG_NONE
)
4481 if (dump_enabled_p ())
4482 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4483 "not vectorized: data ref analysis "
4484 "failed %G", stmt_info
->stmt
);
4485 if (is_a
<bb_vec_info
> (vinfo
))
4487 /* In BB vectorization the ref can still participate
4488 in dependence analysis, we just can't vectorize it. */
4489 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4492 return opt_result::failure_at (stmt_info
->stmt
,
4494 " data ref analysis failed: %G",
4499 /* See if this was detected as SIMD lane access. */
4500 if (dr
->aux
== (void *)-1
4501 || dr
->aux
== (void *)-2
4502 || dr
->aux
== (void *)-3
4503 || dr
->aux
== (void *)-4)
4505 if (nested_in_vect_loop_p (loop
, stmt_info
))
4506 return opt_result::failure_at (stmt_info
->stmt
,
4508 " data ref analysis failed: %G",
4510 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4511 = -(uintptr_t) dr
->aux
;
4514 tree base
= get_base_address (DR_REF (dr
));
4515 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4517 if (dump_enabled_p ())
4518 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4519 "not vectorized: base object not addressable "
4520 "for stmt: %G", stmt_info
->stmt
);
4521 if (is_a
<bb_vec_info
> (vinfo
))
4523 /* In BB vectorization the ref can still participate
4524 in dependence analysis, we just can't vectorize it. */
4525 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4528 return opt_result::failure_at (stmt_info
->stmt
,
4529 "not vectorized: base object not"
4530 " addressable for stmt: %G",
4534 if (is_a
<loop_vec_info
> (vinfo
)
4536 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4538 if (nested_in_vect_loop_p (loop
, stmt_info
))
4539 return opt_result::failure_at (stmt_info
->stmt
,
4541 "not suitable for strided load %G",
4543 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4546 /* Update DR field in stmt_vec_info struct. */
4548 /* If the dataref is in an inner-loop of the loop that is considered for
4549 vectorization, we also want to analyze the access relative to
4550 the outer-loop (DR contains information only relative to the
4551 inner-most enclosing loop). We do that by building a reference to the
4552 first location accessed by the inner-loop, and analyze it relative to
4554 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4556 /* Build a reference to the first location accessed by the
4557 inner loop: *(BASE + INIT + OFFSET). By construction,
4558 this address must be invariant in the inner loop, so we
4559 can consider it as being used in the outer loop. */
4560 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4561 tree offset
= unshare_expr (DR_OFFSET (dr
));
4562 tree init
= unshare_expr (DR_INIT (dr
));
4563 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4565 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4566 tree init_ref
= build_fold_indirect_ref (init_addr
);
4568 if (dump_enabled_p ())
4569 dump_printf_loc (MSG_NOTE
, vect_location
,
4570 "analyze in outer loop: %T\n", init_ref
);
4573 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4574 init_ref
, loop
, stmt_info
->stmt
);
4576 /* dr_analyze_innermost already explained the failure. */
4579 if (dump_enabled_p ())
4580 dump_printf_loc (MSG_NOTE
, vect_location
,
4581 "\touter base_address: %T\n"
4582 "\touter offset from base address: %T\n"
4583 "\touter constant offset from base address: %T\n"
4584 "\touter step: %T\n"
4585 "\touter base alignment: %d\n\n"
4586 "\touter base misalignment: %d\n"
4587 "\touter offset alignment: %d\n"
4588 "\touter step alignment: %d\n",
4589 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4590 STMT_VINFO_DR_OFFSET (stmt_info
),
4591 STMT_VINFO_DR_INIT (stmt_info
),
4592 STMT_VINFO_DR_STEP (stmt_info
),
4593 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4594 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4595 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4596 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4599 /* Set vectype for STMT. */
4600 scalar_type
= TREE_TYPE (DR_REF (dr
));
4601 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4604 if (dump_enabled_p ())
4606 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4607 "not vectorized: no vectype for stmt: %G",
4609 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4610 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4612 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4615 if (is_a
<bb_vec_info
> (vinfo
))
4617 /* No vector type is fine, the ref can still participate
4618 in dependence analysis, we just can't vectorize it. */
4619 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4624 return opt_result::failure_at (stmt_info
->stmt
,
4626 " no vectype for stmt: %G"
4627 " scalar_type: %T\n",
4628 stmt_info
->stmt
, scalar_type
);
4632 if (dump_enabled_p ())
4633 dump_printf_loc (MSG_NOTE
, vect_location
,
4634 "got vectype for stmt: %G%T\n",
4635 stmt_info
->stmt
, vectype
);
4638 /* Adjust the minimal vectorization factor according to the
4640 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4641 *min_vf
= upper_bound (*min_vf
, vf
);
4643 /* Leave the BB vectorizer to pick the vector type later, based on
4644 the final dataref group size and SLP node size. */
4645 if (is_a
<loop_vec_info
> (vinfo
))
4646 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
4648 if (gatherscatter
!= SG_NONE
)
4650 gather_scatter_info gs_info
;
4651 if (!vect_check_gather_scatter (stmt_info
,
4652 as_a
<loop_vec_info
> (vinfo
),
4654 || !get_vectype_for_scalar_type (vinfo
,
4655 TREE_TYPE (gs_info
.offset
)))
4659 return opt_result::failure_at
4661 (gatherscatter
== GATHER
)
4662 ? "not vectorized: not suitable for gather load %G"
4663 : "not vectorized: not suitable for scatter store %G",
4666 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4670 /* We used to stop processing and prune the list here. Verify we no
4672 gcc_assert (i
== datarefs
.length ());
4674 return opt_result::success ();
4678 /* Function vect_get_new_vect_var.
4680 Returns a name for a new variable. The current naming scheme appends the
4681 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4682 the name of vectorizer generated variables, and appends that to NAME if
4686 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4693 case vect_simple_var
:
4696 case vect_scalar_var
:
4702 case vect_pointer_var
:
4711 char* tmp
= concat (prefix
, "_", name
, NULL
);
4712 new_vect_var
= create_tmp_reg (type
, tmp
);
4716 new_vect_var
= create_tmp_reg (type
, prefix
);
4718 return new_vect_var
;
4721 /* Like vect_get_new_vect_var but return an SSA name. */
4724 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4731 case vect_simple_var
:
4734 case vect_scalar_var
:
4737 case vect_pointer_var
:
4746 char* tmp
= concat (prefix
, "_", name
, NULL
);
4747 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4751 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4753 return new_vect_var
;
4756 /* Duplicate points-to info on NAME from DR_INFO. */
4759 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
4761 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
4762 /* DR_PTR_INFO is for a base SSA name, not including constant or
4763 variable offsets in the ref so its alignment info does not apply. */
4764 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4767 /* Function vect_create_addr_base_for_vector_ref.
4769 Create an expression that computes the address of the first memory location
4770 that will be accessed for a data reference.
4773 STMT_INFO: The statement containing the data reference.
4774 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4775 OFFSET: Optional. If supplied, it is added to the initial address.
4776 LOOP: Specify relative to which loop-nest should the address be computed.
4777 For example, when the dataref is in an inner-loop nested in an
4778 outer-loop that is now being vectorized, LOOP can be either the
4779 outer-loop, or the inner-loop. The first memory location accessed
4780 by the following dataref ('in' points to short):
4787 if LOOP=i_loop: &in (relative to i_loop)
4788 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4791 1. Return an SSA_NAME whose value is the address of the memory location of
4792 the first vector of the data reference.
4793 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4794 these statement(s) which define the returned SSA_NAME.
4796 FORNOW: We are only handling array accesses with step 1. */
4799 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4800 gimple_seq
*new_stmt_list
,
4803 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4804 struct data_reference
*dr
= dr_info
->dr
;
4805 const char *base_name
;
4808 gimple_seq seq
= NULL
;
4810 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4811 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
4813 tree data_ref_base
= unshare_expr (drb
->base_address
);
4814 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
4815 tree init
= unshare_expr (drb
->init
);
4818 base_name
= get_name (data_ref_base
);
4821 base_offset
= ssize_int (0);
4822 init
= ssize_int (0);
4823 base_name
= get_name (DR_REF (dr
));
4826 /* Create base_offset */
4827 base_offset
= size_binop (PLUS_EXPR
,
4828 fold_convert (sizetype
, base_offset
),
4829 fold_convert (sizetype
, init
));
4833 offset
= fold_convert (sizetype
, offset
);
4834 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4835 base_offset
, offset
);
4838 /* base + base_offset */
4840 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4843 addr_base
= build1 (ADDR_EXPR
,
4844 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4845 unshare_expr (DR_REF (dr
)));
4848 vect_ptr_type
= build_pointer_type (TREE_TYPE (DR_REF (dr
)));
4849 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4850 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4851 gimple_seq_add_seq (new_stmt_list
, seq
);
4853 if (DR_PTR_INFO (dr
)
4854 && TREE_CODE (addr_base
) == SSA_NAME
4855 /* We should only duplicate pointer info to newly created SSA names. */
4856 && SSA_NAME_VAR (addr_base
) == dest
)
4858 gcc_assert (!SSA_NAME_PTR_INFO (addr_base
));
4859 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
4862 if (dump_enabled_p ())
4863 dump_printf_loc (MSG_NOTE
, vect_location
, "created %T\n", addr_base
);
4869 /* Function vect_create_data_ref_ptr.
4871 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4872 location accessed in the loop by STMT_INFO, along with the def-use update
4873 chain to appropriately advance the pointer through the loop iterations.
4874 Also set aliasing information for the pointer. This pointer is used by
4875 the callers to this function to create a memory reference expression for
4876 vector load/store access.
4879 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4880 GIMPLE_ASSIGN <name, data-ref> or
4881 GIMPLE_ASSIGN <data-ref, name>.
4882 2. AGGR_TYPE: the type of the reference, which should be either a vector
4884 3. AT_LOOP: the loop where the vector memref is to be created.
4885 4. OFFSET (optional): a byte offset to be added to the initial address
4886 accessed by the data-ref in STMT_INFO.
4887 5. BSI: location where the new stmts are to be placed if there is no loop
4888 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4889 pointing to the initial address.
4890 7. IV_STEP (optional, defaults to NULL): the amount that should be added
4891 to the IV during each iteration of the loop. NULL says to move
4892 by one copy of AGGR_TYPE up or down, depending on the step of the
4896 1. Declare a new ptr to vector_type, and have it point to the base of the
4897 data reference (initial address accessed by the data reference).
4898 For example, for vector of type V8HI, the following code is generated:
4901 ap = (v8hi *)initial_address;
4903 if OFFSET is not supplied:
4904 initial_address = &a[init];
4905 if OFFSET is supplied:
4906 initial_address = &a[init] + OFFSET;
4907 if BYTE_OFFSET is supplied:
4908 initial_address = &a[init] + BYTE_OFFSET;
4910 Return the initial_address in INITIAL_ADDRESS.
4912 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4913 update the pointer in each iteration of the loop.
4915 Return the increment stmt that updates the pointer in PTR_INCR.
4917 3. Return the pointer. */
4920 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4921 tree aggr_type
, class loop
*at_loop
, tree offset
,
4922 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
4923 gimple
**ptr_incr
, bool only_init
,
4926 const char *base_name
;
4927 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4928 class loop
*loop
= NULL
;
4929 bool nested_in_vect_loop
= false;
4930 class loop
*containing_loop
= NULL
;
4934 gimple_seq new_stmt_list
= NULL
;
4938 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4939 struct data_reference
*dr
= dr_info
->dr
;
4941 gimple_stmt_iterator incr_gsi
;
4943 tree indx_before_incr
, indx_after_incr
;
4945 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
4947 gcc_assert (iv_step
!= NULL_TREE
4948 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4949 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4953 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4954 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
4955 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
4956 pe
= loop_preheader_edge (loop
);
4960 gcc_assert (bb_vinfo
);
4965 /* Create an expression for the first address accessed by this load
4967 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4969 if (dump_enabled_p ())
4971 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4972 dump_printf_loc (MSG_NOTE
, vect_location
,
4973 "create %s-pointer variable to type: %T",
4974 get_tree_code_name (TREE_CODE (aggr_type
)),
4976 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4977 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4978 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4979 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4980 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4981 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4983 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4984 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
4987 /* (1) Create the new aggregate-pointer variable.
4988 Vector and array types inherit the alias set of their component
4989 type by default so we need to use a ref-all pointer if the data
4990 reference does not conflict with the created aggregated data
4991 reference because it is not addressable. */
4992 bool need_ref_all
= false;
4993 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4994 get_alias_set (DR_REF (dr
))))
4995 need_ref_all
= true;
4996 /* Likewise for any of the data references in the stmt group. */
4997 else if (DR_GROUP_SIZE (stmt_info
) > 1)
4999 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5002 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
5003 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5004 get_alias_set (DR_REF (sdr
))))
5006 need_ref_all
= true;
5009 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
5013 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
5015 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
5018 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5019 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5020 def-use update cycles for the pointer: one relative to the outer-loop
5021 (LOOP), which is what steps (2) and (3) below do. The other is relative
5022 to the inner-loop (which is the inner-most loop containing the dataref),
5023 and this is done by step (4) below.
5025 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5026 inner-most loop, and so steps (2),(3) work the same, and step (4) is
5027 redundant. Steps (2),(3) create the following:
5030 LOOP: vp1 = phi(vp0,vp2)
5036 If there is an inner-loop nested in loop, then step (4) will also be
5037 applied, and an additional update in the inner-loop will be created:
5040 LOOP: vp1 = phi(vp0,vp2)
5042 inner: vp3 = phi(vp1,vp4)
5043 vp4 = vp3 + inner_step
5049 /* (2) Calculate the initial address of the aggregate-pointer, and set
5050 the aggregate-pointer to point to it before the loop. */
5052 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5054 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
5055 stmt_info
, &new_stmt_list
,
5061 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
5062 gcc_assert (!new_bb
);
5065 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
5068 *initial_address
= new_temp
;
5069 aggr_ptr_init
= new_temp
;
5071 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5072 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5073 inner-loop nested in LOOP (during outer-loop vectorization). */
5075 /* No update in loop is required. */
5076 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
5077 aptr
= aggr_ptr_init
;
5080 /* Accesses to invariant addresses should be handled specially
5082 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
5083 gcc_assert (!integer_zerop (step
));
5085 if (iv_step
== NULL_TREE
)
5087 /* The step of the aggregate pointer is the type size,
5088 negated for downward accesses. */
5089 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
5090 if (tree_int_cst_sgn (step
) == -1)
5091 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
5094 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5096 create_iv (aggr_ptr_init
,
5097 fold_convert (aggr_ptr_type
, iv_step
),
5098 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
5099 &indx_before_incr
, &indx_after_incr
);
5100 incr
= gsi_stmt (incr_gsi
);
5102 /* Copy the points-to information if it exists. */
5103 if (DR_PTR_INFO (dr
))
5105 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5106 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5111 aptr
= indx_before_incr
;
5114 if (!nested_in_vect_loop
|| only_init
)
5118 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5119 nested in LOOP, if exists. */
5121 gcc_assert (nested_in_vect_loop
);
5124 standard_iv_increment_position (containing_loop
, &incr_gsi
,
5126 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
5127 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
5129 incr
= gsi_stmt (incr_gsi
);
5131 /* Copy the points-to information if it exists. */
5132 if (DR_PTR_INFO (dr
))
5134 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
5135 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
5140 return indx_before_incr
;
5147 /* Function bump_vector_ptr
5149 Increment a pointer (to a vector type) by vector-size. If requested,
5150 i.e. if PTR-INCR is given, then also connect the new increment stmt
5151 to the existing def-use update-chain of the pointer, by modifying
5152 the PTR_INCR as illustrated below:
5154 The pointer def-use update-chain before this function:
5155 DATAREF_PTR = phi (p_0, p_2)
5157 PTR_INCR: p_2 = DATAREF_PTR + step
5159 The pointer def-use update-chain after this function:
5160 DATAREF_PTR = phi (p_0, p_2)
5162 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5164 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5167 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5169 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5170 the loop. The increment amount across iterations is expected
5172 BSI - location where the new update stmt is to be placed.
5173 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5174 BUMP - optional. The offset by which to bump the pointer. If not given,
5175 the offset is assumed to be vector_size.
5177 Output: Return NEW_DATAREF_PTR as illustrated above.
5182 bump_vector_ptr (vec_info
*vinfo
,
5183 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
5184 stmt_vec_info stmt_info
, tree bump
)
5186 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5187 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5188 tree update
= TYPE_SIZE_UNIT (vectype
);
5191 use_operand_p use_p
;
5192 tree new_dataref_ptr
;
5197 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5198 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5199 else if (is_gimple_min_invariant (dataref_ptr
))
5200 /* When possible avoid emitting a separate increment stmt that will
5201 force the addressed object addressable. */
5202 return build1 (ADDR_EXPR
, TREE_TYPE (dataref_ptr
),
5203 fold_build2 (MEM_REF
,
5204 TREE_TYPE (TREE_TYPE (dataref_ptr
)),
5206 fold_convert (ptr_type_node
, update
)));
5208 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
5209 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5210 dataref_ptr
, update
);
5211 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
5212 /* Fold the increment, avoiding excessive use-def chains of
5213 those, leading to compile-time issues for passes until the next
5214 forwprop pass which would do this as well. */
5215 gimple_stmt_iterator fold_gsi
= gsi_for_stmt (incr_stmt
);
5216 if (fold_stmt (&fold_gsi
, follow_all_ssa_edges
))
5218 incr_stmt
= gsi_stmt (fold_gsi
);
5219 update_stmt (incr_stmt
);
5222 /* Copy the points-to information if it exists. */
5223 if (DR_PTR_INFO (dr
))
5225 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5226 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5230 return new_dataref_ptr
;
5232 /* Update the vector-pointer's cross-iteration increment. */
5233 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5235 tree use
= USE_FROM_PTR (use_p
);
5237 if (use
== dataref_ptr
)
5238 SET_USE (use_p
, new_dataref_ptr
);
5240 gcc_assert (operand_equal_p (use
, update
, 0));
5243 return new_dataref_ptr
;
5247 /* Copy memory reference info such as base/clique from the SRC reference
5248 to the DEST MEM_REF.  DEST is tested for being a MEM_REF.  The base of
SRC is found by stripping its handled components and is tested for being
a MEM_REF or TARGET_MEM_REF; its MR_DEPENDENCE_CLIQUE and
MR_DEPENDENCE_BASE are the values copied to DEST. */
5251 vect_copy_ref_info (tree dest
, tree src
)
5253 if (TREE_CODE (dest
) != MEM_REF
)
5256 tree src_base
= src
;
5257 while (handled_component_p (src_base
))
5258 src_base
= TREE_OPERAND (src_base
, 0);
5259 if (TREE_CODE (src_base
) != MEM_REF
5260 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
5263 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
5264 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
5268 /* Function vect_create_destination_var.
5270 Create a new temporary for SCALAR_DEST, which is asserted to be an
SSA_NAME.  The temporary has type VECTYPE, or the type of SCALAR_DEST
when VECTYPE is null.  Its name is built from the SSA version of
SCALAR_DEST, either as "%s_%u" together with the name returned by
get_name, or as "_%u" on its own. */
5273 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5279 enum vect_var_kind kind
;
5282 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5286 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5288 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5290 name
= get_name (scalar_dest
);
5292 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5294 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5295 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5301 /* Function vect_grouped_store_supported.
5303 Returns TRUE if interleave high and interleave low permutations
5304 are supported for a group of COUNT stores of vectors of type VECTYPE,
and FALSE otherwise.  COUNT has to be 3 or a power of two; a group of 3
additionally needs a constant number of vector elements.  Support for
each permutation is queried with can_vec_perm_const_p. */
5307 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5309 machine_mode mode
= TYPE_MODE (vectype
);
5311 /* vect_permute_store_chain requires the group size to be equal to 3 or
5312 be a power of two. */
5313 if (count
!= 3 && exact_log2 (count
) == -1)
5315 if (dump_enabled_p ())
5316 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5317 "the size of the group of accesses"
5318 " is not a power of 2 or not eqaul to 3\n");
5322 /* Check that the permutation is supported. */
5323 if (VECTOR_MODE_P (mode
))
5328 unsigned int j0
= 0, j1
= 0, j2
= 0;
5332 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5336 "cannot handle groups of 3 stores for"
5337 " variable-length vectors\n");
5341 vec_perm_builder
sel (nelt
, nelt
, 1);
5342 sel
.quick_grow (nelt
);
5343 vec_perm_indices indices
;
5344 for (j
= 0; j
< 3; j
++)
5346 int nelt0
= ((3 - j
) * nelt
) % 3;
5347 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5348 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5349 for (i
= 0; i
< nelt
; i
++)
5351 if (3 * i
+ nelt0
< nelt
)
5352 sel
[3 * i
+ nelt0
] = j0
++;
5353 if (3 * i
+ nelt1
< nelt
)
5354 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5355 if (3 * i
+ nelt2
< nelt
)
5356 sel
[3 * i
+ nelt2
] = 0;
5358 indices
.new_vector (sel
, 2, nelt
);
5359 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5361 if (dump_enabled_p ())
5362 dump_printf (MSG_MISSED_OPTIMIZATION
,
5363 "permutation op not supported by target.\n");
5367 for (i
= 0; i
< nelt
; i
++)
5369 if (3 * i
+ nelt0
< nelt
)
5370 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5371 if (3 * i
+ nelt1
< nelt
)
5372 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5373 if (3 * i
+ nelt2
< nelt
)
5374 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5376 indices
.new_vector (sel
, 2, nelt
);
5377 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5379 if (dump_enabled_p ())
5380 dump_printf (MSG_MISSED_OPTIMIZATION
,
5381 "permutation op not supported by target.\n");
5389 /* If length is not equal to 3 then only power of 2 is supported. */
5390 gcc_assert (pow2p_hwi (count
));
5391 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5393 /* The encoding has 2 interleaved stepped patterns. */
5394 vec_perm_builder
sel (nelt
, 2, 3);
5396 for (i
= 0; i
< 3; i
++)
5399 sel
[i
* 2 + 1] = i
+ nelt
;
5401 vec_perm_indices
indices (sel
, 2, nelt
);
5402 if (can_vec_perm_const_p (mode
, mode
, indices
))
5404 for (i
= 0; i
< 6; i
++)
5405 sel
[i
] += exact_div (nelt
, 2);
5406 indices
.new_vector (sel
, 2, nelt
);
5407 if (can_vec_perm_const_p (mode
, mode
, indices
))
5413 if (dump_enabled_p ())
5414 dump_printf (MSG_MISSED_OPTIMIZATION
,
5415 "permutation op not supported by target.\n");
5420 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5421 type VECTYPE. MASKED_P says whether the masked form is needed.  The
query is forwarded to vect_lanes_optab_supported_p with either
vec_mask_store_lanes_optab or vec_store_lanes_optab. */
5424 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5428 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5429 vec_mask_store_lanes_optab
,
5432 return vect_lanes_optab_supported_p ("vec_store_lanes",
5433 vec_store_lanes_optab
,
5438 /* Function vect_permute_store_chain.
5440 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5441 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5442 the data correctly for the stores. Return the final references for stores
5445 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5446 The input is 4 vectors each containing 8 elements. We assign a number to
5447 each element, the input sequence is:
5449 1st vec: 0 1 2 3 4 5 6 7
5450 2nd vec: 8 9 10 11 12 13 14 15
5451 3rd vec: 16 17 18 19 20 21 22 23
5452 4th vec: 24 25 26 27 28 29 30 31
5454 The output sequence should be:
5456 1st vec: 0 8 16 24 1 9 17 25
5457 2nd vec: 2 10 18 26 3 11 19 27
5458 3rd vec: 4 12 20 28 5 13 21 29
5459 4th vec: 6 14 22 30 7 15 23 31
5461 i.e., we interleave the contents of the four vectors in their order.
5463 We use interleave_high/low instructions to create such output. The input of
5464 each interleave_high/low operation is two vectors:
5467 the even elements of the result vector are obtained left-to-right from the
5468 high/low elements of the first vector. The odd elements of the result are
5469 obtained left-to-right from the high/low elements of the second vector.
5470 The output of interleave_high will be: 0 4 1 5
5471 and of interleave_low: 2 6 3 7
5474 The permutation is done in log LENGTH stages. In each stage interleave_high
5475 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5476 where the first argument is taken from the first half of DR_CHAIN and the
5477 second argument from its second half.
5480 I1: interleave_high (1st vec, 3rd vec)
5481 I2: interleave_low (1st vec, 3rd vec)
5482 I3: interleave_high (2nd vec, 4th vec)
5483 I4: interleave_low (2nd vec, 4th vec)
5485 The output for the first stage is:
5487 I1: 0 16 1 17 2 18 3 19
5488 I2: 4 20 5 21 6 22 7 23
5489 I3: 8 24 9 25 10 26 11 27
5490 I4: 12 28 13 29 14 30 15 31
5492 The output of the second stage, i.e. the final result is:
5494 I1: 0 8 16 24 1 9 17 25
5495 I2: 2 10 18 26 3 11 19 27
5496 I3: 4 12 20 28 5 13 21 29
5497 I4: 6 14 22 30 7 15 23 31. */
5500 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> &dr_chain
,
5501 unsigned int length
,
5502 stmt_vec_info stmt_info
,
5503 gimple_stmt_iterator
*gsi
,
5504 vec
<tree
> *result_chain
)
5506 tree vect1
, vect2
, high
, low
;
5508 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5509 tree perm_mask_low
, perm_mask_high
;
5511 tree perm3_mask_low
, perm3_mask_high
;
5512 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5514 result_chain
->quick_grow (length
);
5515 memcpy (result_chain
->address (), dr_chain
.address (),
5516 length
* sizeof (tree
));
5520 /* vect_grouped_store_supported ensures that this is constant. */
5521 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5522 unsigned int j0
= 0, j1
= 0, j2
= 0;
5524 vec_perm_builder
sel (nelt
, nelt
, 1);
5525 sel
.quick_grow (nelt
);
5526 vec_perm_indices indices
;
5527 for (j
= 0; j
< 3; j
++)
5529 int nelt0
= ((3 - j
) * nelt
) % 3;
5530 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5531 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5533 for (i
= 0; i
< nelt
; i
++)
5535 if (3 * i
+ nelt0
< nelt
)
5536 sel
[3 * i
+ nelt0
] = j0
++;
5537 if (3 * i
+ nelt1
< nelt
)
5538 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5539 if (3 * i
+ nelt2
< nelt
)
5540 sel
[3 * i
+ nelt2
] = 0;
5542 indices
.new_vector (sel
, 2, nelt
);
5543 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5545 for (i
= 0; i
< nelt
; i
++)
5547 if (3 * i
+ nelt0
< nelt
)
5548 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5549 if (3 * i
+ nelt1
< nelt
)
5550 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5551 if (3 * i
+ nelt2
< nelt
)
5552 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5554 indices
.new_vector (sel
, 2, nelt
);
5555 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5557 vect1
= dr_chain
[0];
5558 vect2
= dr_chain
[1];
5560 /* Create interleaving stmt:
5561 low = VEC_PERM_EXPR <vect1, vect2,
5562 {j, nelt, *, j + 1, nelt + j + 1, *,
5563 j + 2, nelt + j + 2, *, ...}> */
5564 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5565 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5566 vect2
, perm3_mask_low
);
5567 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5570 vect2
= dr_chain
[2];
5571 /* Create interleaving stmt:
5572 high = VEC_PERM_EXPR <vect1, vect2,
5573 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5574 6, 7, nelt + j + 2, ...}> */
5575 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5576 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5577 vect2
, perm3_mask_high
);
5578 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5579 (*result_chain
)[j
] = data_ref
;
5584 /* If length is not equal to 3 then only power of 2 is supported. */
5585 gcc_assert (pow2p_hwi (length
));
5587 /* The encoding has 2 interleaved stepped patterns. */
5588 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5589 vec_perm_builder
sel (nelt
, 2, 3);
5591 for (i
= 0; i
< 3; i
++)
5594 sel
[i
* 2 + 1] = i
+ nelt
;
5596 vec_perm_indices
indices (sel
, 2, nelt
);
5597 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5599 for (i
= 0; i
< 6; i
++)
5600 sel
[i
] += exact_div (nelt
, 2);
5601 indices
.new_vector (sel
, 2, nelt
);
5602 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5604 for (i
= 0, n
= log_length
; i
< n
; i
++)
5606 for (j
= 0; j
< length
/2; j
++)
5608 vect1
= dr_chain
[j
];
5609 vect2
= dr_chain
[j
+length
/2];
5611 /* Create interleaving stmt:
5612 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5614 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5615 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5616 vect2
, perm_mask_high
);
5617 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5618 (*result_chain
)[2*j
] = high
;
5620 /* Create interleaving stmt:
5621 low = VEC_PERM_EXPR <vect1, vect2,
5622 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5624 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5625 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5626 vect2
, perm_mask_low
);
5627 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5628 (*result_chain
)[2*j
+1] = low
;
5630 memcpy (dr_chain
.address (), result_chain
->address (),
5631 length
* sizeof (tree
));
5636 /* Function vect_setup_realignment
5638 This function is called when vectorizing an unaligned load using
5639 the dr_explicit_realign[_optimized] scheme.
5640 This function generates the following code at the loop prolog:
5643 x msq_init = *(floor(p)); # prolog load
5644 realignment_token = call target_builtin;
5646 x msq = phi (msq_init, ---)
5648 The stmts marked with x are generated only for the case of
5649 dr_explicit_realign_optimized.
5651 The code above sets up a new (vector) pointer, pointing to the first
5652 location accessed by STMT_INFO, and a "floor-aligned" load using that
5653 pointer. It also generates code to compute the "realignment-token"
5654 (if the relevant target hook was defined), and creates a phi-node at the
5655 loop-header bb whose arguments are the result of the prolog-load (created
5656 by this function) and the result of a load that takes place in the loop
5657 (to be created by the caller to this function).
5659 For the case of dr_explicit_realign_optimized:
5660 The caller to this function uses the phi-result (msq) to create the
5661 realignment code inside the loop, and sets up the missing phi argument,
5664 msq = phi (msq_init, lsq)
5665 lsq = *(floor(p')); # load in loop
5666 result = realign_load (msq, lsq, realignment_token);
5668 For the case of dr_explicit_realign:
5670 msq = *(floor(p)); # load in loop
5672 lsq = *(floor(p')); # load in loop
5673 result = realign_load (msq, lsq, realignment_token);
5676 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5677 a memory location that may be unaligned.
5678 GSI - place where new code is to be inserted.
5679 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5683 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5684 target hook, if defined.
5685 Return value - the result of the loop-header phi node. */
5688 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5689 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
5690 enum dr_alignment_support alignment_support_scheme
,
5692 class loop
**at_loop
)
5694 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5695 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5696 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5697 struct data_reference
*dr
= dr_info
->dr
;
5698 class loop
*loop
= NULL
;
5700 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
5706 tree msq_init
= NULL_TREE
;
5709 tree msq
= NULL_TREE
;
5710 gimple_seq stmts
= NULL
;
5711 bool compute_in_loop
= false;
5712 bool nested_in_vect_loop
= false;
5713 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5714 class loop
*loop_for_initial_load
= NULL
;
5718 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5719 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5722 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5723 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5725 /* We need to generate three things:
5726 1. the misalignment computation
5727 2. the extra vector load (for the optimized realignment scheme).
5728 3. the phi node for the two vectors from which the realignment is
5729 done (for the optimized realignment scheme). */
5731 /* 1. Determine where to generate the misalignment computation.
5733 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5734 calculation will be generated by this function, outside the loop (in the
5735 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5736 caller, inside the loop.
5738 Background: If the misalignment remains fixed throughout the iterations of
5739 the loop, then both realignment schemes are applicable, and also the
5740 misalignment computation can be done outside LOOP. This is because we are
5741 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5742 are a multiple of VS (the Vector Size), and therefore the misalignment in
5743 different vectorized LOOP iterations is always the same.
5744 The problem arises only if the memory access is in an inner-loop nested
5745 inside LOOP, which is now being vectorized using outer-loop vectorization.
5746 This is the only case when the misalignment of the memory access may not
5747 remain fixed throughout the iterations of the inner-loop (as explained in
5748 detail in vect_supportable_dr_alignment). In this case, not only is the
5749 optimized realignment scheme not applicable, but also the misalignment
5750 computation (and generation of the realignment token that is passed to
5751 REALIGN_LOAD) have to be done inside the loop.
5753 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5754 or not, which in turn determines if the misalignment is computed inside
5755 the inner-loop, or outside LOOP. */
5757 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5759 compute_in_loop
= true;
5760 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5764 /* 2. Determine where to generate the extra vector load.
5766 For the optimized realignment scheme, instead of generating two vector
5767 loads in each iteration, we generate a single extra vector load in the
5768 preheader of the loop, and in each iteration reuse the result of the
5769 vector load from the previous iteration. In case the memory access is in
5770 an inner-loop nested inside LOOP, which is now being vectorized using
5771 outer-loop vectorization, we need to determine whether this initial vector
5772 load should be generated at the preheader of the inner-loop, or can be
5773 generated at the preheader of LOOP. If the memory access has no evolution
5774 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5775 to be generated inside LOOP (in the preheader of the inner-loop). */
5777 if (nested_in_vect_loop
)
5779 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5780 bool invariant_in_outerloop
=
5781 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5782 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5785 loop_for_initial_load
= loop
;
5787 *at_loop
= loop_for_initial_load
;
5789 tree vuse
= NULL_TREE
;
5790 if (loop_for_initial_load
)
5792 pe
= loop_preheader_edge (loop_for_initial_load
);
5793 if (gphi
*vphi
= get_virtual_phi (loop_for_initial_load
->header
))
5794 vuse
= PHI_ARG_DEF_FROM_EDGE (vphi
, pe
);
5797 vuse
= gimple_vuse (gsi_stmt (*gsi
));
5799 /* 3. For the case of the optimized realignment, create the first vector
5800 load at the loop preheader. */
5802 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5804 /* Create msq_init = *(floor(p1)) in the loop preheader */
5807 gcc_assert (!compute_in_loop
);
5808 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5809 ptr
= vect_create_data_ref_ptr (vinfo
, stmt_info
, vectype
,
5810 loop_for_initial_load
, NULL_TREE
,
5811 &init_addr
, NULL
, &inc
, true);
5812 if (TREE_CODE (ptr
) == SSA_NAME
)
5813 new_temp
= copy_ssa_name (ptr
);
5815 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5816 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
5817 tree type
= TREE_TYPE (ptr
);
5818 new_stmt
= gimple_build_assign
5819 (new_temp
, BIT_AND_EXPR
, ptr
,
5820 fold_build2 (MINUS_EXPR
, type
,
5821 build_int_cst (type
, 0),
5822 build_int_cst (type
, align
)));
5823 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5824 gcc_assert (!new_bb
);
5826 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5827 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5828 vect_copy_ref_info (data_ref
, DR_REF (dr
));
5829 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5830 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5831 gimple_assign_set_lhs (new_stmt
, new_temp
);
5832 gimple_set_vuse (new_stmt
, vuse
);
5835 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5836 gcc_assert (!new_bb
);
5839 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5841 msq_init
= gimple_assign_lhs (new_stmt
);
5844 /* 4. Create realignment token using a target builtin, if available.
5845 It is done either inside the containing loop, or before LOOP (as
5846 determined above). */
5848 if (targetm
.vectorize
.builtin_mask_for_load
)
5853 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5856 /* Generate the INIT_ADDR computation outside LOOP. */
5857 init_addr
= vect_create_addr_base_for_vector_ref (vinfo
,
5862 pe
= loop_preheader_edge (loop
);
5863 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5864 gcc_assert (!new_bb
);
5867 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5870 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5871 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5873 vect_create_destination_var (scalar_dest
,
5874 gimple_call_return_type (new_stmt
));
5875 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5876 gimple_call_set_lhs (new_stmt
, new_temp
);
5878 if (compute_in_loop
)
5879 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5882 /* Generate the misalignment computation outside LOOP. */
5883 pe
= loop_preheader_edge (loop
);
5884 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5885 gcc_assert (!new_bb
);
5888 *realignment_token
= gimple_call_lhs (new_stmt
);
5890 /* The result of the CALL_EXPR to this builtin is determined from
5891 the value of the parameter and no global variables are touched
5892 which makes the builtin a "const" function. Requiring the
5893 builtin to have the "const" attribute makes it unnecessary
5894 to call mark_call_clobbered. */
5895 gcc_assert (TREE_READONLY (builtin_decl
));
5898 if (alignment_support_scheme
== dr_explicit_realign
)
5901 gcc_assert (!compute_in_loop
);
5902 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5905 /* 5. Create msq = phi <msq_init, lsq> in loop */
5907 pe
= loop_preheader_edge (containing_loop
);
5908 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5909 msq
= make_ssa_name (vec_dest
);
5910 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5911 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5917 /* Function vect_grouped_load_supported.
5919 COUNT is the size of the load group (the number of statements plus the
5920 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5921 only one statement, with a gap of COUNT - 1.
5923 Returns true if a suitable permute exists.  COUNT has to be 3 or a
power of two; a group of 3 additionally needs a constant number of
vector elements.  Single-element interleaving is checked against the
number of elements of VECTYPE.  Support for each permute is queried
with can_vec_perm_const_p. */
5926 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5927 unsigned HOST_WIDE_INT count
)
5929 machine_mode mode
= TYPE_MODE (vectype
);
5931 /* If this is single-element interleaving with an element distance
5932 that leaves unused vector loads around punt - we at least create
5933 very sub-optimal code in that case (and blow up memory,
5935 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
5937 if (dump_enabled_p ())
5938 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5939 "single-element interleaving not supported "
5940 "for not adjacent vector loads\n");
5944 /* vect_permute_load_chain requires the group size to be equal to 3 or
5945 be a power of two. */
5946 if (count
!= 3 && exact_log2 (count
) == -1)
5948 if (dump_enabled_p ())
5949 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5950 "the size of the group of accesses"
5951 " is not a power of 2 or not equal to 3\n");
5955 /* Check that the permutation is supported. */
5956 if (VECTOR_MODE_P (mode
))
5962 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5964 if (dump_enabled_p ())
5965 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5966 "cannot handle groups of 3 loads for"
5967 " variable-length vectors\n");
5971 vec_perm_builder
sel (nelt
, nelt
, 1);
5972 sel
.quick_grow (nelt
);
5973 vec_perm_indices indices
;
5975 for (k
= 0; k
< 3; k
++)
5977 for (i
= 0; i
< nelt
; i
++)
5978 if (3 * i
+ k
< 2 * nelt
)
5982 indices
.new_vector (sel
, 2, nelt
);
5983 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5985 if (dump_enabled_p ())
5986 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5987 "shuffle of 3 loads is not supported by"
5991 for (i
= 0, j
= 0; i
< nelt
; i
++)
5992 if (3 * i
+ k
< 2 * nelt
)
5995 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5996 indices
.new_vector (sel
, 2, nelt
);
5997 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5999 if (dump_enabled_p ())
6000 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6001 "shuffle of 3 loads is not supported by"
6010 /* If length is not equal to 3 then only power of 2 is supported. */
6011 gcc_assert (pow2p_hwi (count
));
6012 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
6014 /* The encoding has a single stepped pattern. */
6015 vec_perm_builder
sel (nelt
, 1, 3);
6017 for (i
= 0; i
< 3; i
++)
6019 vec_perm_indices
indices (sel
, 2, nelt
);
6020 if (can_vec_perm_const_p (mode
, mode
, indices
))
6022 for (i
= 0; i
< 3; i
++)
6024 indices
.new_vector (sel
, 2, nelt
);
6025 if (can_vec_perm_const_p (mode
, mode
, indices
))
6031 if (dump_enabled_p ())
6032 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6033 "extract even/odd not supported by target\n");
6037 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6038 type VECTYPE. MASKED_P says whether the masked form is needed.  The
query is forwarded to vect_lanes_optab_supported_p with either
vec_mask_load_lanes_optab or vec_load_lanes_optab. */
6041 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
6045 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6046 vec_mask_load_lanes_optab
,
6049 return vect_lanes_optab_supported_p ("vec_load_lanes",
6050 vec_load_lanes_optab
,
6054 /* Function vect_permute_load_chain.
6056 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6057 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6058 the input data correctly. Return the final references for loads in
6061 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6062 The input is 4 vectors each containing 8 elements. We assign a number to each
6063 element, the input sequence is:
6065 1st vec: 0 1 2 3 4 5 6 7
6066 2nd vec: 8 9 10 11 12 13 14 15
6067 3rd vec: 16 17 18 19 20 21 22 23
6068 4th vec: 24 25 26 27 28 29 30 31
6070 The output sequence should be:
6072 1st vec: 0 4 8 12 16 20 24 28
6073 2nd vec: 1 5 9 13 17 21 25 29
6074 3rd vec: 2 6 10 14 18 22 26 30
6075 4th vec: 3 7 11 15 19 23 27 31
6077 i.e., the first output vector should contain the first elements of each
6078 interleaving group, etc.
6080 We use extract_even/odd instructions to create such output. The input of
6081 each extract_even/odd operation is two vectors
6085 and the output is the vector of extracted even/odd elements. The output of
6086 extract_even will be: 0 2 4 6
6087 and of extract_odd: 1 3 5 7
6090 The permutation is done in log LENGTH stages. In each stage extract_even
6091 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6092 their order. In our example,
6094 E1: extract_even (1st vec, 2nd vec)
6095 E2: extract_odd (1st vec, 2nd vec)
6096 E3: extract_even (3rd vec, 4th vec)
6097 E4: extract_odd (3rd vec, 4th vec)
6099 The output for the first stage will be:
6101 E1: 0 2 4 6 8 10 12 14
6102 E2: 1 3 5 7 9 11 13 15
6103 E3: 16 18 20 22 24 26 28 30
6104 E4: 17 19 21 23 25 27 29 31
6106 In order to proceed and create the correct sequence for the next stage (or
6107 for the correct output, if the second stage is the last one, as in our
6108 example), we first put the output of extract_even operation and then the
6109 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6110 The input for the second stage is:
6112 1st vec (E1): 0 2 4 6 8 10 12 14
6113 2nd vec (E3): 16 18 20 22 24 26 28 30
6114 3rd vec (E2): 1 3 5 7 9 11 13 15
6115 4th vec (E4): 17 19 21 23 25 27 29 31
6117 The output of the second stage:
6119 E1: 0 4 8 12 16 20 24 28
6120 E2: 2 6 10 14 18 22 26 30
6121 E3: 1 5 9 13 17 21 25 29
6122 E4: 3 7 11 15 19 23 27 31
6124 And RESULT_CHAIN after reordering:
6126 1st vec (E1): 0 4 8 12 16 20 24 28
6127 2nd vec (E3): 1 5 9 13 17 21 25 29
6128 3rd vec (E2): 2 6 10 14 18 22 26 30
6129 4th vec (E4): 3 7 11 15 19 23 27 31. */
6132 vect_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6133 unsigned int length
,
6134 stmt_vec_info stmt_info
,
6135 gimple_stmt_iterator
*gsi
,
6136 vec
<tree
> *result_chain
)
6138 tree data_ref
, first_vect
, second_vect
;
6139 tree perm_mask_even
, perm_mask_odd
;
6140 tree perm3_mask_low
, perm3_mask_high
;
6142 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6143 unsigned int i
, j
, log_length
= exact_log2 (length
);
6145 result_chain
->quick_grow (length
);
6146 memcpy (result_chain
->address (), dr_chain
.address (),
6147 length
* sizeof (tree
));
6151 /* vect_grouped_load_supported ensures that this is constant. */
6152 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6155 vec_perm_builder
sel (nelt
, nelt
, 1);
6156 sel
.quick_grow (nelt
);
6157 vec_perm_indices indices
;
6158 for (k
= 0; k
< 3; k
++)
6160 for (i
= 0; i
< nelt
; i
++)
6161 if (3 * i
+ k
< 2 * nelt
)
6165 indices
.new_vector (sel
, 2, nelt
);
6166 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
6168 for (i
= 0, j
= 0; i
< nelt
; i
++)
6169 if (3 * i
+ k
< 2 * nelt
)
6172 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6173 indices
.new_vector (sel
, 2, nelt
);
6174 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6176 first_vect
= dr_chain
[0];
6177 second_vect
= dr_chain
[1];
6179 /* Create interleaving stmt (low part of):
6180 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6182 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
6183 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6184 second_vect
, perm3_mask_low
);
6185 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6187 /* Create interleaving stmt (high part of):
6188 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6190 first_vect
= data_ref
;
6191 second_vect
= dr_chain
[2];
6192 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6193 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6194 second_vect
, perm3_mask_high
);
6195 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6196 (*result_chain
)[k
] = data_ref
;
6201 /* If length is not equal to 3 then only power of 2 is supported. */
6202 gcc_assert (pow2p_hwi (length
));
6204 /* The encoding has a single stepped pattern. */
6205 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6206 vec_perm_builder
sel (nelt
, 1, 3);
6208 for (i
= 0; i
< 3; ++i
)
6210 vec_perm_indices
indices (sel
, 2, nelt
);
6211 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
6213 for (i
= 0; i
< 3; ++i
)
6215 indices
.new_vector (sel
, 2, nelt
);
6216 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
6218 for (i
= 0; i
< log_length
; i
++)
6220 for (j
= 0; j
< length
; j
+= 2)
6222 first_vect
= dr_chain
[j
];
6223 second_vect
= dr_chain
[j
+1];
6225 /* data_ref = permute_even (first_vect, second_vect); */
6226 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
6227 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6228 first_vect
, second_vect
,
6230 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6231 (*result_chain
)[j
/2] = data_ref
;
6233 /* data_ref = permute_odd (first_vect, second_vect); */
6234 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
6235 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6236 first_vect
, second_vect
,
6238 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6239 (*result_chain
)[j
/2+length
/2] = data_ref
;
6241 memcpy (dr_chain
.address (), result_chain
->address (),
6242 length
* sizeof (tree
));
6247 /* Function vect_shift_permute_load_chain.
6249 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6250 sequence of stmts to reorder the input data accordingly.
6251 Return the final references for loads in RESULT_CHAIN.
6252 Return true if succeeded, false otherwise.
6254 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6255 The input is 3 vectors each containing 8 elements. We assign a
6256 number to each element, the input sequence is:
6258 1st vec: 0 1 2 3 4 5 6 7
6259 2nd vec: 8 9 10 11 12 13 14 15
6260 3rd vec: 16 17 18 19 20 21 22 23
6262 The output sequence should be:
6264 1st vec: 0 3 6 9 12 15 18 21
6265 2nd vec: 1 4 7 10 13 16 19 22
6266 3rd vec: 2 5 8 11 14 17 20 23
6268 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6270 First we shuffle all 3 vectors to get correct elements order:
6272 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6273 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6274 3rd vec: (16 19 22) (17 20 23) (18 21)
6276 Next we unite and shift vector 3 times:
6279 shift right by 6 the concatenation of:
6280 "1st vec" and "2nd vec"
6281 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6282 "2nd vec" and "3rd vec"
6283 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6284 "3rd vec" and "1st vec"
6285 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6288 So that now new vectors are:
6290 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6291 2nd vec: (10 13) (16 19 22) (17 20 23)
6292 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6295 shift right by 5 the concatenation of:
6296 "1st vec" and "3rd vec"
6297 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6298 "2nd vec" and "1st vec"
6299 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6300 "3rd vec" and "2nd vec"
6301 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6304 So that now new vectors are:
6306 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6307 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6308 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6311 shift right by 5 the concatenation of:
6312 "1st vec" and "1st vec"
6313 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6314 shift right by 3 the concatenation of:
6315 "2nd vec" and "2nd vec"
6316 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6319 So that now all vectors are READY:
6320 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6321 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6322 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6324 This algorithm is faster than one in vect_permute_load_chain if:
6325 1. "shift of a concatination" is faster than general permutation.
6327 2. The TARGET machine can't execute vector instructions in parallel.
6328 This is because each step of the algorithm depends on previous.
6329 The algorithm in vect_permute_load_chain is much more parallel.
6331 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6335 vect_shift_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6336 unsigned int length
,
6337 stmt_vec_info stmt_info
,
6338 gimple_stmt_iterator
*gsi
,
6339 vec
<tree
> *result_chain
)
6341 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6342 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6343 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6346 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6347 machine_mode vmode
= TYPE_MODE (vectype
);
6349 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6351 unsigned HOST_WIDE_INT nelt
, vf
;
6352 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6353 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6354 /* Not supported for variable-length vectors. */
6357 vec_perm_builder
sel (nelt
, nelt
, 1);
6358 sel
.quick_grow (nelt
);
6360 result_chain
->quick_grow (length
);
6361 memcpy (result_chain
->address (), dr_chain
.address (),
6362 length
* sizeof (tree
));
6364 if (pow2p_hwi (length
) && vf
> 4)
6366 unsigned int j
, log_length
= exact_log2 (length
);
6367 for (i
= 0; i
< nelt
/ 2; ++i
)
6369 for (i
= 0; i
< nelt
/ 2; ++i
)
6370 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6371 vec_perm_indices
indices (sel
, 2, nelt
);
6372 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6374 if (dump_enabled_p ())
6375 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6376 "shuffle of 2 fields structure is not \
6377 supported by target\n");
6380 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6382 for (i
= 0; i
< nelt
/ 2; ++i
)
6384 for (i
= 0; i
< nelt
/ 2; ++i
)
6385 sel
[nelt
/ 2 + i
] = i
* 2;
6386 indices
.new_vector (sel
, 2, nelt
);
6387 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6389 if (dump_enabled_p ())
6390 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6391 "shuffle of 2 fields structure is not \
6392 supported by target\n");
6395 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6397 /* Generating permutation constant to shift all elements.
6398 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6399 for (i
= 0; i
< nelt
; i
++)
6400 sel
[i
] = nelt
/ 2 + i
;
6401 indices
.new_vector (sel
, 2, nelt
);
6402 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6404 if (dump_enabled_p ())
6405 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6406 "shift permutation is not supported by target\n");
6409 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6411 /* Generating permutation constant to select vector from 2.
6412 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6413 for (i
= 0; i
< nelt
/ 2; i
++)
6415 for (i
= nelt
/ 2; i
< nelt
; i
++)
6417 indices
.new_vector (sel
, 2, nelt
);
6418 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6420 if (dump_enabled_p ())
6421 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6422 "select is not supported by target\n");
6425 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6427 for (i
= 0; i
< log_length
; i
++)
6429 for (j
= 0; j
< length
; j
+= 2)
6431 first_vect
= dr_chain
[j
];
6432 second_vect
= dr_chain
[j
+ 1];
6434 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6435 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6436 first_vect
, first_vect
,
6438 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6441 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6442 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6443 second_vect
, second_vect
,
6445 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6448 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6449 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6450 vect
[0], vect
[1], shift1_mask
);
6451 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6452 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6454 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6455 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6456 vect
[0], vect
[1], select_mask
);
6457 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6458 (*result_chain
)[j
/2] = data_ref
;
6460 memcpy (dr_chain
.address (), result_chain
->address (),
6461 length
* sizeof (tree
));
6465 if (length
== 3 && vf
> 2)
6467 unsigned int k
= 0, l
= 0;
6469 /* Generating permutation constant to get all elements in rigth order.
6470 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6471 for (i
= 0; i
< nelt
; i
++)
6473 if (3 * k
+ (l
% 3) >= nelt
)
6476 l
+= (3 - (nelt
% 3));
6478 sel
[i
] = 3 * k
+ (l
% 3);
6481 vec_perm_indices
indices (sel
, 2, nelt
);
6482 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6484 if (dump_enabled_p ())
6485 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6486 "shuffle of 3 fields structure is not \
6487 supported by target\n");
6490 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6492 /* Generating permutation constant to shift all elements.
6493 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6494 for (i
= 0; i
< nelt
; i
++)
6495 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6496 indices
.new_vector (sel
, 2, nelt
);
6497 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6499 if (dump_enabled_p ())
6500 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6501 "shift permutation is not supported by target\n");
6504 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6506 /* Generating permutation constant to shift all elements.
6507 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6508 for (i
= 0; i
< nelt
; i
++)
6509 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6510 indices
.new_vector (sel
, 2, nelt
);
6511 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6513 if (dump_enabled_p ())
6514 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6515 "shift permutation is not supported by target\n");
6518 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6520 /* Generating permutation constant to shift all elements.
6521 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6522 for (i
= 0; i
< nelt
; i
++)
6523 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6524 indices
.new_vector (sel
, 2, nelt
);
6525 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6529 "shift permutation is not supported by target\n");
6532 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6534 /* Generating permutation constant to shift all elements.
6535 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6536 for (i
= 0; i
< nelt
; i
++)
6537 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6538 indices
.new_vector (sel
, 2, nelt
);
6539 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6541 if (dump_enabled_p ())
6542 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6543 "shift permutation is not supported by target\n");
6546 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6548 for (k
= 0; k
< 3; k
++)
6550 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6551 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6552 dr_chain
[k
], dr_chain
[k
],
6554 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6558 for (k
= 0; k
< 3; k
++)
6560 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6561 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6562 vect
[k
% 3], vect
[(k
+ 1) % 3],
6564 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6565 vect_shift
[k
] = data_ref
;
6568 for (k
= 0; k
< 3; k
++)
6570 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6571 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6572 vect_shift
[(4 - k
) % 3],
6573 vect_shift
[(3 - k
) % 3],
6575 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6579 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6581 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6582 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6583 vect
[0], shift3_mask
);
6584 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6585 (*result_chain
)[nelt
% 3] = data_ref
;
6587 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6588 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6589 vect
[1], shift4_mask
);
6590 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6591 (*result_chain
)[0] = data_ref
;
6597 /* Function vect_transform_grouped_load.
6599 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6600 to perform their permutation and ascribe the result vectorized statements to
6601 the scalar statements.
6605 vect_transform_grouped_load (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6607 int size
, gimple_stmt_iterator
*gsi
)
6610 vec
<tree
> result_chain
= vNULL
;
6612 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6613 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6614 vectors, that are ready for vector computation. */
6615 result_chain
.create (size
);
6617 /* If reassociation width for vector type is 2 or greater target machine can
6618 execute 2 or more vector instructions in parallel. Otherwise try to
6619 get chain for loads group using vect_shift_permute_load_chain. */
6620 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
6621 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6623 || !vect_shift_permute_load_chain (vinfo
, dr_chain
, size
, stmt_info
,
6624 gsi
, &result_chain
))
6625 vect_permute_load_chain (vinfo
, dr_chain
,
6626 size
, stmt_info
, gsi
, &result_chain
);
6627 vect_record_grouped_load_vectors (vinfo
, stmt_info
, result_chain
);
6628 result_chain
.release ();
6631 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6632 generated as part of the vectorization of STMT_INFO. Assign the statement
6633 for each vector to the associated scalar statement. */
6636 vect_record_grouped_load_vectors (vec_info
*, stmt_vec_info stmt_info
,
6637 vec
<tree
> result_chain
)
6639 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6640 unsigned int i
, gap_count
;
6643 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6644 Since we scan the chain starting from it's first node, their order
6645 corresponds the order of data-refs in RESULT_CHAIN. */
6646 stmt_vec_info next_stmt_info
= first_stmt_info
;
6648 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
6650 if (!next_stmt_info
)
6653 /* Skip the gaps. Loads created for the gaps will be removed by dead
6654 code elimination pass later. No need to check for the first stmt in
6655 the group, since it always exists.
6656 DR_GROUP_GAP is the number of steps in elements from the previous
6657 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6658 correspond to the gaps. */
6659 if (next_stmt_info
!= first_stmt_info
6660 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
6666 /* ??? The following needs cleanup after the removal of
6667 DR_GROUP_SAME_DR_STMT. */
6670 gimple
*new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
6671 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6672 copies, and we put the new vector statement last. */
6673 STMT_VINFO_VEC_STMTS (next_stmt_info
).safe_push (new_stmt
);
6675 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
6681 /* Function vect_force_dr_alignment_p.
6683 Returns whether the alignment of a DECL can be forced to be aligned
6684 on ALIGNMENT bit boundary. */
6687 vect_can_force_dr_alignment_p (const_tree decl
, poly_uint64 alignment
)
6692 if (decl_in_symtab_p (decl
)
6693 && !symtab_node::get (decl
)->can_increase_alignment_p ())
6696 if (TREE_STATIC (decl
))
6697 return (known_le (alignment
,
6698 (unsigned HOST_WIDE_INT
) MAX_OFILE_ALIGNMENT
));
6700 return (known_le (alignment
, (unsigned HOST_WIDE_INT
) MAX_STACK_ALIGNMENT
));
6703 /* Return whether the data reference DR_INFO is supported with respect to its
6705 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6706 it is aligned, i.e., check if it is possible to vectorize it with different
6709 enum dr_alignment_support
6710 vect_supportable_dr_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
6711 tree vectype
, int misalignment
)
6713 data_reference
*dr
= dr_info
->dr
;
6714 stmt_vec_info stmt_info
= dr_info
->stmt
;
6715 machine_mode mode
= TYPE_MODE (vectype
);
6716 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6717 class loop
*vect_loop
= NULL
;
6718 bool nested_in_vect_loop
= false;
6720 if (misalignment
== 0)
6723 /* For now assume all conditional loads/stores support unaligned
6724 access without any special code. */
6725 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
6726 if (gimple_call_internal_p (stmt
)
6727 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6728 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6729 return dr_unaligned_supported
;
6733 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6734 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
6737 /* Possibly unaligned access. */
6739 /* We can choose between using the implicit realignment scheme (generating
6740 a misaligned_move stmt) and the explicit realignment scheme (generating
6741 aligned loads with a REALIGN_LOAD). There are two variants to the
6742 explicit realignment scheme: optimized, and unoptimized.
6743 We can optimize the realignment only if the step between consecutive
6744 vector loads is equal to the vector size. Since the vector memory
6745 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6746 is guaranteed that the misalignment amount remains the same throughout the
6747 execution of the vectorized loop. Therefore, we can create the
6748 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6749 at the loop preheader.
6751 However, in the case of outer-loop vectorization, when vectorizing a
6752 memory access in the inner-loop nested within the LOOP that is now being
6753 vectorized, while it is guaranteed that the misalignment of the
6754 vectorized memory access will remain the same in different outer-loop
6755 iterations, it is *not* guaranteed that is will remain the same throughout
6756 the execution of the inner-loop. This is because the inner-loop advances
6757 with the original scalar step (and not in steps of VS). If the inner-loop
6758 step happens to be a multiple of VS, then the misalignment remains fixed
6759 and we can use the optimized realignment scheme. For example:
6765 When vectorizing the i-loop in the above example, the step between
6766 consecutive vector loads is 1, and so the misalignment does not remain
6767 fixed across the execution of the inner-loop, and the realignment cannot
6768 be optimized (as illustrated in the following pseudo vectorized loop):
6770 for (i=0; i<N; i+=4)
6771 for (j=0; j<M; j++){
6772 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6773 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6774 // (assuming that we start from an aligned address).
6777 We therefore have to use the unoptimized realignment scheme:
6779 for (i=0; i<N; i+=4)
6780 for (j=k; j<M; j+=4)
6781 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6782 // that the misalignment of the initial address is
6785 The loop can then be vectorized as follows:
6787 for (k=0; k<4; k++){
6788 rt = get_realignment_token (&vp[k]);
6789 for (i=0; i<N; i+=4){
6791 for (j=k; j<M; j+=4){
6793 va = REALIGN_LOAD <v1,v2,rt>;
6800 if (DR_IS_READ (dr
))
6802 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6803 && (!targetm
.vectorize
.builtin_mask_for_load
6804 || targetm
.vectorize
.builtin_mask_for_load ()))
6806 /* If we are doing SLP then the accesses need not have the
6807 same alignment, instead it depends on the SLP group size. */
6809 && STMT_SLP_TYPE (stmt_info
)
6810 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6812 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
6813 TYPE_VECTOR_SUBPARTS (vectype
)))
6815 else if (!loop_vinfo
6816 || (nested_in_vect_loop
6817 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
6818 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6819 return dr_explicit_realign
;
6821 return dr_explicit_realign_optimized
;
6825 bool is_packed
= false;
6826 tree type
= TREE_TYPE (DR_REF (dr
));
6827 if (misalignment
== DR_MISALIGNMENT_UNKNOWN
)
6828 is_packed
= not_size_aligned (DR_REF (dr
));
6829 if (targetm
.vectorize
.support_vector_misalignment (mode
, type
, misalignment
,
6831 return dr_unaligned_supported
;
6834 return dr_unaligned_unsupported
;