/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s[%wu]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
                               HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt_info->stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign
      && (gimple_assign_cast_p (assign)
          || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
            /* gimple_expr_type already picked the type of the loaded
               or stored data.  */
            i = ~0U;
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static opt_result
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (vect_location,
                                   "will not create alias checks, as"
                                   " --param vect-max-version-for-alias-checks"
                                   " == 0\n");

  opt_result res
    = runtime_alias_check_p (ddr, loop,
                             optimize_loop_nest_for_speed_p (loop));
  if (!res)
    return res;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return opt_result::success ();
}
/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "need run-time check that %T is nonzero\n",
                     value);
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}
/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     SLP group are emitted at the position of the last scalar load and
     all loads in an interleaving group are emitted at the position
     of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.
     We have not yet decided between SLP and interleaving so we have
     to conservatively assume both.  */
  stmt_vec_info il_a;
  stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (last_a)
    {
      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
           s = DR_GROUP_NEXT_ELEMENT (s))
        last_a = get_later_stmt (last_a, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
        {
          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
               s = DR_GROUP_NEXT_ELEMENT (s))
            if (get_later_stmt (il_a, s) == il_a)
              il_a = s;
        }
      else
        il_a = last_a;
    }
  else
    last_a = il_a = stmtinfo_a;
  stmt_vec_info il_b;
  stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (last_b)
    {
      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
           s = DR_GROUP_NEXT_ELEMENT (s))
        last_b = get_later_stmt (last_b, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
        {
          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
               s = DR_GROUP_NEXT_ELEMENT (s))
            if (get_later_stmt (il_b, s) == il_b)
              il_b = s;
        }
      else
        il_b = last_b;
    }
  else
    last_b = il_b = stmtinfo_b;
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return ((get_later_stmt (last_a, last_b) == last_a) == a_after_b
          && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
          && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
          && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
}
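
/* For instance, given a group of loads from a[i] and a[i+1] that
   interleaves with a store to a[i+1], SLP would emit both loads at the
   position of the last scalar load while interleaving would emit them
   at the first one; the checks above verify that either choice keeps
   the loads on the same side of the store as in the scalar code.  */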
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a problem.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
          dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
          return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
                  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
                  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
        }
    }
  return true;
}
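
/* For example, a loop annotated with '#pragma omp simd safelen(8)' has
   loop->safelen == 8, so a conservative dependence distance of up to 8
   is ignored here and the vectorization factor is merely capped at 8.  */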
/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "can't determine dependence between %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "bad dist vector for %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "bad dist vector for %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance  = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance == 0 between %T and %T\n",
                             DR_REF (dra), DR_REF (drb));

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

opt_result
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
        .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
                 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We need read-read dependences to compute
         STMT_VINFO_SAME_ALIGN_REFS.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                          &LOOP_VINFO_DDRS (loop_vinfo),
                                          LOOP_VINFO_LOOP_NEST (loop_vinfo),
                                          true);
      gcc_assert (res);
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      {
        opt_result res
          = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
        if (!res)
          return res;
      }

  return opt_result::success ();
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
   according to the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
                                      struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
          == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
    }
  else if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "determined dependence between %T and %T\n",
                     DR_REF (dra), DR_REF (drb));

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (vec_info *vinfo,
                                   slp_instance instance, slp_tree node,
                                   vec<stmt_vec_info> stores,
                                   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access_info == last_access_info)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
           gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to resort to the alias oracle.  */
          stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
          data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
          if (!dr_b)
            {
              /* We are moving a store or sinking a load - this means
                 we cannot use TBAA for disambiguation.  */
              if (!ref_initialized_p)
                ao_ref_init (&ref, DR_REF (dr_a));
              if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
                  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
                return false;
              continue;
            }

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt_info != last_store_info)
                continue;
              unsigned i;
              stmt_vec_info store_info;
              FOR_EACH_VEC_ELT (stores, i, store_info)
                {
                  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent
                    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance and make sure
   there do not exist any data dependences between them that would
   prevent vectorization.  Return TRUE if there are none.  */

bool
vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (vinfo, instance, store,
                                               vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (vinfo, instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store_info))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  return res;
}
/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */

static void
vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "recording new base alignment for %T\n"
                         "  alignment:    %d\n"
                         "  misalignment: %d\n"
                         "  based on:     %G",
                         drb->base_address,
                         drb->base_alignment,
                         drb->base_misalignment,
                         stmt_info->stmt);
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
          && STMT_VINFO_VECTORIZABLE (stmt_info)
          && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
        {
          vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));

          /* If DR is nested in the loop that is being vectorized, we can also
             record the alignment of the base wrt the outer loop.  */
          if (loop && nested_in_vect_loop_p (loop, stmt_info))
            vect_record_base_alignment
              (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
        }
    }
}
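
/* For instance, if one unconditional reference guarantees that base 'p'
   is 32-byte aligned and another only guarantees 16 bytes, the pooled
   entry for 'p' keeps the 32-byte guarantee and every other reference
   with the same base can use it when computing its misalignment.  */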
/* Return the target alignment for the vectorized form of DR_INFO.  */

static poly_uint64
vect_calculate_target_alignment (dr_vec_info *dr_info)
{
  tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO.

   Output:
   1. DR_MISALIGNMENT (DR_INFO) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
  bool step_preserves_misalignment_p;

  poly_uint64 vector_alignment
    = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
  DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;

  /* If the main loop has peeled for alignment we have no way of knowing
     whether the data accesses in the epilogues are aligned.  We can't at
     compile time answer the question whether we have entered the main loop or
     not.  Fixes PR 92351.  */
  if (loop_vinfo)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      if (orig_loop_vinfo
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
        return;
    }

  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown alignment for access: %T\n", ref);
      return;
    }

  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
          || !vect_can_force_dr_alignment_p (base,
                                             vect_align_c * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "can't force alignment of ref: %T\n", ref);
          return;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "force alignment of %T\n", ref);

      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     DR_MISALIGNMENT (dr_info), ref);
}
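
/* Worked example (illustrative): with a pooled base alignment of 16,
   base_misalignment 0 and DR_INIT 4 (e.g. the access 'a[i + 1]' on int),
   the misalignment wrt a 16-byte target alignment is (0 + 4) % 16 = 4.
   For a backward DR with step -4 and a V4SI vectype, 3 * -4 = -12 is
   added first, since the vector access starts three elements before the
   address recorded in the DR.  */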
/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment
     is known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */

static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
                                   dr_vec_info *dr_peel_info, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  stmt_vec_info peel_stmt_info = dr_peel_info->stmt;

  /* It can be assumed that if dr_info has the same alignment as dr_peel,
     it is aligned in the vector loop.  */
  same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr_info->dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr_info)
                  || !known_alignment_for_access_p (dr_peel_info)
                  || (DR_MISALIGNMENT (dr_info)
                      == DR_MISALIGNMENT (dr_peel_info)));
      SET_DR_MISALIGNMENT (dr_info, 0);
      return;
    }

  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info)
      && known_alignment_for_access_p (dr_peel_info))
    {
      int misal = DR_MISALIGNMENT (dr_info);
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      SET_DR_MISALIGNMENT (dr_info, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}
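
/* Worked example (illustrative): peeling npeel = 3 iterations for a DR
   with step 4, known misalignment 4 and constant target alignment 16
   gives (4 + 3 * 4) & 15 = 0, i.e. this access becomes aligned too.  */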
/* Function verify_data_ref_alignment

   Return TRUE if DR_INFO can be handled with respect to alignment.  */

static opt_result
verify_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (vinfo, dr_info, false);
  if (!supportable_dr_alignment)
    return opt_result::failure_at
      (dr_info->stmt->stmt,
       DR_IS_READ (dr_info->dr)
       ? "not vectorized: unsupported unaligned load: %T\n"
       : "not vectorized: unsupported unaligned store: %T\n",
       DR_REF (dr_info->dr));

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return opt_result::success ();
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

opt_result
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      opt_result res = verify_data_ref_alignment (vinfo, dr_info);
      if (!res)
        return res;
    }

  return opt_result::success ();
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR_INFO is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr_info))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size = %wd. misalignment = %d.\n", elmsize,
                           DR_MISALIGNMENT (dr_info));
        }
      if (DR_MISALIGNMENT (dr_info) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr_info))
    {
      tree type = TREE_TYPE (DR_REF (dr_info->dr));
      bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
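
/* Illustrative check with example values: for V4SI (nelements = 4), a
   misalignment of one element and DR_GROUP_SIZE 2, the prolog would need
   4 - 1 = 3 iterations, which is not a multiple of 2, so alignment via
   peeling is not considered reachable for that interleaved access.  */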
/* Calculate the cost of the memory access represented by DR_INFO.  */

static void
vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec,
                           stmt_vector_for_cost *prologue_cost_vec)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr_info->dr))
    vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
                        outside_cost, prologue_cost_vec, body_cost_vec, false);
  else
    vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}


typedef struct _vect_peel_info
{
  dr_vec_info *dr_info;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  vec_info *vinfo;
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
1349 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1350 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1353 struct _vect_peel_info elem
, *slot
;
1354 _vect_peel_info
**new_slot
;
1355 bool supportable_dr_alignment
1356 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, true);
1359 slot
= peeling_htab
->find (&elem
);
1364 slot
= XNEW (struct _vect_peel_info
);
1365 slot
->npeel
= npeel
;
1366 slot
->dr_info
= dr_info
;
1368 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1372 if (!supportable_dr_alignment
1373 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1374 slot
->count
+= VECT_MAX_COST
;
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr_info = elem->dr_info;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
   we assume DR0_INFO's misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
                                dr_vec_info *dr0_info,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                stmt_vector_for_cost *prologue_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr_info);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr_info == dr0_info)
        SET_DR_MISALIGNMENT (dr_info, 0);
      else
        vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
      vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
                                 body_cost_vec, prologue_cost_vec);
      SET_DR_MISALIGNMENT (dr_info, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
                                  &outside_cost, &body_cost_vec,
                                  &prologue_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr_info = elem->dr_info;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr_info = NULL;
  res.vinfo = loop_vinfo;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0_info->dr)
        continue;

      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr_info);
      vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
      supportable_dr_alignment
        = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
      SET_DR_MISALIGNMENT (dr_info, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){           # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */

opt_result
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  dr_vec_info *first_store = NULL;
  dr_vec_info *dr0_info = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  dr_vec_info *unsupportable_dr_info = NULL;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int mis, same_align_drs_max = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* For scatter-gather or invariant accesses there is nothing
         to enhance.  */
      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
          || integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      supportable_dr_alignment
        = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
      do_peeling = vector_alignment_reachable_p (dr_info);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr_info))
            {
              unsigned int npeel_tmp = 0;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              /* If known_alignment_for_access_p then we have set
                 DR_MISALIGNMENT which is only done if we know it at compiler
                 time, so it is safe to assume target alignment is constant.
               */
              unsigned int target_align =
                DR_TARGET_ALIGNMENT (dr_info).to_constant ();
              unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
              mis = (negative
                     ? DR_MISALIGNMENT (dr_info)
                     : -DR_MISALIGNMENT (dr_info));
              if (DR_MISALIGNMENT (dr_info) != 0)
                npeel_tmp = (mis & (target_align - 1)) / dr_size;

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  Vectorization factor will be 8.  If
                 both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will be aligned also by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for cost model, since we calculate
                 cost for every peeling option.  */
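              /* Worked example (illustrative): with target_align = 16,
                 dr_size = 4 and a misalignment of 12 bytes, npeel_tmp
                 starts at (-12 & 15) / 4 = 1 and the loop below also
                 records 1 + 4 = 5, 1 + 8 = 9, ... as candidate peeling
                 amounts, one per whole vector, up to the limit derived
                 from the vectorization factor.  */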
1795 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1797 poly_uint64 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1798 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1799 possible_npeel_number
1800 = vect_get_num_vectors (nscalars
, vectype
);
1802 /* NPEEL_TMP is 0 when there is no misalignment, but also
1803 allow peeling NELEMENTS. */
1804 if (DR_MISALIGNMENT (dr_info
) == 0)
1805 possible_npeel_number
++;
1808 /* Save info about DR in the hash table. Also include peeling
1809 amounts according to the explanation above. */
1810 for (j
= 0; j
< possible_npeel_number
; j
++)
1812 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1813 dr_info
, npeel_tmp
);
1814 npeel_tmp
+= target_align
/ dr_size
;
1817 one_misalignment_known
= true;
1821 /* If we don't know any misalignment values, we prefer
1822 peeling for data-ref that has the maximum number of data-refs
1823 with the same alignment, unless the target prefers to align
1824 stores over load. */
1825 unsigned same_align_drs
1826 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1828 || same_align_drs_max
< same_align_drs
)
1830 same_align_drs_max
= same_align_drs
;
1833 /* For data-refs with the same number of related
1834 accesses prefer the one where the misalign
1835 computation will be invariant in the outermost loop. */
1836 else if (same_align_drs_max
== same_align_drs
)
1838 class loop
*ivloop0
, *ivloop
;
1839 ivloop0
= outermost_invariant_loop_for_expr
1840 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
1841 ivloop
= outermost_invariant_loop_for_expr
1842 (loop
, DR_BASE_ADDRESS (dr
));
1843 if ((ivloop
&& !ivloop0
)
1844 || (ivloop
&& ivloop0
1845 && flow_loop_nested_p (ivloop
, ivloop0
)))
1849 one_misalignment_unknown
= true;
1851 /* Check for data refs with unsupportable alignment that
1853 if (!supportable_dr_alignment
)
1855 one_dr_unsupportable
= true;
1856 unsupportable_dr_info
= dr_info
;
1859 if (!first_store
&& DR_IS_WRITE (dr
))
1860 first_store
= dr_info
;
1865 if (!aligned_access_p (dr_info
))
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1869 "vector alignment may not be reachable\n");
  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;
  struct _vect_peel_extended_info peel_for_known_alignment;
  struct _vect_peel_extended_info peel_for_unknown_alignment;
  struct _vect_peel_extended_info best_peel;

  peel_for_unknown_alignment.inside_cost = INT_MAX;
  peel_for_unknown_alignment.outside_cost = INT_MAX;
  peel_for_unknown_alignment.peel_info.count = 0;

  if (do_peeling
      && one_misalignment_unknown)
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      unsigned int load_inside_cost = 0;
      unsigned int load_outside_cost = 0;
      unsigned int store_inside_cost = 0;
      unsigned int store_outside_cost = 0;
      unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
                                      &load_inside_cost,
                                      &load_outside_cost,
                                      &dummy, &dummy, estimated_npeels, true);
      dummy.release ();

      if (first_store)
        {
          dummy.create (2);
          vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
                                          &store_inside_cost,
                                          &store_outside_cost,
                                          &dummy, &dummy,
                                          estimated_npeels, true);
          dummy.release ();
        }
      else
        {
          store_inside_cost = INT_MAX;
          store_outside_cost = INT_MAX;
        }

      if (load_inside_cost > store_inside_cost
          || (load_inside_cost == store_inside_cost
              && load_outside_cost > store_outside_cost))
        {
          dr0_info = first_store;
          peel_for_unknown_alignment.inside_cost = store_inside_cost;
          peel_for_unknown_alignment.outside_cost = store_outside_cost;
        }
      else
        {
          peel_for_unknown_alignment.inside_cost = load_inside_cost;
          peel_for_unknown_alignment.outside_cost = load_outside_cost;
        }

      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, estimated_npeels, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      peel_for_unknown_alignment.peel_info.count = 1
        + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
    }

  peel_for_unknown_alignment.peel_info.npeel = 0;
  peel_for_unknown_alignment.peel_info.dr_info = dr0_info;

  best_peel = peel_for_unknown_alignment;

  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr_info = NULL;
  if (do_peeling && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling from
         the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
        (&peeling_htab, loop_vinfo);
    }

  /* Compare costs of peeling for known and unknown alignment.  */
  if (peel_for_known_alignment.peel_info.dr_info != NULL
      && peel_for_unknown_alignment.inside_cost
         >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
         peeling at all except if there is an unsupportable dr that we can
         align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
        do_peeling = false;
    }

  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0_info = unsupportable_dr_info;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
         TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
                                      &nopeel_outside_cost, &dummy, &dummy,
                                      0, false);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
         costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, 0, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0_info = best_peel.peel_info.dr_info;

      /* If not peeling is no more expensive than the best peeling found
         so far, don't perform any peeling.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
        do_peeling = false;
    }
  if (do_peeling)
    {
      stmt_vec_info stmt_info = dr0_info->stmt;
      vectype = STMT_VINFO_VECTYPE (stmt_info);

      if (known_alignment_for_access_p (dr0_info))
        {
          bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
                                                size_zero_node) < 0;
          if (!npeel)
            {
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = (negative
                     ? DR_MISALIGNMENT (dr0_info)
                     : -DR_MISALIGNMENT (dr0_info));
              /* If known_alignment_for_access_p then we have set
                 DR_MISALIGNMENT which is only done if we know it at compiler
                 time, so it is safe to assume target alignment is constant.
               */
              unsigned int target_align =
                DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
              npeel = ((mis & (target_align - 1))
                       / vect_get_scalar_dr_size (dr0_info));
            }

          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= DR_GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);
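          /* Worked example added in this rewrite (illustrative numbers only):
             an NPEEL of 6 elements for an interleaving group of size 2
             becomes 3 peeled loop iterations, because each iteration of the
             scalar loop accesses every member of the group.  */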
        }

      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
        do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
        {
          opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
          if (!stat)
            do_peeling = false;
          else
            return stat;
        }

      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
        {
          unsigned max_allowed_peel
            = param_vect_max_peeling_for_alignment;
          if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
            max_allowed_peel = 0;
          if (max_allowed_peel != (unsigned)-1)
            {
              unsigned max_peel = npeel;
              if (max_peel == 0)
                {
                  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
                  unsigned HOST_WIDE_INT target_align_c;
                  if (target_align.is_constant (&target_align_c))
                    max_peel =
                      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
                  else
                    {
                      do_peeling = false;
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Disable peeling, max peels set and vector"
                                         " alignment unknown\n");
                    }
                }
              if (max_peel > max_allowed_peel)
                {
                  do_peeling = false;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Disable peeling, max peels reached: %d\n",
                                     max_peel);
                }
            }
        }

      /* Cost model #2 - if peeling may result in a remaining loop not
         iterating enough to be vectorized then do not peel.  Since this
         is a cost heuristic rather than a correctness decision, use the
         most likely runtime value for variable vectorization factors.  */
      if (do_peeling
          && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
          unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
          if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
              < assumed_vf + max_peel)
            do_peeling = false;
        }
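      /* Worked example added in this rewrite (illustrative numbers only):
         with an assumed VF of 8, NPEEL of 3 and only 10 known iterations,
         10 < 8 + 3, so peeling is disabled because the remaining loop would
         not iterate often enough to be vectorized.  */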
      if (do_peeling)
        {
          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
             If the misalignment of DR_i is identical to that of dr0 then set
             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
             by the peeling factor times the element size of DR_i (MOD the
             vectorization factor times the size).  Otherwise, the
             misalignment of DR_i must be set to unknown.  */
          FOR_EACH_VEC_ELT (datarefs, i, dr)
            if (dr != dr0_info->dr)
              {
                /* Strided accesses perform only component accesses, alignment
                   is irrelevant for them.  */
                dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
                stmt_info = dr_info->stmt;
                if (STMT_VINFO_STRIDED_P (stmt_info)
                    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
                  continue;

                vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
              }

          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
          if (npeel)
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
          else
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
              = DR_MISALIGNMENT (dr0_info);
          SET_DR_MISALIGNMENT (dr0_info, 0);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Alignment of access forced using peeling.\n");
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Peeling for alignment will be applied.\n");
            }

          /* The inside-loop cost will be accounted for in vectorizable_load
             and vectorizable_store correctly with adjusted alignments.
             Drop the body_cst_vec on the floor here.  */
          opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
          gcc_assert (stat);
          return stat;
        }
    }
  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed and the cost-model is not cheap
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning
    = (optimize_loop_nest_for_speed_p (loop)
       && !loop->inner /* FORNOW */
       && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
          stmt_vec_info stmt_info = dr_info->stmt;

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr_info)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
            continue;

          if (STMT_VINFO_STRIDED_P (stmt_info))
            {
              /* Strided loads perform only component accesses, alignment is
                 irrelevant for them.  */
              if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
                continue;
              do_versioning = false;
              break;
            }

          supportable_dr_alignment
            = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);

          if (!supportable_dr_alignment)
            {
              int mask;

              if (known_alignment_for_access_p (dr_info)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) param_vect_max_version_for_alignment_checks)
                {
                  do_versioning = false;
                  break;
                }

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              gcc_assert (vectype);

              /* At present we don't support versioning for alignment
                 with variable VF, since there's no guarantee that the
                 VF is a power of two.  We could relax this if we added
                 a way of enforcing a power-of-two size.  */
              unsigned HOST_WIDE_INT size;
              if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
                {
                  do_versioning = false;
                  break;
                }

              /* Forcing alignment in the first iteration is no good if
                 we don't keep it across iterations.  For now, just disable
                 versioning in this case.
                 ?? We could actually unroll the loop to achieve the required
                 overall step alignment, and forcing the alignment could be
                 done by doing some iterations of the non-vectorized loop.  */
              if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                               * DR_STEP_ALIGNMENT (dr),
                               DR_TARGET_ALIGNMENT (dr_info)))
                {
                  do_versioning = false;
                  break;
                }

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */
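              /* Note added in this rewrite: for a 16-byte vector mode the
                 mask below is 15 (0xf); the versioned loop is then guarded by
                 a runtime check that effectively tests (address & mask) == 0
                 for each potentially misaligned reference recorded via
                 LOOP_VINFO_PTR_MASK (illustrative description only).  */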
              mask = size - 1;

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  */
              if (LOOP_VINFO_PTR_MASK (loop_vinfo)
                  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
                {
                  do_versioning = false;
                  break;
                }

              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
            }
        }

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
        do_versioning = false;
      else if (!do_versioning)
        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<stmt_vec_info> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      stmt_vec_info stmt_info;

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
        {
          dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
          SET_DR_MISALIGNMENT (dr_info, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations in VINFO according to the chosen
   vectorization factor.  */

static void
vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
    return;

  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  /* Two references with distance zero have the same alignment.  */
  poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
                          - wi::to_poly_offset (DR_INIT (drb)));
  if (maybe_ne (diff, 0))
    {
      /* Get the wider of the two alignments.  */
      poly_uint64 align_a =
        exact_div (vect_calculate_target_alignment (dr_info_a),
                   BITS_PER_UNIT);
      poly_uint64 align_b =
        exact_div (vect_calculate_target_alignment (dr_info_b),
                   BITS_PER_UNIT);
      unsigned HOST_WIDE_INT align_a_c, align_b_c;
      if (!align_a.is_constant (&align_a_c)
          || !align_b.is_constant (&align_b_c))
        return;

      unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);

      /* Require the gap to be a multiple of the larger vector alignment.  */
      if (!multiple_p (diff, max_align))
        return;
    }

  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "accesses have the same alignment: %T and %T\n",
                     DR_REF (dra), DR_REF (drb));
}
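/* Illustrative example added in this rewrite (numbers are hypothetical, not
   from the original source): two int accesses whose DR_INITs differ by 32
   bytes are recorded above as having the same alignment when both target
   alignments are 16 bytes or less, because the 32-byte gap is a multiple of
   the larger alignment.  */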
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

opt_result
vect_analyze_data_refs_alignment (loop_vec_info vinfo)
{
  DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->shared->ddrs;
  struct data_dependence_relation *ddr;
  unsigned int i;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (vinfo, ddr);

  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;

  vect_record_base_alignments (vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
        vect_compute_data_ref_alignment (vinfo, dr_info);
    }

  return opt_result::success ();
}
/* Analyze alignment of DRs of stmts in NODE.  */

static bool
vect_slp_analyze_and_verify_node_alignment (vec_info *vinfo, slp_tree node)
{
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);

  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
  vect_compute_data_ref_alignment (vinfo, dr_info);
  /* For creating the data-ref pointer we need alignment of the
     first element anyway.  */
  if (dr_info != first_dr_info)
    vect_compute_data_ref_alignment (vinfo, first_dr_info);
  if (! verify_data_ref_alignment (vinfo, dr_info))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");
      return false;
    }

  return true;
}

/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_slp_analyze_and_verify_instance_alignment (vec_info *vinfo,
                                                slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");

  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_and_verify_node_alignment (vinfo, node))
      return false;

  node = SLP_INSTANCE_TREE (instance);
  if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
      && ! vect_slp_analyze_and_verify_node_alignment
           (vinfo, SLP_INSTANCE_TREE (instance)))
    return false;

  return true;
}
/* Analyze groups of accesses: check that DR_INFO belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.
   Worker for vect_analyze_group_access.  */

static bool
vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
{
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
    {
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of type size.  Otherwise there is
         a non-element-sized gap at the end of the group which we
         cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
         ???  As we can handle non-constant step fine here we should
         simply remove uses of DR_GROUP_GAP between the last and first
         element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
         simply not include that gap.  */
      if ((dr_step % type_size) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Step %T is not a multiple of the element size"
                             " for %T\n", step, DR_REF (dr));
          return false;
        }
      groupsize = absu_hwi (dr_step) / type_size;
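      /* Worked example added in this rewrite (illustrative numbers only):
         a constant step of 32 bytes over 4-byte elements gives a group size
         of 8 elements, counting any gaps inside the group.  */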
    }
  else
    groupsize = 0;

  /* Non-consecutive access is possible only if it is a part of interleaving.  */
  if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Check if this DR is a part of interleaving, and is a single
         element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the
         type size.  */
      if (DR_IS_READ (dr)
          && (dr_step % type_size) == 0
          && groupsize > 0)
        {
          DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
          DR_GROUP_SIZE (stmt_info) = groupsize;
          DR_GROUP_GAP (stmt_info) = groupsize - 1;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Detected single element interleaving %T"
                             " step %T\n", DR_REF (dr), step);
          return true;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not consecutive access %G", stmt_info->stmt);

      if (bb_vinfo)
        {
          /* Mark the statement as unvectorizable.  */
          STMT_VINFO_VECTORIZABLE (stmt_info) = false;
          return true;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
      STMT_VINFO_STRIDED_P (stmt_info) = true;
      return true;
    }

  if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
    {
      /* First stmt in the interleaving chain.  Check the chain.  */
      stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      HOST_WIDE_INT diff, gaps = 0;

      /* By construction, all group members have INTEGER_CST DR_INITs.  */
      while (next)
        {
          /* We never have the same DR multiple times.  */
          gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
                                            DR_INIT (STMT_VINFO_DATA_REF (next)))
                      != 0);

          data_ref = STMT_VINFO_DATA_REF (next);

          /* All group members have the same STEP by construction.  */
          gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size.  Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;
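          /* Worked example added in this rewrite (illustrative numbers only):
             with 4-byte elements, a previous DR_INIT of 0 and a current
             DR_INIT of 8, DIFF is 2, i.e. there is a gap of one element
             between the two group members.  */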
2599 /* FORNOW: SLP of accesses with gaps is not supported. */
2600 slp_impossible
= true;
2601 if (DR_IS_WRITE (data_ref
))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2605 "interleaved store with gaps\n");
2612 last_accessed_element
+= diff
;
2614 /* Store the gap from the previous member of the group. If there is no
2615 gap in the access, DR_GROUP_GAP is always 1. */
2616 DR_GROUP_GAP (next
) = diff
;
2618 prev_init
= DR_INIT (data_ref
);
2619 next
= DR_GROUP_NEXT_ELEMENT (next
);
2620 /* Count the number of data-refs in the chain. */
2625 groupsize
= count
+ gaps
;
2627 /* This could be UINT_MAX but as we are generating code in a very
2628 inefficient way we have to cap earlier. See PR78699 for example. */
2629 if (groupsize
> 4096)
2631 if (dump_enabled_p ())
2632 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2633 "group is too large\n");
2637 /* Check that the size of the interleaving is equal to count for stores,
2638 i.e., that there are no gaps. */
2639 if (groupsize
!= count
2640 && !DR_IS_READ (dr
))
2643 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2646 /* If there is a gap after the last load in the group it is the
2647 difference between the groupsize and the last accessed
2649 When there is no gap, this difference should be 0. */
2650 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
2652 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2653 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE
, vect_location
,
2656 "Detected interleaving ");
2657 if (DR_IS_READ (dr
))
2658 dump_printf (MSG_NOTE
, "load ");
2659 else if (STMT_VINFO_STRIDED_P (stmt_info
))
2660 dump_printf (MSG_NOTE
, "strided store ");
2662 dump_printf (MSG_NOTE
, "store ");
2663 dump_printf (MSG_NOTE
, "of size %u\n",
2664 (unsigned)groupsize
);
2665 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
2666 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2669 if (DR_GROUP_GAP (next
) != 1)
2670 dump_printf_loc (MSG_NOTE
, vect_location
,
2671 "\t<gap of %d elements>\n",
2672 DR_GROUP_GAP (next
) - 1);
2673 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
2674 next
= DR_GROUP_NEXT_ELEMENT (next
);
2676 if (DR_GROUP_GAP (stmt_info
) != 0)
2677 dump_printf_loc (MSG_NOTE
, vect_location
,
2678 "\t<gap of %d elements>\n",
2679 DR_GROUP_GAP (stmt_info
));
2682 /* SLP: create an SLP data structure for every interleaving group of
2683 stores for further analysis in vect_analyse_slp. */
2684 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2687 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
2689 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
2696 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2697 accesses of legal size, step, etc. Detect gaps, single element
2698 interleaving, and other special cases. Set grouped access info.
2699 Collect groups of strided stores for further use in SLP analysis. */
2702 vect_analyze_group_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2704 if (!vect_analyze_group_access_1 (vinfo
, dr_info
))
2706 /* Dissolve the group if present. */
2707 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
2710 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2711 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2712 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
2720 /* Analyze the access pattern of the data-reference DR_INFO.
2721 In case of non-consecutive accesses call vect_analyze_group_access() to
2722 analyze groups of accesses. */
2725 vect_analyze_data_ref_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2727 data_reference
*dr
= dr_info
->dr
;
2728 tree step
= DR_STEP (dr
);
2729 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2730 stmt_vec_info stmt_info
= dr_info
->stmt
;
2731 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2732 class loop
*loop
= NULL
;
2734 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2738 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2740 if (loop_vinfo
&& !step
)
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2744 "bad data-ref access in loop\n");
2748 /* Allow loads with zero step in inner-loop vectorization. */
2749 if (loop_vinfo
&& integer_zerop (step
))
2751 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2752 if (!nested_in_vect_loop_p (loop
, stmt_info
))
2753 return DR_IS_READ (dr
);
2754 /* Allow references with zero step for outer loops marked
2755 with pragma omp simd only - it guarantees absence of
2756 loop-carried dependencies between inner loop iterations. */
2757 if (loop
->safelen
< 2)
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_NOTE
, vect_location
,
2761 "zero step in inner loop of nest\n");
2766 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2768 /* Interleaved accesses are not yet supported within outer-loop
2769 vectorization for references in the inner-loop. */
2770 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2772 /* For the rest of the analysis we use the outer-loop step. */
2773 step
= STMT_VINFO_DR_STEP (stmt_info
);
2774 if (integer_zerop (step
))
2776 if (dump_enabled_p ())
2777 dump_printf_loc (MSG_NOTE
, vect_location
,
2778 "zero step in outer loop.\n");
2779 return DR_IS_READ (dr
);
2784 if (TREE_CODE (step
) == INTEGER_CST
)
2786 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2787 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2789 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2791 /* Mark that it is not interleaving. */
2792 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2797 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_NOTE
, vect_location
,
2801 "grouped access in outer loop.\n");
2806 /* Assume this is a DR handled by non-constant strided load case. */
2807 if (TREE_CODE (step
) != INTEGER_CST
)
2808 return (STMT_VINFO_STRIDED_P (stmt_info
)
2809 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2810 || vect_analyze_group_access (vinfo
, dr_info
)));
2812 /* Not consecutive access - check if it's a part of interleaving group. */
2813 return vect_analyze_group_access (vinfo
, dr_info
);
2816 /* Compare two data-references DRA and DRB to group them into chunks
2817 suitable for grouping. */
2820 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2822 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2823 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2826 /* Stabilize sort. */
2830 /* DRs in different loops never belong to the same group. */
2831 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2832 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2834 return loopa
->num
< loopb
->num
? -1 : 1;
2836 /* Ordering of DRs according to base. */
2837 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2838 DR_BASE_ADDRESS (drb
));
2842 /* And according to DR_OFFSET. */
2843 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2847 /* Put reads before writes. */
2848 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2849 return DR_IS_READ (dra
) ? -1 : 1;
2851 /* Then sort after access size. */
2852 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2853 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2857 /* And after step. */
2858 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2862 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2863 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2865 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2869 /* If OP is the result of a conversion, return the unconverted value,
2870 otherwise return null. */
2873 strip_conversion (tree op
)
2875 if (TREE_CODE (op
) != SSA_NAME
)
2877 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
2878 if (!is_gimple_assign (stmt
)
2879 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
2881 return gimple_assign_rhs1 (stmt
);
2884 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2885 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2886 be grouped in SLP mode. */
2889 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
,
2892 if (gimple_assign_single_p (stmt1_info
->stmt
))
2893 return gimple_assign_single_p (stmt2_info
->stmt
);
2895 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
2896 if (call1
&& gimple_call_internal_p (call1
))
2898 /* Check for two masked loads or two masked stores. */
2899 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
2900 if (!call2
|| !gimple_call_internal_p (call2
))
2902 internal_fn ifn
= gimple_call_internal_fn (call1
);
2903 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
2905 if (ifn
!= gimple_call_internal_fn (call2
))
2908 /* Check that the masks are the same. Cope with casts of masks,
2909 like those created by build_mask_conversion. */
2910 tree mask1
= gimple_call_arg (call1
, 2);
2911 tree mask2
= gimple_call_arg (call2
, 2);
2912 if (!operand_equal_p (mask1
, mask2
, 0)
2913 && (ifn
== IFN_MASK_STORE
|| !allow_slp_p
))
2915 mask1
= strip_conversion (mask1
);
2918 mask2
= strip_conversion (mask2
);
2921 if (!operand_equal_p (mask1
, mask2
, 0))
2930 /* Function vect_analyze_data_ref_accesses.
2932 Analyze the access pattern of all the data references in the loop.
2934 FORNOW: the only access pattern that is considered vectorizable is a
2935 simple step 1 (consecutive) access.
2937 FORNOW: handle only arrays and pointer accesses. */
2940 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2943 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
2944 struct data_reference
*dr
;
2946 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2948 if (datarefs
.is_empty ())
2949 return opt_result::success ();
2951 /* Sort the array of datarefs to make building the interleaving chains
2952 linear. Don't modify the original vector's order, it is needed for
2953 determining what dependencies are reversed. */
2954 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2955 datarefs_copy
.qsort (dr_group_sort_cmp
);
2956 hash_set
<stmt_vec_info
> to_fixup
;
2958 /* Build the interleaving chains. */
2959 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2961 data_reference_p dra
= datarefs_copy
[i
];
2962 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
2963 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
2964 stmt_vec_info lastinfo
= NULL
;
2965 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
2966 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
2971 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2973 data_reference_p drb
= datarefs_copy
[i
];
2974 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
2975 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
2976 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
2977 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
2980 /* ??? Imperfect sorting (non-compatible types, non-modulo
2981 accesses, same accesses) can lead to a group to be artificially
2982 split here as we don't just skip over those. If it really
2983 matters we can push those to a worklist and re-iterate
2984 over them. The we can just skip ahead to the next DR here. */
2986 /* DRs in a different loop should not be put into the same
2987 interleaving group. */
2988 if (gimple_bb (DR_STMT (dra
))->loop_father
2989 != gimple_bb (DR_STMT (drb
))->loop_father
)
2992 /* Check that the data-refs have same first location (except init)
2993 and they are both either store or load (not load and store,
2994 not masked loads or stores). */
2995 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2996 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2997 DR_BASE_ADDRESS (drb
)) != 0
2998 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
2999 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3002 /* Check that the data-refs have the same constant size. */
3003 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3004 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3005 if (!tree_fits_uhwi_p (sza
)
3006 || !tree_fits_uhwi_p (szb
)
3007 || !tree_int_cst_equal (sza
, szb
))
3010 /* Check that the data-refs have the same step. */
3011 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3014 /* Check the types are compatible.
3015 ??? We don't distinguish this during sorting. */
3016 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3017 TREE_TYPE (DR_REF (drb
))))
3020 /* Check that the DR_INITs are compile-time constants. */
3021 if (TREE_CODE (DR_INIT (dra
)) != INTEGER_CST
3022 || TREE_CODE (DR_INIT (drb
)) != INTEGER_CST
)
3025 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3026 just hold extra information. */
3027 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3028 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3029 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3032 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3033 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3034 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3035 HOST_WIDE_INT init_prev
3036 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]));
3037 gcc_assert (init_a
<= init_b
3038 && init_a
<= init_prev
3039 && init_prev
<= init_b
);
3041 /* Do not place the same access in the interleaving chain twice. */
3042 if (init_b
== init_prev
)
3044 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]))
3045 < gimple_uid (DR_STMT (drb
)));
3046 /* Simply link in duplicates and fix up the chain below. */
3050 /* If init_b == init_a + the size of the type * k, we have an
3051 interleaving, and DRA is accessed before DRB. */
3052 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3053 if (type_size_a
== 0
3054 || (init_b
- init_a
) % type_size_a
!= 0)
3057 /* If we have a store, the accesses are adjacent. This splits
3058 groups into chunks we support (we don't support vectorization
3059 of stores with gaps). */
3060 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
3063 /* If the step (if not zero or non-constant) is greater than the
3064 difference between data-refs' inits this splits groups into
3066 if (tree_fits_shwi_p (DR_STEP (dra
)))
3068 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
3069 if (step
!= 0 && step
<= (init_b
- init_a
))
3074 if (dump_enabled_p ())
3075 dump_printf_loc (MSG_NOTE
, vect_location
,
3077 ? "Detected interleaving load %T and %T\n"
3078 : "Detected interleaving store %T and %T\n",
3079 DR_REF (dra
), DR_REF (drb
));
3081 /* Link the found element into the group list. */
3082 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3084 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3085 lastinfo
= stmtinfo_a
;
3087 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3088 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3089 lastinfo
= stmtinfo_b
;
3091 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3092 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3094 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3095 dump_printf_loc (MSG_NOTE
, vect_location
,
3096 "Load suitable for SLP vectorization only.\n");
3098 if (init_b
== init_prev
3099 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3100 && dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE
, vect_location
,
3102 "Queuing group with duplicate access for fixup\n");
3106 /* Fixup groups with duplicate entries by splitting it. */
3109 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3110 if (!(it
!= to_fixup
.end ()))
3112 stmt_vec_info grp
= *it
;
3113 to_fixup
.remove (grp
);
3115 /* Find the earliest duplicate group member. */
3116 unsigned first_duplicate
= -1u;
3117 stmt_vec_info next
, g
= grp
;
3118 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3120 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3121 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3122 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3123 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3126 if (first_duplicate
== -1U)
3129 /* Then move all stmts after the first duplicate to a new group.
3130 Note this is a heuristic but one with the property that *it
3131 is fixed up completely. */
3133 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3134 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3136 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3138 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3142 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3144 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3147 g
= DR_GROUP_NEXT_ELEMENT (g
);
3149 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3151 /* Fixup the new group which still may contain duplicates. */
3152 to_fixup
.add (newgroup
);
3155 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
3157 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
3158 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3159 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3163 "not vectorized: complicated access pattern.\n");
3165 if (is_a
<bb_vec_info
> (vinfo
))
3167 /* Mark the statement as not vectorizable. */
3168 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3173 datarefs_copy
.release ();
3174 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3176 " complicated access pattern.\n");
3181 datarefs_copy
.release ();
3182 return opt_result::success ();
/* Function vect_vfa_segment_size.

   Input:
     DR_INFO: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return a value suitable for the dr_with_seg_len::seg_len field.
   This is the "distance travelled" by the pointer from the first
   iteration in the segment to the last.  Note that it does not include
   the size of the access; in effect it only describes the first byte.  */

static tree
vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
{
  length_factor = size_binop (MINUS_EXPR,
                              fold_convert (sizetype, length_factor),
                              size_one_node);
  return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
                     length_factor);
}
->dr
)),
3206 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3207 gives the worst-case number of bytes covered by the segment. */
3209 static unsigned HOST_WIDE_INT
3210 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3212 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3213 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3214 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3215 unsigned HOST_WIDE_INT access_size
= ref_size
;
3216 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3218 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3219 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3221 if (STMT_VINFO_VEC_STMT (stmt_vinfo
)
3222 && (vect_supportable_dr_alignment (vinfo
, dr_info
, false)
3223 == dr_explicit_realign_optimized
))
3225 /* We might access a full vector's worth. */
3226 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3227 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3232 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3236 vect_vfa_align (dr_vec_info
*dr_info
)
3238 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info
->dr
)));
/* Function vect_no_alias_p.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

static int
vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
                         tree segment_length_a, tree segment_length_b,
                         unsigned HOST_WIDE_INT access_size_a,
                         unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
                              offset_b, const_length_b))
    return 1;

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
                               offset_b, const_length_b))
    return 0;

  return -1;
}
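/* Worked example added in this rewrite (illustrative numbers only): with
   offsets 0 and 16 and both adjusted lengths equal to 16 bytes, the ranges
   [0, 16) and [16, 32) cannot overlap and the function above returns 0
   (no alias); if the first length were 20 bytes the ranges would be known to
   overlap and it would return 1.  */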
3293 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3297 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3298 unsigned int loop_depth
, poly_uint64 vf
)
3300 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3301 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3304 /* If the dependence is exact, we should have limited the VF instead. */
3305 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3308 lambda_vector dist_v
;
3309 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3311 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3313 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3314 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_NOTE
, vect_location
,
3320 "dependence distance between %T and %T is >= VF\n",
3321 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3326 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3329 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3331 dump_printf (dump_kind
, "%s (%T) >= ",
3332 lower_bound
.unsigned_p
? "unsigned" : "abs",
3334 dump_dec (dump_kind
, lower_bound
.min_value
);
3337 /* Record that the vectorized loop requires the vec_lower_bound described
3338 by EXPR, UNSIGNED_P and MIN_VALUE. */
3341 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3342 poly_uint64 min_value
)
3344 vec
<vec_lower_bound
> lower_bounds
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3345 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3346 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3348 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3349 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3350 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3351 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3353 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3354 lower_bounds
[i
].min_value
= min_value
;
3355 if (dump_enabled_p ())
3357 dump_printf_loc (MSG_NOTE
, vect_location
,
3358 "updating run-time check to ");
3359 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3360 dump_printf (MSG_NOTE
, "\n");
3366 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3367 if (dump_enabled_p ())
3369 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3370 dump_lower_bound (MSG_NOTE
, lower_bound
);
3371 dump_printf (MSG_NOTE
, "\n");
3373 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3376 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3377 will span fewer than GAP bytes. */
3380 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3383 stmt_vec_info stmt_info
= dr_info
->stmt
;
3385 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3386 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3387 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3388 return (estimated_poly_value (gap
)
3389 <= count
* vect_get_scalar_dr_size (dr_info
));
/* Return true if we know that there is no alias between DR_INFO_A and
   DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
   When returning true, set *LOWER_BOUND_OUT to this N.  */

static bool
vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
                                poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  */
  data_reference *dr_a = dr_info_a->dr;
  data_reference *dr_b = dr_info_b->dr;
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_info_a, dr_info_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
      && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
  return true;
}
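/* Worked example added in this rewrite (illustrative numbers only): for two
   accesses with the same base at byte offsets 0 and 4 and 4-byte elements,
   the bound computed above is 4 + 4 - 0 == 8, so the accesses cannot alias
   as long as abs (DR_STEP) >= 8 at runtime.  */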
3433 /* Function vect_prune_runtime_alias_test_list.
3435 Prune a list of ddrs to be tested at run-time by versioning for alias.
3436 Merge several alias checks into one if possible.
3437 Return FALSE if resulting list of ddrs is longer then allowed by
3438 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3441 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3443 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3444 hash_set
<tree_pair_hash
> compared_objects
;
3446 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3447 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3448 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3449 vec
<vec_object_pair
> &check_unequal_addrs
3450 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3451 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3452 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3458 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3460 /* Step values are irrelevant for aliasing if the number of vector
3461 iterations is equal to the number of scalar iterations (which can
3462 happen for fully-SLP loops). */
3463 bool ignore_step_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3467 /* Convert the checks for nonzero steps into bound tests. */
3469 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3470 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3473 if (may_alias_ddrs
.is_empty ())
3474 return opt_result::success ();
3476 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3478 unsigned int loop_depth
3479 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3480 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3482 /* First, we collect all data ref pairs for aliasing checks. */
3483 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3485 poly_uint64 lower_bound
;
3486 tree segment_length_a
, segment_length_b
;
3487 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3488 unsigned int align_a
, align_b
;
3490 /* Ignore the alias if the VF we chose ended up being no greater
3491 than the dependence distance. */
3492 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3495 if (DDR_OBJECT_A (ddr
))
3497 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3498 if (!compared_objects
.add (new_pair
))
3500 if (dump_enabled_p ())
3501 dump_printf_loc (MSG_NOTE
, vect_location
,
3502 "checking that %T and %T"
3503 " have different addresses\n",
3504 new_pair
.first
, new_pair
.second
);
3505 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3510 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3511 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3513 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3514 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3516 bool preserves_scalar_order_p
3517 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3519 /* Skip the pair if inter-iteration dependencies are irrelevant
3520 and intra-iteration dependencies are guaranteed to be honored. */
3522 && (preserves_scalar_order_p
3523 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3526 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE
, vect_location
,
3528 "no need for alias check between "
3529 "%T and %T when VF is 1\n",
3530 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3534 /* See whether we can handle the alias using a bounds check on
3535 the step, and whether that's likely to be the best approach.
3536 (It might not be, for example, if the minimum step is much larger
3537 than the number of bytes handled by one vector iteration.) */
3539 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3540 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3542 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3543 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3545 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3546 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3549 "%T and %T when the step %T is outside ",
3550 DR_REF (dr_info_a
->dr
),
3551 DR_REF (dr_info_b
->dr
),
3552 DR_STEP (dr_info_a
->dr
));
3554 dump_printf (MSG_NOTE
, "[0");
3557 dump_printf (MSG_NOTE
, "(");
3558 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3560 dump_printf (MSG_NOTE
, ", ");
3561 dump_dec (MSG_NOTE
, lower_bound
);
3562 dump_printf (MSG_NOTE
, ")\n");
3564 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
3565 unsigned_p
, lower_bound
);
3569 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
3570 if (dr_group_first_a
)
3572 stmt_info_a
= dr_group_first_a
;
3573 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
3576 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
3577 if (dr_group_first_b
)
3579 stmt_info_b
= dr_group_first_b
;
3580 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
3585 segment_length_a
= size_zero_node
;
3586 segment_length_b
= size_zero_node
;
3590 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
3591 DR_STEP (dr_info_b
->dr
), 0))
3592 length_factor
= scalar_loop_iters
;
3594 length_factor
= size_int (vect_factor
);
3595 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
3596 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
3598 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
3599 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
3600 align_a
= vect_vfa_align (dr_info_a
);
3601 align_b
= vect_vfa_align (dr_info_b
);
3603 /* See whether the alias is known at compilation time. */
3604 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
3605 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
3606 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
3607 DR_OFFSET (dr_info_b
->dr
), 0)
3608 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
3609 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
3610 && poly_int_tree_p (segment_length_a
)
3611 && poly_int_tree_p (segment_length_b
))
3613 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
3618 if (res
>= 0 && dump_enabled_p ())
3620 dump_printf_loc (MSG_NOTE
, vect_location
,
3621 "can tell at compile time that %T and %T",
3622 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3624 dump_printf (MSG_NOTE
, " do not alias\n");
3626 dump_printf (MSG_NOTE
, " alias\n");
3633 return opt_result::failure_at (stmt_info_b
->stmt
,
3635 " compilation time alias: %G%G",
      dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
			    access_size_a, align_a);
      dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
			    access_size_b, align_b);

      /* Canonicalize the order to be the one that's needed for accurate
	 RAW, WAR and WAW flags, in cases where the data references are
	 well-ordered.  The order doesn't really matter otherwise,
	 but we might as well be consistent.  */
      if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
	std::swap (dr_a, dr_b);

      dr_with_seg_len_pair_t dr_with_seg_len_pair
	(dr_a, dr_b, (preserves_scalar_order_p
		      ? dr_with_seg_len_pair_t::WELL_ORDERED
		      : dr_with_seg_len_pair_t::REORDERED));

      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
    }

  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);

  unsigned int count = (comp_alias_ddrs.length ()
			+ check_unequal_addrs.length ());

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "improved number of alias checks from %d to %d\n",
		     may_alias_ddrs.length (), count);

  unsigned limit = param_vect_max_version_for_alias_checks;
  if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
    limit = param_vect_max_version_for_alias_checks * 6 / 10;
  if (count > limit)
    return opt_result::failure_at
      (vect_location,
       "number of versioning for alias run-time tests exceeds %d "
       "(--param vect-max-version-for-alias-checks)\n", limit);

  return opt_result::success ();
}
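
/* Illustrative sketch (not part of the vectorizer sources): the pairs pushed
   onto COMP_ALIAS_DDRS above expand into run-time tests of the form "the two
   address segments do not overlap".  A plain-C model of that test, using
   hypothetical names and assuming each segment is [addr, addr + seg):  */

static inline bool
example_segments_do_not_overlap (unsigned long addr_a, unsigned long seg_a,
				 unsigned long addr_b, unsigned long seg_b)
{
  /* No aliasing if A ends at or before B starts, or B ends at or before
     A starts.  */
  return addr_a + seg_a <= addr_b || addr_b + seg_b <= addr_a;
}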
/* Check whether we can use an internal function for a gather load
   or scatter store.  READ_P is true for loads and false for stores.
   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
   the type of the memory elements being loaded or stored.  OFFSET_TYPE
   is the type of the offset that is being applied to the invariant
   base address.  SCALE is the amount by which the offset should
   be multiplied *after* it has been converted to address width.

   Return true if the function is supported, storing the function id in
   *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
bool
vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
			  tree vectype, tree memory_type, tree offset_type,
			  int scale, internal_fn *ifn_out,
			  tree *offset_vectype_out)
{
  unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
  unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
  if (element_bits != memory_bits)
    /* For now the vector elements must be the same width as the
       memory elements.  */
    return false;

  /* Work out which function we need.  */
  internal_fn ifn;
  if (read_p)
    ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
  else
    ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;

  for (;;)
    {
      tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
      if (!offset_vectype)
	return false;

      /* Test whether the target supports this combination.  */
      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
						  offset_vectype, scale))
	{
	  *ifn_out = ifn;
	  *offset_vectype_out = offset_vectype;
	  return true;
	}

      /* Otherwise retry with a wider offset type; give up once widening
	 can no longer help.  */
      if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
	  && TYPE_PRECISION (offset_type) >= element_bits)
	return false;

      offset_type = build_nonstandard_integer_type
	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
    }
}
/* STMT_INFO is a call to an internal gather load or scatter store function.
   Describe the operation in INFO.  */
static void
vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
				   gather_scatter_info *info)
{
  gcall *call = as_a <gcall *> (stmt_info->stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  info->ifn = gimple_call_internal_fn (call);
  info->decl = NULL_TREE;
  info->base = gimple_call_arg (call, 0);
  info->offset = gimple_call_arg (call, 1);
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
  info->element_type = TREE_TYPE (vectype);
  info->memory_type = TREE_TYPE (DR_REF (dr));
}
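
/* Illustrative sketch (not part of the vectorizer sources): the call
   described above has the shape IFN_GATHER_LOAD (base, offset, scale, ...),
   i.e. each lane reads from BASE plus its lane offset times SCALE.  A
   plain-C model of that semantics for int elements, with hypothetical
   names:  */

static void
example_gather_int (int *dst, const char *base, const int *offset,
		    long scale, int nlanes)
{
  for (int i = 0; i < nlanes; i++)
    dst[i] = *(const int *) (base + (long) offset[i] * scale);
}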
/* Return true if a non-affine read or write in STMT_INFO is suitable for a
   gather load or scatter store.  Describe the operation in *INFO if so.  */
3761 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
3762 gather_scatter_info
*info
)
3764 HOST_WIDE_INT scale
= 1;
3765 poly_int64 pbitpos
, pbitsize
;
3766 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3767 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3768 tree offtype
= NULL_TREE
;
3769 tree decl
= NULL_TREE
, base
, off
;
3770 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3771 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3773 int punsignedp
, reversep
, pvolatilep
= 0;
3775 tree offset_vectype
;
3776 bool masked_p
= false;
3778 /* See whether this is already a call to a gather/scatter internal function.
3779 If not, see whether it's a masked load or store. */
3780 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
3781 if (call
&& gimple_call_internal_p (call
))
3783 ifn
= gimple_call_internal_fn (call
);
3784 if (internal_gather_scatter_fn_p (ifn
))
3786 vect_describe_gather_scatter_call (stmt_info
, info
);
3789 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
3792 /* True if we should aim to use internal functions rather than
3793 built-in functions. */
3794 bool use_ifn_p
= (DR_IS_READ (dr
)
3795 ? supports_vec_gather_load_p ()
3796 : supports_vec_scatter_store_p ());
3799 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3800 see if we can use the def stmt of the address. */
3802 && TREE_CODE (base
) == MEM_REF
3803 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3804 && integer_zerop (TREE_OPERAND (base
, 1))
3805 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3807 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3808 if (is_gimple_assign (def_stmt
)
3809 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3810 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
  /* The gather and scatter builtins need an address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
3825 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3826 &punsignedp
, &reversep
, &pvolatilep
);
3830 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3832 if (TREE_CODE (base
) == MEM_REF
)
3834 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3836 if (off
== NULL_TREE
)
3837 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3839 off
= size_binop (PLUS_EXPR
, off
,
3840 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3842 base
= TREE_OPERAND (base
, 0);
3845 base
= build_fold_addr_expr (base
);
3847 if (off
== NULL_TREE
)
3848 off
= size_zero_node
;
  /* If BASE is not loop invariant then, if OFF is 0, we start with just
     the constant offset in the loop-invariant BASE and continue with BASE
     as OFF; otherwise we give up.
     We could handle that case by gimplifying the addition of BASE + OFF
     into some SSA_NAME and using that as OFF, but for now punt.  */
3855 if (!expr_invariant_in_loop_p (loop
, base
))
3857 if (!integer_zerop (off
))
3860 base
= size_int (pbytepos
);
3862 /* Otherwise put base + constant offset into the loop invariant BASE
3863 and continue with OFF. */
3866 base
= fold_convert (sizetype
, base
);
3867 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3870 /* OFF at this point may be either a SSA_NAME or some tree expression
3871 from get_inner_reference. Try to peel off loop invariants from it
3872 into BASE as long as possible. */
3874 while (offtype
== NULL_TREE
)
3876 enum tree_code code
;
3877 tree op0
, op1
, add
= NULL_TREE
;
3879 if (TREE_CODE (off
) == SSA_NAME
)
3881 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3883 if (expr_invariant_in_loop_p (loop
, off
))
3886 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3889 op0
= gimple_assign_rhs1 (def_stmt
);
3890 code
= gimple_assign_rhs_code (def_stmt
);
3891 op1
= gimple_assign_rhs2 (def_stmt
);
3895 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3897 code
= TREE_CODE (off
);
3898 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3902 case POINTER_PLUS_EXPR
:
3904 if (expr_invariant_in_loop_p (loop
, op0
))
3909 add
= fold_convert (sizetype
, add
);
3911 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3912 base
= size_binop (PLUS_EXPR
, base
, add
);
3915 if (expr_invariant_in_loop_p (loop
, op1
))
3923 if (expr_invariant_in_loop_p (loop
, op1
))
3925 add
= fold_convert (sizetype
, op1
);
3926 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3932 if (scale
== 1 && tree_fits_shwi_p (op1
))
3934 int new_scale
= tree_to_shwi (op1
);
3935 /* Only treat this as a scaling operation if the target
3936 supports it for at least some offset type. */
3938 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3939 masked_p
, vectype
, memory_type
,
3940 signed_char_type_node
,
3943 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3944 masked_p
, vectype
, memory_type
,
3945 unsigned_char_type_node
,
3958 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3959 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3962 /* Don't include the conversion if the target is happy with
3963 the current offset type. */
3965 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3966 masked_p
, vectype
, memory_type
,
3967 TREE_TYPE (off
), scale
, &ifn
,
3971 if (TYPE_PRECISION (TREE_TYPE (op0
))
3972 == TYPE_PRECISION (TREE_TYPE (off
)))
3978 if (TYPE_PRECISION (TREE_TYPE (op0
))
3979 < TYPE_PRECISION (TREE_TYPE (off
)))
3982 offtype
= TREE_TYPE (off
);
3993 /* If at the end OFF still isn't a SSA_NAME or isn't
3994 defined in the loop, punt. */
3995 if (TREE_CODE (off
) != SSA_NAME
3996 || expr_invariant_in_loop_p (loop
, off
))
3999 if (offtype
== NULL_TREE
)
4000 offtype
= TREE_TYPE (off
);
4004 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4005 vectype
, memory_type
, offtype
, scale
,
4006 &ifn
, &offset_vectype
))
4011 if (DR_IS_READ (dr
))
4013 if (targetm
.vectorize
.builtin_gather
)
4014 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4018 if (targetm
.vectorize
.builtin_scatter
)
4019 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4026 /* The offset vector type will be read from DECL when needed. */
4027 offset_vectype
= NULL_TREE
;
4034 info
->offset_dt
= vect_unknown_def_type
;
4035 info
->offset_vectype
= offset_vectype
;
4036 info
->scale
= scale
;
4037 info
->element_type
= TREE_TYPE (vectype
);
4038 info
->memory_type
= memory_type
;
/* Find the data references in STMT, analyze them with respect to LOOP and
   append them to DATAREFS.  Return failure if the datarefs in this stmt
   cannot be handled.  */
4047 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4048 vec
<data_reference_p
> *datarefs
)
  /* We can ignore clobbers for dataref analysis - they are removed during
     loop vectorization and BB vectorization checks dependences with a
     stmt walk.  */
4053 if (gimple_clobber_p (stmt
))
4054 return opt_result::success ();
4056 if (gimple_has_volatile_ops (stmt
))
4057 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4060 if (stmt_can_throw_internal (cfun
, stmt
))
4061 return opt_result::failure_at (stmt
,
4063 " statement can throw an exception: %G",
4066 auto_vec
<data_reference_p
, 2> refs
;
4067 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4071 if (refs
.is_empty ())
4072 return opt_result::success ();
4074 if (refs
.length () > 1)
4075 return opt_result::failure_at (stmt
,
4077 " more than one data ref in stmt: %G", stmt
);
4079 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4080 if (!gimple_call_internal_p (call
)
4081 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4082 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4083 return opt_result::failure_at (stmt
,
4084 "not vectorized: dr in a call %G", stmt
);
4086 data_reference_p dr
= refs
.pop ();
4087 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4088 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4089 return opt_result::failure_at (stmt
,
4091 " statement is bitfield access %G", stmt
);
4093 if (DR_BASE_ADDRESS (dr
)
4094 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4095 return opt_result::failure_at (stmt
,
4097 " base addr of dr is a constant\n");
4099 /* Check whether this may be a SIMD lane access and adjust the
4100 DR to make it easier for us to handle it. */
4103 && (!DR_BASE_ADDRESS (dr
)
4108 struct data_reference
*newdr
4109 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4110 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4111 if (DR_BASE_ADDRESS (newdr
)
4112 && DR_OFFSET (newdr
)
4115 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4116 && integer_zerop (DR_STEP (newdr
)))
4118 tree base_address
= DR_BASE_ADDRESS (newdr
);
4119 tree off
= DR_OFFSET (newdr
);
4120 tree step
= ssize_int (1);
4121 if (integer_zerop (off
)
4122 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4124 off
= TREE_OPERAND (base_address
, 1);
4125 base_address
= TREE_OPERAND (base_address
, 0);
4128 if (TREE_CODE (off
) == MULT_EXPR
4129 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4131 step
= TREE_OPERAND (off
, 1);
4132 off
= TREE_OPERAND (off
, 0);
4135 if (CONVERT_EXPR_P (off
)
4136 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4137 < TYPE_PRECISION (TREE_TYPE (off
))))
4138 off
= TREE_OPERAND (off
, 0);
4139 if (TREE_CODE (off
) == SSA_NAME
)
4141 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4142 /* Look through widening conversion. */
4143 if (is_gimple_assign (def
)
4144 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4146 tree rhs1
= gimple_assign_rhs1 (def
);
4147 if (TREE_CODE (rhs1
) == SSA_NAME
4148 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4149 && (TYPE_PRECISION (TREE_TYPE (off
))
4150 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4151 def
= SSA_NAME_DEF_STMT (rhs1
);
4153 if (is_gimple_call (def
)
4154 && gimple_call_internal_p (def
)
4155 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4157 tree arg
= gimple_call_arg (def
, 0);
4158 tree reft
= TREE_TYPE (DR_REF (newdr
));
4159 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4160 arg
= SSA_NAME_VAR (arg
);
4161 if (arg
== loop
->simduid
4163 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4165 DR_BASE_ADDRESS (newdr
) = base_address
;
4166 DR_OFFSET (newdr
) = ssize_int (0);
4167 DR_STEP (newdr
) = step
;
4168 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4169 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4170 /* Mark as simd-lane access. */
4171 tree arg2
= gimple_call_arg (def
, 1);
4172 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4174 datarefs
->safe_push (newdr
);
4175 return opt_result::success ();
4180 free_data_ref (newdr
);
4183 datarefs
->safe_push (dr
);
4184 return opt_result::success ();
/* Function vect_analyze_data_refs.

   Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.

*/
4203 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4205 class loop
*loop
= NULL
;
4207 struct data_reference
*dr
;
4210 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4212 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4213 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4215 /* Go through the data-refs, check that the analysis succeeded. Update
4216 pointer from stmt_vec_info struct to DR and vectype. */
4218 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4219 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4221 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4224 gcc_assert (DR_REF (dr
));
4225 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4226 gcc_assert (!stmt_info
->dr_aux
.dr
);
4227 stmt_info
->dr_aux
.dr
= dr
;
4228 stmt_info
->dr_aux
.stmt
= stmt_info
;
4230 /* Check that analysis of the data-ref succeeded. */
4231 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4236 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4237 && (targetm
.vectorize
.builtin_gather
!= NULL
4238 || supports_vec_gather_load_p ());
4241 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4242 && (targetm
.vectorize
.builtin_scatter
!= NULL
4243 || supports_vec_scatter_store_p ());
	  /* If the target supports vector gather loads or scatter stores,
	     see whether they can be used for this access instead.  */
4247 if (is_a
<loop_vec_info
> (vinfo
)
4248 && !nested_in_vect_loop_p (loop
, stmt_info
))
4250 if (maybe_gather
|| maybe_scatter
)
4253 gatherscatter
= GATHER
;
4255 gatherscatter
= SCATTER
;
4259 if (gatherscatter
== SG_NONE
)
4261 if (dump_enabled_p ())
4262 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4263 "not vectorized: data ref analysis "
4264 "failed %G", stmt_info
->stmt
);
4265 if (is_a
<bb_vec_info
> (vinfo
))
4267 /* In BB vectorization the ref can still participate
4268 in dependence analysis, we just can't vectorize it. */
4269 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4272 return opt_result::failure_at (stmt_info
->stmt
,
4274 " data ref analysis failed: %G",
4279 /* See if this was detected as SIMD lane access. */
4280 if (dr
->aux
== (void *)-1
4281 || dr
->aux
== (void *)-2
4282 || dr
->aux
== (void *)-3
4283 || dr
->aux
== (void *)-4)
4285 if (nested_in_vect_loop_p (loop
, stmt_info
))
4286 return opt_result::failure_at (stmt_info
->stmt
,
4288 " data ref analysis failed: %G",
4290 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4291 = -(uintptr_t) dr
->aux
;
4294 tree base
= get_base_address (DR_REF (dr
));
4295 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4297 if (dump_enabled_p ())
4298 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4299 "not vectorized: base object not addressable "
4300 "for stmt: %G", stmt_info
->stmt
);
4301 if (is_a
<bb_vec_info
> (vinfo
))
4303 /* In BB vectorization the ref can still participate
4304 in dependence analysis, we just can't vectorize it. */
4305 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4308 return opt_result::failure_at (stmt_info
->stmt
,
4309 "not vectorized: base object not"
4310 " addressable for stmt: %G",
4314 if (is_a
<loop_vec_info
> (vinfo
)
4316 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4318 if (nested_in_vect_loop_p (loop
, stmt_info
))
4319 return opt_result::failure_at (stmt_info
->stmt
,
4321 "not suitable for strided load %G",
4323 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4326 /* Update DR field in stmt_vec_info struct. */
      /* If the dataref is in an inner-loop of the loop that is considered
	 for vectorization, we also want to analyze the access relative to
	 the outer-loop (DR contains information only relative to the
	 inner-most enclosing loop).  We do that by building a reference to
	 the first location accessed by the inner-loop, and analyzing it
	 relative to the outer-loop.  */
4334 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4336 /* Build a reference to the first location accessed by the
4337 inner loop: *(BASE + INIT + OFFSET). By construction,
4338 this address must be invariant in the inner loop, so we
4339 can consider it as being used in the outer loop. */
4340 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4341 tree offset
= unshare_expr (DR_OFFSET (dr
));
4342 tree init
= unshare_expr (DR_INIT (dr
));
4343 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4345 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4346 tree init_ref
= build_fold_indirect_ref (init_addr
);
4348 if (dump_enabled_p ())
4349 dump_printf_loc (MSG_NOTE
, vect_location
,
4350 "analyze in outer loop: %T\n", init_ref
);
4353 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4354 init_ref
, loop
, stmt_info
->stmt
);
4356 /* dr_analyze_innermost already explained the failure. */
4359 if (dump_enabled_p ())
4360 dump_printf_loc (MSG_NOTE
, vect_location
,
4361 "\touter base_address: %T\n"
4362 "\touter offset from base address: %T\n"
4363 "\touter constant offset from base address: %T\n"
4364 "\touter step: %T\n"
4365 "\touter base alignment: %d\n\n"
4366 "\touter base misalignment: %d\n"
4367 "\touter offset alignment: %d\n"
4368 "\touter step alignment: %d\n",
4369 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4370 STMT_VINFO_DR_OFFSET (stmt_info
),
4371 STMT_VINFO_DR_INIT (stmt_info
),
4372 STMT_VINFO_DR_STEP (stmt_info
),
4373 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4374 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4375 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4376 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4379 /* Set vectype for STMT. */
4380 scalar_type
= TREE_TYPE (DR_REF (dr
));
4381 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4384 if (dump_enabled_p ())
4386 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4387 "not vectorized: no vectype for stmt: %G",
4389 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4390 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4392 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4395 if (is_a
<bb_vec_info
> (vinfo
))
4397 /* No vector type is fine, the ref can still participate
4398 in dependence analysis, we just can't vectorize it. */
4399 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4404 return opt_result::failure_at (stmt_info
->stmt
,
4406 " no vectype for stmt: %G"
4407 " scalar_type: %T\n",
4408 stmt_info
->stmt
, scalar_type
);
4412 if (dump_enabled_p ())
4413 dump_printf_loc (MSG_NOTE
, vect_location
,
4414 "got vectype for stmt: %G%T\n",
4415 stmt_info
->stmt
, vectype
);
      /* Adjust the minimal vectorization factor according to the
	 vector type.  */
4420 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4421 *min_vf
= upper_bound (*min_vf
, vf
);
4423 /* Leave the BB vectorizer to pick the vector type later, based on
4424 the final dataref group size and SLP node size. */
4425 if (is_a
<loop_vec_info
> (vinfo
))
4426 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
4428 if (gatherscatter
!= SG_NONE
)
4430 gather_scatter_info gs_info
;
4431 if (!vect_check_gather_scatter (stmt_info
,
4432 as_a
<loop_vec_info
> (vinfo
),
4434 || !get_vectype_for_scalar_type (vinfo
,
4435 TREE_TYPE (gs_info
.offset
)))
4439 return opt_result::failure_at
4441 (gatherscatter
== GATHER
)
4442 ? "not vectorized: not suitable for gather load %G"
4443 : "not vectorized: not suitable for scatter store %G",
4446 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
  /* We used to stop processing and prune the list here.  Verify we no
     longer need to.  */
== datarefs
.length ());
4454 return opt_result::success ();
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */
4466 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4473 case vect_simple_var
:
4476 case vect_scalar_var
:
4482 case vect_pointer_var
:
4491 char* tmp
= concat (prefix
, "_", name
, NULL
);
4492 new_vect_var
= create_tmp_reg (type
, tmp
);
4496 new_vect_var
= create_tmp_reg (type
, prefix
);
4498 return new_vect_var
;
4501 /* Like vect_get_new_vect_var but return an SSA name. */
4504 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4511 case vect_simple_var
:
4514 case vect_scalar_var
:
4517 case vect_pointer_var
:
4526 char* tmp
= concat (prefix
, "_", name
, NULL
);
4527 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4531 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4533 return new_vect_var
;
/* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO.  */
4539 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
4541 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
4542 int misalign
= DR_MISALIGNMENT (dr_info
);
4543 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
4544 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4546 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
),
4547 known_alignment (DR_TARGET_ALIGNMENT (dr_info
)),
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT_INFO: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
	    For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i=0; i<N; i++)
		   for (j=0; j<M; j++)
		     s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop:	&in+i*2B	(relative to j_loop)
   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is a number of elements to
	    be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
4586 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4587 gimple_seq
*new_stmt_list
,
4591 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4592 struct data_reference
*dr
= dr_info
->dr
;
4593 const char *base_name
;
4596 gimple_seq seq
= NULL
;
4598 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4599 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4600 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
4602 tree data_ref_base
= unshare_expr (drb
->base_address
);
4603 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
4604 tree init
= unshare_expr (drb
->init
);
4607 base_name
= get_name (data_ref_base
);
4610 base_offset
= ssize_int (0);
4611 init
= ssize_int (0);
4612 base_name
= get_name (DR_REF (dr
));
4615 /* Create base_offset */
4616 base_offset
= size_binop (PLUS_EXPR
,
4617 fold_convert (sizetype
, base_offset
),
4618 fold_convert (sizetype
, init
));
4622 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4623 fold_convert (sizetype
, offset
), step
);
4624 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4625 base_offset
, offset
);
4629 byte_offset
= fold_convert (sizetype
, byte_offset
);
4630 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4631 base_offset
, byte_offset
);
4634 /* base + base_offset */
4636 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4639 addr_base
= build1 (ADDR_EXPR
,
4640 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4641 unshare_expr (DR_REF (dr
)));
4644 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4645 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4646 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4647 gimple_seq_add_seq (new_stmt_list
, seq
);
4649 if (DR_PTR_INFO (dr
)
4650 && TREE_CODE (addr_base
) == SSA_NAME
4651 && !SSA_NAME_PTR_INFO (addr_base
))
4653 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
4654 if (offset
|| byte_offset
)
4655 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);

  return addr_base;
}
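
/* Illustrative sketch (not part of the vectorizer sources): the address
   built above is base_address + offset-from-base + init, with the optional
   OFFSET scaled by the element size and BYTE_OFFSET added as-is.  A plain-C
   model of that arithmetic, with hypothetical names:  */

static char *
example_addr_base (char *base_address, long base_offset_bytes,
		   long init_bytes, long offset_elements,
		   long element_size, long byte_offset)
{
  long total = base_offset_bytes + init_bytes
	       + offset_elements * element_size + byte_offset;
  return base_address + total;
}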
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT_INFO, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT_INFO: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
	 GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT_INFO.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
	to the initial address accessed by the data-ref in STMT_INFO.  This is
	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
	in bytes.
   8. IV_STEP (optional, defaults to NULL): the amount that should be added
	to the IV during each iteration of the loop.  NULL says to move
	by one copy of AGGR_TYPE up or down, depending on the step of the
	data reference.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial addressed accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
	 initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
4720 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4721 tree aggr_type
, class loop
*at_loop
, tree offset
,
4722 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
4723 gimple
**ptr_incr
, bool only_init
,
4724 tree byte_offset
, tree iv_step
)
4726 const char *base_name
;
4727 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4728 class loop
*loop
= NULL
;
4729 bool nested_in_vect_loop
= false;
4730 class loop
*containing_loop
= NULL
;
4734 gimple_seq new_stmt_list
= NULL
;
4738 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4739 struct data_reference
*dr
= dr_info
->dr
;
4741 gimple_stmt_iterator incr_gsi
;
4743 tree indx_before_incr
, indx_after_incr
;
4745 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
4747 gcc_assert (iv_step
!= NULL_TREE
4748 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4749 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4753 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4754 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
4755 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
4756 pe
= loop_preheader_edge (loop
);
4760 gcc_assert (bb_vinfo
);
4765 /* Create an expression for the first address accessed by this load
4767 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4769 if (dump_enabled_p ())
4771 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4772 dump_printf_loc (MSG_NOTE
, vect_location
,
4773 "create %s-pointer variable to type: %T",
4774 get_tree_code_name (TREE_CODE (aggr_type
)),
4776 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4777 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4778 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4779 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4780 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4781 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4783 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4784 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
4787 /* (1) Create the new aggregate-pointer variable.
4788 Vector and array types inherit the alias set of their component
4789 type by default so we need to use a ref-all pointer if the data
4790 reference does not conflict with the created aggregated data
4791 reference because it is not addressable. */
4792 bool need_ref_all
= false;
4793 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4794 get_alias_set (DR_REF (dr
))))
4795 need_ref_all
= true;
4796 /* Likewise for any of the data references in the stmt group. */
4797 else if (DR_GROUP_SIZE (stmt_info
) > 1)
4799 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4802 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4803 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4804 get_alias_set (DR_REF (sdr
))))
4806 need_ref_all
= true;
4809 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
4813 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4815 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4818 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4819 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4820 def-use update cycles for the pointer: one relative to the outer-loop
4821 (LOOP), which is what steps (3) and (4) below do. The other is relative
4822 to the inner-loop (which is the inner-most loop containing the dataref),
4823 and this is done be step (5) below.
4825 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4826 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4827 redundant. Steps (3),(4) create the following:
4830 LOOP: vp1 = phi(vp0,vp2)
4836 If there is an inner-loop nested in loop, then step (5) will also be
4837 applied, and an additional update in the inner-loop will be created:
4840 LOOP: vp1 = phi(vp0,vp2)
4842 inner: vp3 = phi(vp1,vp4)
4843 vp4 = vp3 + inner_step
4849 /* (2) Calculate the initial address of the aggregate-pointer, and set
4850 the aggregate-pointer to point to it before the loop. */
4852 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4854 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
4855 stmt_info
, &new_stmt_list
,
4856 offset
, byte_offset
);
4861 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4862 gcc_assert (!new_bb
);
4865 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4868 *initial_address
= new_temp
;
4869 aggr_ptr_init
= new_temp
;
4871 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4872 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4873 inner-loop nested in LOOP (during outer-loop vectorization). */
4875 /* No update in loop is required. */
4876 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4877 aptr
= aggr_ptr_init
;
4880 /* Accesses to invariant addresses should be handled specially
4882 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
4883 gcc_assert (!integer_zerop (step
));
4885 if (iv_step
== NULL_TREE
)
4887 /* The step of the aggregate pointer is the type size,
4888 negated for downward accesses. */
4889 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4890 if (tree_int_cst_sgn (step
) == -1)
4891 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4894 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4896 create_iv (aggr_ptr_init
,
4897 fold_convert (aggr_ptr_type
, iv_step
),
4898 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4899 &indx_before_incr
, &indx_after_incr
);
4900 incr
= gsi_stmt (incr_gsi
);
4901 loop_vinfo
->add_stmt (incr
);
4903 /* Copy the points-to information if it exists. */
4904 if (DR_PTR_INFO (dr
))
4906 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4907 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
4912 aptr
= indx_before_incr
;
4915 if (!nested_in_vect_loop
|| only_init
)
4919 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4920 nested in LOOP, if exists. */
4922 gcc_assert (nested_in_vect_loop
);
4925 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4927 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4928 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4930 incr
= gsi_stmt (incr_gsi
);
4931 loop_vinfo
->add_stmt (incr
);
4933 /* Copy the points-to information if it exists. */
4934 if (DR_PTR_INFO (dr
))
4936 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4937 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
  return indx_before_incr;
}
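
/* Illustrative sketch (not part of the vectorizer sources): in scalar terms
   the pointer induction variable created above follows the pattern
       vp = initial_address;  loop { use (vp); vp += iv_step; }
   A plain-C model of one such walk over a buffer; the use_vector callback
   and all names are hypothetical:  */

static void
example_data_ref_ptr_walk (char *initial_address, long iv_step, int niters,
			   void (*use_vector) (char *))
{
  char *vp = initial_address;	/* (2) initial address before the loop      */
  for (int i = 0; i < niters; i++)
    {
      use_vector (vp);		/* the vector load/store through the pointer */
      vp += iv_step;		/* (3) per-iteration pointer update          */
    }
}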
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
	      the loop.  The increment amount across iterations is expected
	      to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT_INFO - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
	  the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
4984 bump_vector_ptr (vec_info
*vinfo
,
4985 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4986 stmt_vec_info stmt_info
, tree bump
)
4988 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4989 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4990 tree update
= TYPE_SIZE_UNIT (vectype
);
4993 use_operand_p use_p
;
4994 tree new_dataref_ptr
;
4999 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5000 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5002 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
5003 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5004 dataref_ptr
, update
);
5005 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
5007 /* Copy the points-to information if it exists. */
5008 if (DR_PTR_INFO (dr
))
5010 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5011 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5015 return new_dataref_ptr
;
5017 /* Update the vector-pointer's cross-iteration increment. */
5018 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5020 tree use
= USE_FROM_PTR (use_p
);
5022 if (use
== dataref_ptr
)
5023 SET_USE (use_p
, new_dataref_ptr
);
5025 gcc_assert (operand_equal_p (use
, update
, 0));
5028 return new_dataref_ptr
;
5032 /* Copy memory reference info such as base/clique from the SRC reference
5033 to the DEST MEM_REF. */
5036 vect_copy_ref_info (tree dest
, tree src
)
5038 if (TREE_CODE (dest
) != MEM_REF
)
5041 tree src_base
= src
;
5042 while (handled_component_p (src_base
))
5043 src_base
= TREE_OPERAND (src_base
, 0);
5044 if (TREE_CODE (src_base
) != MEM_REF
5045 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
5048 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
5049 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
5053 /* Function vect_create_destination_var.
5055 Create a new temporary of type VECTYPE. */
5058 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5064 enum vect_var_kind kind
;
5067 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5071 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5073 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5075 name
= get_name (scalar_dest
);
5077 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5079 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5080 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5086 /* Function vect_grouped_store_supported.
5088 Returns TRUE if interleave high and interleave low permutations
5089 are supported, and FALSE otherwise. */
5092 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5094 machine_mode mode
= TYPE_MODE (vectype
);
5096 /* vect_permute_store_chain requires the group size to be equal to 3 or
5097 be a power of two. */
5098 if (count
!= 3 && exact_log2 (count
) == -1)
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5102 "the size of the group of accesses"
5103 " is not a power of 2 or not eqaul to 3\n");
5107 /* Check that the permutation is supported. */
5108 if (VECTOR_MODE_P (mode
))
5113 unsigned int j0
= 0, j1
= 0, j2
= 0;
5117 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5119 if (dump_enabled_p ())
5120 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5121 "cannot handle groups of 3 stores for"
5122 " variable-length vectors\n");
5126 vec_perm_builder
sel (nelt
, nelt
, 1);
5127 sel
.quick_grow (nelt
);
5128 vec_perm_indices indices
;
5129 for (j
= 0; j
< 3; j
++)
5131 int nelt0
= ((3 - j
) * nelt
) % 3;
5132 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5133 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5134 for (i
= 0; i
< nelt
; i
++)
5136 if (3 * i
+ nelt0
< nelt
)
5137 sel
[3 * i
+ nelt0
] = j0
++;
5138 if (3 * i
+ nelt1
< nelt
)
5139 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5140 if (3 * i
+ nelt2
< nelt
)
5141 sel
[3 * i
+ nelt2
] = 0;
5143 indices
.new_vector (sel
, 2, nelt
);
5144 if (!can_vec_perm_const_p (mode
, indices
))
5146 if (dump_enabled_p ())
5147 dump_printf (MSG_MISSED_OPTIMIZATION
,
5148 "permutation op not supported by target.\n");
5152 for (i
= 0; i
< nelt
; i
++)
5154 if (3 * i
+ nelt0
< nelt
)
5155 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5156 if (3 * i
+ nelt1
< nelt
)
5157 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5158 if (3 * i
+ nelt2
< nelt
)
5159 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5161 indices
.new_vector (sel
, 2, nelt
);
5162 if (!can_vec_perm_const_p (mode
, indices
))
5164 if (dump_enabled_p ())
5165 dump_printf (MSG_MISSED_OPTIMIZATION
,
5166 "permutation op not supported by target.\n");
5174 /* If length is not equal to 3 then only power of 2 is supported. */
5175 gcc_assert (pow2p_hwi (count
));
5176 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5178 /* The encoding has 2 interleaved stepped patterns. */
5179 vec_perm_builder
sel (nelt
, 2, 3);
5181 for (i
= 0; i
< 3; i
++)
5184 sel
[i
* 2 + 1] = i
+ nelt
;
5186 vec_perm_indices
indices (sel
, 2, nelt
);
5187 if (can_vec_perm_const_p (mode
, indices
))
5189 for (i
= 0; i
< 6; i
++)
5190 sel
[i
] += exact_div (nelt
, 2);
5191 indices
.new_vector (sel
, 2, nelt
);
5192 if (can_vec_perm_const_p (mode
, indices
))
5198 if (dump_enabled_p ())
5199 dump_printf (MSG_MISSED_OPTIMIZATION
,
5200 "permutation op not supported by target.\n");
5205 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5206 type VECTYPE. MASKED_P says whether the masked form is needed. */
5209 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5213 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5214 vec_mask_store_lanes_optab
,
5217 return vect_lanes_optab_supported_p ("vec_store_lanes",
5218 vec_store_lanes_optab
,
5223 /* Function vect_permute_store_chain.
5225 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5226 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5227 the data correctly for the stores. Return the final references for stores
5230 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5231 The input is 4 vectors each containing 8 elements. We assign a number to
5232 each element, the input sequence is:
5234 1st vec: 0 1 2 3 4 5 6 7
5235 2nd vec: 8 9 10 11 12 13 14 15
5236 3rd vec: 16 17 18 19 20 21 22 23
5237 4th vec: 24 25 26 27 28 29 30 31
5239 The output sequence should be:
   1st vec:  0  8 16 24  1  9 17 25
   2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
   4th vec:  6 14 22 30  7 15 23 31
5246 i.e., we interleave the contents of the four vectors in their order.
5248 We use interleave_high/low instructions to create such output. The input of
5249 each interleave_high/low operation is two vectors:
5252 the even elements of the result vector are obtained left-to-right from the
5253 high/low elements of the first vector. The odd elements of the result are
5254 obtained left-to-right from the high/low elements of the second vector.
5255 The output of interleave_high will be: 0 4 1 5
5256 and of interleave_low: 2 6 3 7
5259 The permutation is done in log LENGTH stages. In each stage interleave_high
5260 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5261 where the first argument is taken from the first half of DR_CHAIN and the
5262 second argument from it's second half.
5265 I1: interleave_high (1st vec, 3rd vec)
5266 I2: interleave_low (1st vec, 3rd vec)
5267 I3: interleave_high (2nd vec, 4th vec)
5268 I4: interleave_low (2nd vec, 4th vec)
5270 The output for the first stage is:
5272 I1: 0 16 1 17 2 18 3 19
5273 I2: 4 20 5 21 6 22 7 23
5274 I3: 8 24 9 25 10 26 11 27
5275 I4: 12 28 13 29 14 30 15 31
5277 The output of the second stage, i.e. the final result is:
        I1:  0  8 16 24  1  9 17 25
        I2:  2 10 18 26  3 11 19 27
        I3:  4 12 20 28  5 13 21 29
        I4:  6 14 22 30  7 15 23 31.  */
5285 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
5286 unsigned int length
,
5287 stmt_vec_info stmt_info
,
5288 gimple_stmt_iterator
*gsi
,
5289 vec
<tree
> *result_chain
)
5291 tree vect1
, vect2
, high
, low
;
5293 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5294 tree perm_mask_low
, perm_mask_high
;
5296 tree perm3_mask_low
, perm3_mask_high
;
5297 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5299 result_chain
->quick_grow (length
);
5300 memcpy (result_chain
->address (), dr_chain
.address (),
5301 length
* sizeof (tree
));
5305 /* vect_grouped_store_supported ensures that this is constant. */
5306 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5307 unsigned int j0
= 0, j1
= 0, j2
= 0;
5309 vec_perm_builder
sel (nelt
, nelt
, 1);
5310 sel
.quick_grow (nelt
);
5311 vec_perm_indices indices
;
5312 for (j
= 0; j
< 3; j
++)
5314 int nelt0
= ((3 - j
) * nelt
) % 3;
5315 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5316 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5318 for (i
= 0; i
< nelt
; i
++)
5320 if (3 * i
+ nelt0
< nelt
)
5321 sel
[3 * i
+ nelt0
] = j0
++;
5322 if (3 * i
+ nelt1
< nelt
)
5323 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5324 if (3 * i
+ nelt2
< nelt
)
5325 sel
[3 * i
+ nelt2
] = 0;
5327 indices
.new_vector (sel
, 2, nelt
);
5328 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5330 for (i
= 0; i
< nelt
; i
++)
5332 if (3 * i
+ nelt0
< nelt
)
5333 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5334 if (3 * i
+ nelt1
< nelt
)
5335 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5336 if (3 * i
+ nelt2
< nelt
)
5337 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5339 indices
.new_vector (sel
, 2, nelt
);
5340 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5342 vect1
= dr_chain
[0];
5343 vect2
= dr_chain
[1];
5345 /* Create interleaving stmt:
5346 low = VEC_PERM_EXPR <vect1, vect2,
5347 {j, nelt, *, j + 1, nelt + j + 1, *,
5348 j + 2, nelt + j + 2, *, ...}> */
5349 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5350 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5351 vect2
, perm3_mask_low
);
5352 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5355 vect2
= dr_chain
[2];
5356 /* Create interleaving stmt:
5357 low = VEC_PERM_EXPR <vect1, vect2,
5358 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5359 6, 7, nelt + j + 2, ...}> */
5360 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5361 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5362 vect2
, perm3_mask_high
);
5363 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5364 (*result_chain
)[j
] = data_ref
;
5369 /* If length is not equal to 3 then only power of 2 is supported. */
5370 gcc_assert (pow2p_hwi (length
));
5372 /* The encoding has 2 interleaved stepped patterns. */
5373 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5374 vec_perm_builder
sel (nelt
, 2, 3);
5376 for (i
= 0; i
< 3; i
++)
5379 sel
[i
* 2 + 1] = i
+ nelt
;
5381 vec_perm_indices
indices (sel
, 2, nelt
);
5382 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5384 for (i
= 0; i
< 6; i
++)
5385 sel
[i
] += exact_div (nelt
, 2);
5386 indices
.new_vector (sel
, 2, nelt
);
5387 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5389 for (i
= 0, n
= log_length
; i
< n
; i
++)
5391 for (j
= 0; j
< length
/2; j
++)
5393 vect1
= dr_chain
[j
];
5394 vect2
= dr_chain
[j
+length
/2];
5396 /* Create interleaving stmt:
5397 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5399 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5400 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5401 vect2
, perm_mask_high
);
5402 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5403 (*result_chain
)[2*j
] = high
;
5405 /* Create interleaving stmt:
5406 low = VEC_PERM_EXPR <vect1, vect2,
5407 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5409 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5410 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5411 vect2
, perm_mask_low
);
5412 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
	      (*result_chain)[2*j+1] = low;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
5421 /* Function vect_setup_realignment
5423 This function is called when vectorizing an unaligned load using
5424 the dr_explicit_realign[_optimized] scheme.
5425 This function generates the following code at the loop prolog:
5428 x msq_init = *(floor(p)); # prolog load
5429 realignment_token = call target_builtin;
5431 x msq = phi (msq_init, ---)
5433 The stmts marked with x are generated only for the case of
5434 dr_explicit_realign_optimized.
5436 The code above sets up a new (vector) pointer, pointing to the first
5437 location accessed by STMT_INFO, and a "floor-aligned" load using that
5438 pointer. It also generates code to compute the "realignment-token"
5439 (if the relevant target hook was defined), and creates a phi-node at the
5440 loop-header bb whose arguments are the result of the prolog-load (created
5441 by this function) and the result of a load that takes place in the loop
5442 (to be created by the caller to this function).
5444 For the case of dr_explicit_realign_optimized:
5445 The caller to this function uses the phi-result (msq) to create the
5446 realignment code inside the loop, and sets up the missing phi argument,
5449 msq = phi (msq_init, lsq)
5450 lsq = *(floor(p')); # load in loop
5451 result = realign_load (msq, lsq, realignment_token);
5453 For the case of dr_explicit_realign:
5455 msq = *(floor(p)); # load in loop
5457 lsq = *(floor(p')); # load in loop
5458 result = realign_load (msq, lsq, realignment_token);
5461 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5462 a memory location that may be unaligned.
5463 BSI - place where new code is to be inserted.
5464 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5468 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5469 target hook, if defined.
5470 Return value - the result of the loop-header phi node. */
5473 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5474 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
5475 enum dr_alignment_support alignment_support_scheme
,
5477 class loop
**at_loop
)
5479 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5480 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5481 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5482 struct data_reference
*dr
= dr_info
->dr
;
5483 class loop
*loop
= NULL
;
5485 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
5491 tree msq_init
= NULL_TREE
;
5494 tree msq
= NULL_TREE
;
5495 gimple_seq stmts
= NULL
;
5496 bool compute_in_loop
= false;
5497 bool nested_in_vect_loop
= false;
5498 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5499 class loop
*loop_for_initial_load
= NULL
;
5503 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5504 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5507 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5508 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5510 /* We need to generate three things:
5511 1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }

  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
        (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
                                      loop_for_initial_load, NULL_TREE,
                                      &init_addr, NULL, &inc, true);
      if (TREE_CODE (ptr) == SSA_NAME)
        new_temp = copy_ssa_name (ptr);
      else
        new_temp = make_ssa_name (TREE_TYPE (ptr));
      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
      tree type = TREE_TYPE (ptr);
      new_stmt = gimple_build_assign
                   (new_temp, BIT_AND_EXPR, ptr,
                    fold_build2 (MINUS_EXPR, type,
                                 build_int_cst (type, 0),
                                 build_int_cst (type, align)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
        = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
                  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
        {
          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
          gcc_assert (!new_bb);
        }
      else
        gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
        {
          /* Generate the INIT_ADDR computation outside LOOP.  */
          init_addr = vect_create_addr_base_for_vector_ref (vinfo,
                                                            stmt_info, &stmts,
                                                            NULL_TREE);
          if (loop)
            {
              pe = loop_preheader_edge (loop);
              new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
              gcc_assert (!new_bb);
            }
          else
            gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
        }

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
        vect_create_destination_var (scalar_dest,
                                     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
        gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
        {
          /* Generate the misalignment computation outside LOOP.  */
          pe = loop_preheader_edge (loop);
          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
          gcc_assert (!new_bb);
        }

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
         the value of the parameter and no global variables are touched
         which makes the builtin a "const" function.  Requiring the
         builtin to have the "const" attribute makes it unnecessary
         to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);

  /* 5. Create msq = phi <msq_init, lsq> in loop.  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
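
/* Illustrative sketch, not part of the vectorizer: the msq_init load above
   floor-aligns the data pointer with a BIT_AND_EXPR against (0 - align).
   The hypothetical helper below shows the same computation on a plain
   integer address; the name and the use of unsigned long long are
   assumptions made purely for exposition.  */

static inline unsigned long long
example_floor_align_address (unsigned long long addr, unsigned long long align)
{
  /* ALIGN is assumed to be a power of two; clearing the low bits rounds
     ADDR down to the enclosing aligned chunk, and the cleared bits are the
     misalignment that the realignment token encodes.  */
  return addr & (0 - align);
}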
/* Function vect_grouped_load_supported.

   COUNT is the size of the load group (the number of statements plus the
   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
   only one statement, with a gap of COUNT - 1.

   Returns true if a suitable permute exists.  */

bool
vect_grouped_load_supported (tree vectype, bool single_element_p,
                             unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory).  */
  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "single-element interleaving not supported "
                         "for not adjacent vector loads\n");
      return false;
    }

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "the size of the group of accesses"
                         " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j, k;
      if (count == 3)
        {
          unsigned int nelt;
          if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "cannot handle groups of 3 loads for"
                                 " variable-length vectors\n");
              return false;
            }

          vec_perm_builder sel (nelt, nelt, 1);
          sel.quick_grow (nelt);
          vec_perm_indices indices;
          for (k = 0; k < 3; k++)
            {
              for (i = 0; i < nelt; i++)
                if (3 * i + k < 2 * nelt)
                  sel[i] = 3 * i + k;
                else
                  sel[i] = 0;
              indices.new_vector (sel, 2, nelt);
              if (!can_vec_perm_const_p (mode, indices))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "shuffle of 3 loads is not supported by"
                                     " target\n");
                  return false;
                }
              for (i = 0, j = 0; i < nelt; i++)
                if (3 * i + k < 2 * nelt)
                  sel[i] = i;
                else
                  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
              indices.new_vector (sel, 2, nelt);
              if (!can_vec_perm_const_p (mode, indices))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "shuffle of 3 loads is not supported by"
                                     " target\n");
                  return false;
                }
            }
          return true;
        }
      else
        {
          /* If length is not equal to 3 then only power of 2 is supported.  */
          gcc_assert (pow2p_hwi (count));
          poly_uint64 nelt = GET_MODE_NUNITS (mode);

          /* The encoding has a single stepped pattern.  */
          vec_perm_builder sel (nelt, 1, 3);
          sel.quick_grow (3);
          for (i = 0; i < 3; i++)
            sel[i] = i * 2;
          vec_perm_indices indices (sel, 2, nelt);
          if (can_vec_perm_const_p (mode, indices))
            {
              for (i = 0; i < 3; i++)
                sel[i] = i * 2 + 1;
              indices.new_vector (sel, 2, nelt);
              if (can_vec_perm_const_p (mode, indices))
                return true;
            }
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "extract even/odd not supported by target\n");
  return false;
}
/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
   type VECTYPE.  MASKED_P says whether the masked form is needed.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
                           bool masked_p)
{
  if (masked_p)
    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
                                         vec_mask_load_lanes_optab,
                                         vectype, count);
  else
    return vect_lanes_optab_supported_p ("vec_load_lanes",
                                         vec_load_lanes_optab,
                                         vectype, count);
}
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:  0 2 4 6
   and of extract_odd:    1 3 5 7

   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */

static void
vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
                         unsigned int length,
                         stmt_vec_info stmt_info,
                         gimple_stmt_iterator *gsi,
                         vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int i, j, log_length = exact_log2 (length);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (k = 0; k < 3; k++)
        {
          for (i = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = 3 * i + k;
            else
              sel[i] = 0;
          indices.new_vector (sel, 2, nelt);
          perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

          for (i = 0, j = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = i;
            else
              sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
          indices.new_vector (sel, 2, nelt);
          perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

          first_vect = dr_chain[0];
          second_vect = dr_chain[1];

          /* Create interleaving stmt (low part of):
             low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
                                                             ...}>  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_low);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);

          /* Create interleaving stmt (high part of):
             high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
                                                              ...}>  */
          first_vect = data_ref;
          second_vect = dr_chain[2];
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_high);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          (*result_chain)[k] = data_ref;
        }
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      for (i = 0; i < 3; ++i)
        sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
        sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j+1];

              /* data_ref = permute_even (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_even);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2] = data_ref;

              /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_odd);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2+length/2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
    }
}
/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get correct elements order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift vector 3 times:

   shift right by 6 the concatenation of:
   "1st vec" and  "2nd vec"
     ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
   "2nd vec" and  "3rd vec"
     ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
   "3rd vec" and  "1st vec"
     (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)

   So that now new vectors are:

   1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
   2nd vec:  (10 13) (16 19 22) (17 20 23)
   3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   shift right by 5 the concatenation of:
   "1st vec" and  "3rd vec"
     ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
   "2nd vec" and  "1st vec"
     (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
   "3rd vec" and  "2nd vec"
     (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)

   So that now new vectors are:

   1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
   2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY

   shift right by 5 the concatenation of:
   "1st vec" and  "1st vec"
     ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
   shift right by 3 the concatenation of:
   "2nd vec" and  "2nd vec"
     (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)

   So that now all vectors are READY:

   1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
   2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:

   1.  "shift of a concatenation" is faster than general permutation.
       This is usually so.
   2.  The TARGET machine can't execute vector instructions in parallel.
       This is because each step of the algorithm depends on the previous
       one.  The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */

static bool
vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
                               unsigned int length,
                               stmt_vec_info stmt_info,
                               gimple_stmt_iterator *gsi,
                               vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int i;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);

  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.  */
    return false;

  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not \
                              supported by target\n");
          return false;
        }
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not \
                              supported by target\n");
          return false;
        }
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
         For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
        sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
        sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "select is not supported by target\n");
          return false;
        }
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j + 1];

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, first_vect,
                                               perm2_mask1);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              vect[0] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               second_vect, second_vect,
                                               perm2_mask2);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              vect[1] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], shift1_mask);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2 + length/2] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], select_mask);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
      return true;
    }
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
         For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
        {
          if (3 * k + (l % 3) >= nelt)
            {
              k = 0;
              l += (3 - (nelt % 3));
            }
          sel[i] = 3 * k + (l % 3);
          k++;
        }
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 3 fields structure is not \
                              supported by target\n");
          return false;
        }
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
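
/* Illustrative sketch, hypothetical and for exposition only: the perm3
   selector built at the top of the length == 3 path above, computed into a
   plain array.  For nelt == 8 it reproduces the {0 3 6 1 4 7 2 5} constant
   quoted in the comment.  */

static void
example_build_perm3_sel (unsigned int nelt, unsigned int *sel)
{
  unsigned int k = 0, l = 0;
  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Restart from the beginning of the vector whenever the next
         "every third element" index would fall outside it.  */
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += (3 - (nelt % 3));
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
}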
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
                             vec<tree> dr_chain,
                             int size, gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
     vectors, that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get a chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
                                         gsi, &result_chain))
    vect_permute_load_chain (vinfo, dr_chain,
                             size, stmt_info, gsi, &result_chain);
  vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
  result_chain.release ();
}
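
/* Illustrative sketch, hypothetical: the dispatch rule used above, with the
   reassociation-width target hook and the result of trying the shift-based
   chain passed in as plain values.  Returns true when the extract even/odd
   chain of vect_permute_load_chain should be used.  */

static bool
example_prefer_permute_load_chain (int reassoc_width, unsigned int size,
                                   bool shift_chain_generated)
{
  /* A wide reassociation window means the independent permutes can issue
     in parallel, so the more parallel even/odd chain wins; it is also the
     fallback whenever the shift-based chain was not generated.  */
  bool pow2_size = (size & (size - 1)) == 0;
  return reassoc_width > 1 || pow2_size || !shift_chain_generated;
}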
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT_INFO.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (vec_info *vinfo, stmt_vec_info stmt_info,
                                  vec<tree> result_chain)
{
  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  stmt_vec_info next_stmt_info = first_stmt_info;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt_info)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
         code elimination pass later.  No need to check for the first stmt in
         the group, since it always exists.
         DR_GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt_info != first_stmt_info
          && gap_count < DR_GROUP_GAP (next_stmt_info))
        {
          gap_count++;
          continue;
        }

      /* ???  The following needs cleanup after the removal of
         DR_GROUP_SAME_DR_STMT.  */
      if (next_stmt_info)
        {
          stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (next_stmt_info))
            STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
          else
            {
              stmt_vec_info prev_stmt_info
                = STMT_VINFO_VEC_STMT (next_stmt_info);
              stmt_vec_info rel_stmt_info
                = STMT_VINFO_RELATED_STMT (prev_stmt_info);
              while (rel_stmt_info)
                {
                  prev_stmt_info = rel_stmt_info;
                  rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
                }

              STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
            }

          next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
          gap_count = 1;
        }
    }
}
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (known_le (alignment,
                      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
  else
    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
}
/* Return whether the data reference DR_INFO is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
                               bool check_aligned_accesses)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr_info) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
        && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
            || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++){
        for (j=0; j<M; j++){
          r = a[i+j];
          ... = r;
        }
      }

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4){
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }
      }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                              * (DR_GROUP_SIZE
                                 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
                              TYPE_VECTOR_SUBPARTS (vectype)))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
                                    GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr_info))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr_info))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
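
/* Illustrative sketch, hypothetical and for exposition only: the
   misalignment behaviour described in the comment above.  For an access
   vp[i + j] with element size ESIZE and vector size VS bytes, the value
   below is the misalignment in elements; it stays fixed across inner-loop
   iterations only when the inner-loop step (in bytes) is a multiple of VS.  */

static unsigned int
example_inner_loop_misalignment (unsigned long long base_addr,
                                 unsigned int esize, unsigned int vs,
                                 unsigned int i, unsigned int j)
{
  /* Address of vp[i + j], then its offset within the enclosing VS-sized
     chunk, expressed in elements.  */
  unsigned long long addr = base_addr + (unsigned long long) (i + j) * esize;
  return (unsigned int) ((addr % vs) / esize);
}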