/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
62 tree vectype
, unsigned HOST_WIDE_INT count
)
64 machine_mode mode
, array_mode
;
67 mode
= TYPE_MODE (vectype
);
68 if (!targetm
.array_mode (mode
, count
).exists (&array_mode
))
70 poly_uint64 bits
= count
* GET_MODE_BITSIZE (mode
);
71 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
72 if (!int_mode_for_size (bits
, limit_p
).exists (&array_mode
))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode
), count
);
82 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
86 "cannot use %s<%s><%s>\n", name
,
87 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE
, vect_location
,
93 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
94 GET_MODE_NAME (mode
));
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info
,
119 HOST_WIDE_INT
*lhs_size_unit
,
120 HOST_WIDE_INT
*rhs_size_unit
)
122 tree scalar_type
= gimple_expr_type (stmt_info
->stmt
);
123 HOST_WIDE_INT lhs
, rhs
;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
130 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
132 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
134 && (gimple_assign_cast_p (assign
)
135 || gimple_assign_rhs_code (assign
) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign
) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign
) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign
) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign
) == FLOAT_EXPR
))
141 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
143 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
145 scalar_type
= rhs_type
;
147 else if (gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
150 if (gimple_call_internal_p (call
))
152 internal_fn ifn
= gimple_call_internal_fn (call
);
153 if (internal_load_fn_p (ifn
) || internal_store_fn_p (ifn
))
154 /* gimple_expr_type already picked the type of the loaded
157 else if (internal_fn_mask_index (ifn
) == 0)
160 if (i
< gimple_call_num_args (call
))
162 tree rhs_type
= TREE_TYPE (gimple_call_arg (call
, i
));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type
)))
165 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
167 scalar_type
= rhs_type
;
172 *lhs_size_unit
= lhs
;
173 *rhs_size_unit
= rhs
;
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
183 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
185 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
187 if ((unsigned) param_vect_max_version_for_alias_checks
== 0)
188 return opt_result::failure_at (vect_location
,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
194 = runtime_alias_check_p (ddr
, loop
,
195 optimize_loop_nest_for_speed_p (loop
));
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
200 return opt_result::success ();
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
206 vect_check_nonzero_value (loop_vec_info loop_vinfo
, tree value
)
208 vec
<tree
> checks
= LOOP_VINFO_CHECK_NONZERO (loop_vinfo
);
209 for (unsigned int i
= 0; i
< checks
.length(); ++i
)
210 if (checks
[i
] == value
)
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE
, vect_location
,
215 "need run-time check that %T is nonzero\n",
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo
).safe_push (value
);
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
225 vect_preserves_scalar_order_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
)
227 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
228 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
235 /* STMT_A and STMT_B belong to overlapping groups. All loads are
236 emitted at the position of the first scalar load.
237 Stores in a group are emitted at the position of the last scalar store.
238 Compute that position and check whether the resulting order matches
240 stmt_vec_info il_a
= DR_GROUP_FIRST_ELEMENT (stmtinfo_a
);
243 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a
)))
244 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
245 s
= DR_GROUP_NEXT_ELEMENT (s
))
246 il_a
= get_later_stmt (il_a
, s
);
247 else /* DR_IS_READ */
248 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
249 s
= DR_GROUP_NEXT_ELEMENT (s
))
250 if (get_later_stmt (il_a
, s
) == il_a
)
255 stmt_vec_info il_b
= DR_GROUP_FIRST_ELEMENT (stmtinfo_b
);
258 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b
)))
259 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
260 s
= DR_GROUP_NEXT_ELEMENT (s
))
261 il_b
= get_later_stmt (il_b
, s
);
262 else /* DR_IS_READ */
263 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
264 s
= DR_GROUP_NEXT_ELEMENT (s
))
265 if (get_later_stmt (il_b
, s
) == il_b
)
270 bool a_after_b
= (get_later_stmt (stmtinfo_a
, stmtinfo_b
) == stmtinfo_a
);
271 return (get_later_stmt (il_a
, il_b
) == il_a
) == a_after_b
;
274 /* A subroutine of vect_analyze_data_ref_dependence. Handle
275 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
276 distances. These distances are conservatively correct but they don't
277 reflect a guaranteed dependence.
279 Return true if this function does all the work necessary to avoid
280 an alias or false if the caller should use the dependence distances
281 to limit the vectorization factor in the usual way. LOOP_DEPTH is
282 the depth of the loop described by LOOP_VINFO and the other arguments
283 are as for vect_analyze_data_ref_dependence. */
286 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
287 loop_vec_info loop_vinfo
,
288 int loop_depth
, unsigned int *max_vf
)
290 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
291 lambda_vector dist_v
;
293 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
295 int dist
= dist_v
[loop_depth
];
296 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
298 /* If the user asserted safelen >= DIST consecutive iterations
299 can be executed concurrently, assume independence.
301 ??? An alternative would be to add the alias check even
302 in this case, and vectorize the fallback loop with the
303 maximum VF set to safelen. However, if the user has
304 explicitly given a length, it's less likely that that
306 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
308 if ((unsigned int) loop
->safelen
< *max_vf
)
309 *max_vf
= loop
->safelen
;
310 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
314 /* For dependence distances of 2 or more, we have the option
315 of limiting VF or checking for an alias at runtime.
316 Prefer to check at runtime if we can, to avoid limiting
317 the VF unnecessarily when the bases are in fact independent.
319 Note that the alias checks will be removed if the VF ends up
320 being small enough. */
321 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
322 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
323 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a
->stmt
)
324 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b
->stmt
)
325 && vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
));
332 /* Function vect_analyze_data_ref_dependence.
334 FIXME: I needed to change the sense of the returned flag.
336 Return FALSE if there (might) exist a dependence between a memory-reference
337 DRA and a memory-reference DRB. When versioning for alias may check a
338 dependence at run-time, return TRUE. Adjust *MAX_VF according to
339 the data dependence. */
342 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
343 loop_vec_info loop_vinfo
,
344 unsigned int *max_vf
)
347 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
348 struct data_reference
*dra
= DDR_A (ddr
);
349 struct data_reference
*drb
= DDR_B (ddr
);
350 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (dra
);
351 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (drb
);
352 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
353 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
354 lambda_vector dist_v
;
355 unsigned int loop_depth
;
357 /* In loop analysis all data references should be vectorizable. */
358 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
359 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
362 /* Independent data accesses. */
363 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
364 return opt_result::success ();
367 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
368 return opt_result::success ();
370 /* We do not have to consider dependences between accesses that belong
371 to the same group, unless the stride could be smaller than the
373 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
374 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
375 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
376 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
377 return opt_result::success ();
379 /* Even if we have an anti-dependence then, as the vectorized loop covers at
380 least two scalar iterations, there is always also a true dependence.
381 As the vectorizer does not re-order loads and stores we can ignore
382 the anti-dependence if TBAA can disambiguate both DRs similar to the
383 case with known negative distance anti-dependences (positive
384 distance anti-dependences would violate TBAA constraints). */
385 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
386 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
387 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
388 get_alias_set (DR_REF (drb
))))
389 return opt_result::success ();
391 /* Unknown data dependence. */
392 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
394 /* If user asserted safelen consecutive iterations can be
395 executed concurrently, assume independence. */
396 if (loop
->safelen
>= 2)
398 if ((unsigned int) loop
->safelen
< *max_vf
)
399 *max_vf
= loop
->safelen
;
400 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
401 return opt_result::success ();
404 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
405 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
406 return opt_result::failure_at
408 "versioning for alias not supported for: "
409 "can't determine dependence between %T and %T\n",
410 DR_REF (dra
), DR_REF (drb
));
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
414 "versioning for alias required: "
415 "can't determine dependence between %T and %T\n",
416 DR_REF (dra
), DR_REF (drb
));
418 /* Add to list of ddrs that need to be tested at run-time. */
419 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
422 /* Known data dependence. */
423 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
425 /* If user asserted safelen consecutive iterations can be
426 executed concurrently, assume independence. */
427 if (loop
->safelen
>= 2)
429 if ((unsigned int) loop
->safelen
< *max_vf
)
430 *max_vf
= loop
->safelen
;
431 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
432 return opt_result::success ();
435 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
436 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
437 return opt_result::failure_at
439 "versioning for alias not supported for: "
440 "bad dist vector for %T and %T\n",
441 DR_REF (dra
), DR_REF (drb
));
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
445 "versioning for alias required: "
446 "bad dist vector for %T and %T\n",
447 DR_REF (dra
), DR_REF (drb
));
448 /* Add to list of ddrs that need to be tested at run-time. */
449 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
452 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
454 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
455 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
457 return opt_result::success ();
459 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
461 int dist
= dist_v
[loop_depth
];
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE
, vect_location
,
465 "dependence distance = %d.\n", dist
);
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE
, vect_location
,
471 "dependence distance == 0 between %T and %T\n",
472 DR_REF (dra
), DR_REF (drb
));
474 /* When we perform grouped accesses and perform implicit CSE
475 by detecting equal accesses and doing disambiguation with
476 runtime alias tests like for
484 where we will end up loading { a[i], a[i+1] } once, make
485 sure that inserting group loads before the first load and
486 stores after the last store will do the right thing.
487 Similar for groups like
491 where loads from the group interleave with the store. */
492 if (!vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
493 return opt_result::failure_at (stmtinfo_a
->stmt
,
494 "READ_WRITE dependence"
495 " in interleaving.\n");
497 if (loop
->safelen
< 2)
499 tree indicator
= dr_zero_step_indicator (dra
);
500 if (!indicator
|| integer_zerop (indicator
))
501 return opt_result::failure_at (stmtinfo_a
->stmt
,
502 "access also has a zero step\n");
503 else if (TREE_CODE (indicator
) != INTEGER_CST
)
504 vect_check_nonzero_value (loop_vinfo
, indicator
);
509 if (dist
> 0 && DDR_REVERSED_P (ddr
))
511 /* If DDR_REVERSED_P the order of the data-refs in DDR was
512 reversed (to make distance vector positive), and the actual
513 distance is negative. */
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE
, vect_location
,
516 "dependence distance negative.\n");
517 /* When doing outer loop vectorization, we need to check if there is
518 a backward dependence at the inner loop level if the dependence
519 at the outer loop is reversed. See PR81740. */
520 if (nested_in_vect_loop_p (loop
, stmtinfo_a
)
521 || nested_in_vect_loop_p (loop
, stmtinfo_b
))
523 unsigned inner_depth
= index_in_loop_nest (loop
->inner
->num
,
524 DDR_LOOP_NEST (ddr
));
525 if (dist_v
[inner_depth
] < 0)
526 return opt_result::failure_at (stmtinfo_a
->stmt
,
527 "not vectorized, dependence "
528 "between data-refs %T and %T\n",
529 DR_REF (dra
), DR_REF (drb
));
531 /* Record a negative dependence distance to later limit the
532 amount of stmt copying / unrolling we can perform.
533 Only need to handle read-after-write dependence. */
535 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
536 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
537 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
541 unsigned int abs_dist
= abs (dist
);
542 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
544 /* The dependence distance requires reduction of the maximal
545 vectorization factor. */
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE
, vect_location
,
549 "adjusting maximal vectorization factor to %i\n",
553 if (abs_dist
>= *max_vf
)
555 /* Dependence distance does not create dependence, as far as
556 vectorization is concerned, in this case. */
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE
, vect_location
,
559 "dependence distance >= VF.\n");
563 return opt_result::failure_at (stmtinfo_a
->stmt
,
564 "not vectorized, possible dependence "
565 "between data-refs %T and %T\n",
566 DR_REF (dra
), DR_REF (drb
));
569 return opt_result::success ();
572 /* Function vect_analyze_data_ref_dependences.
574 Examine all the data references in the loop, and make sure there do not
575 exist any data dependences between them. Set *MAX_VF according to
576 the maximum vectorization factor the data dependences allow. */
579 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
580 unsigned int *max_vf
)
583 struct data_dependence_relation
*ddr
;
585 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
589 LOOP_VINFO_DDRS (loop_vinfo
)
590 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
591 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
592 /* We do not need read-read dependences. */
593 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
594 &LOOP_VINFO_DDRS (loop_vinfo
),
595 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
600 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
602 /* For epilogues we either have no aliases or alias versioning
603 was applied to original loop. Therefore we may just get max_vf
604 using VF of original loop. */
605 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
606 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
608 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
611 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
616 return opt_result::success ();
620 /* Function vect_slp_analyze_data_ref_dependence.
622 Return TRUE if there (might) exist a dependence between a memory-reference
623 DRA and a memory-reference DRB for VINFO. When versioning for alias
624 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
625 according to the data dependence. */
628 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
629 struct data_dependence_relation
*ddr
)
631 struct data_reference
*dra
= DDR_A (ddr
);
632 struct data_reference
*drb
= DDR_B (ddr
);
633 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
634 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
636 /* We need to check dependences of statements marked as unvectorizable
637 as well, they still can prohibit vectorization. */
639 /* Independent data accesses. */
640 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
646 /* Read-read is OK. */
647 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
650 /* If dra and drb are part of the same interleaving chain consider
652 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
653 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
654 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
657 /* Unknown data dependence. */
658 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
662 "can't determine dependence between %T and %T\n",
663 DR_REF (dra
), DR_REF (drb
));
665 else if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE
, vect_location
,
667 "determined dependence between %T and %T\n",
668 DR_REF (dra
), DR_REF (drb
));
674 /* Analyze dependences involved in the transform of SLP NODE. STORES
675 contain the vector of scalar stores of this instance if we are
676 disambiguating the loads. */
679 vect_slp_analyze_node_dependences (vec_info
*vinfo
, slp_tree node
,
680 vec
<stmt_vec_info
> stores
,
681 stmt_vec_info last_store_info
)
683 /* This walks over all stmts involved in the SLP load/store done
684 in NODE verifying we can sink them up to the last stmt in the
686 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
688 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
689 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
691 stmt_vec_info access_info
= SLP_TREE_SCALAR_STMTS (node
)[k
];
692 if (access_info
== last_access_info
)
694 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
696 bool ref_initialized_p
= false;
697 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
698 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
700 gimple
*stmt
= gsi_stmt (gsi
);
701 if (! gimple_vuse (stmt
))
704 /* If we couldn't record a (single) data reference for this
705 stmt we have to resort to the alias oracle. */
706 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
707 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
710 /* We are moving a store - this means
711 we cannot use TBAA for disambiguation. */
712 if (!ref_initialized_p
)
713 ao_ref_init (&ref
, DR_REF (dr_a
));
714 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
715 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
720 bool dependent
= false;
721 /* If we run into a store of this same instance (we've just
722 marked those) then delay dependence checking until we run
723 into the last store because this is where it will have
724 been sunk to (and we verify if we can do that as well). */
725 if (gimple_visited_p (stmt
))
727 if (stmt_info
!= last_store_info
)
730 stmt_vec_info store_info
;
731 FOR_EACH_VEC_ELT (stores
, i
, store_info
)
733 data_reference
*store_dr
734 = STMT_VINFO_DATA_REF (store_info
);
735 ddr_p ddr
= initialize_data_dependence_relation
736 (dr_a
, store_dr
, vNULL
);
738 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
739 free_dependence_relation (ddr
);
746 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
748 dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
749 free_dependence_relation (ddr
);
756 else /* DR_IS_READ */
758 stmt_vec_info first_access_info
759 = vect_find_first_scalar_stmt_in_slp (node
);
760 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
762 stmt_vec_info access_info
= SLP_TREE_SCALAR_STMTS (node
)[k
];
763 if (access_info
== first_access_info
)
765 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
767 bool ref_initialized_p
= false;
768 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
769 gsi_stmt (gsi
) != first_access_info
->stmt
; gsi_prev (&gsi
))
771 gimple
*stmt
= gsi_stmt (gsi
);
772 if (! gimple_vdef (stmt
))
775 /* If we couldn't record a (single) data reference for this
776 stmt we have to resort to the alias oracle. */
777 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
778 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
781 /* We are hoisting a load - this means we can use
782 TBAA for disambiguation. */
783 if (!ref_initialized_p
)
784 ao_ref_init (&ref
, DR_REF (dr_a
));
785 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, true))
790 bool dependent
= false;
791 /* If we run into a store of this same instance (we've just
792 marked those) then delay dependence checking until we run
793 into the last store because this is where it will have
794 been sunk to (and we verify if we can do that as well). */
795 if (gimple_visited_p (stmt
))
797 if (stmt_info
!= last_store_info
)
800 stmt_vec_info store_info
;
801 FOR_EACH_VEC_ELT (stores
, i
, store_info
)
803 data_reference
*store_dr
804 = STMT_VINFO_DATA_REF (store_info
);
805 ddr_p ddr
= initialize_data_dependence_relation
806 (dr_a
, store_dr
, vNULL
);
808 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
809 free_dependence_relation (ddr
);
816 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
818 dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
819 free_dependence_relation (ddr
);
830 /* Function vect_analyze_data_ref_dependences.
832 Examine all the data references in the basic-block, and make sure there
833 do not exist any data dependences between them. Set *MAX_VF according to
834 the maximum vectorization factor the data dependences allow. */
837 vect_slp_analyze_instance_dependence (vec_info
*vinfo
, slp_instance instance
)
839 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
841 /* The stores of this instance are at the root of the SLP tree. */
842 slp_tree store
= SLP_INSTANCE_TREE (instance
);
843 if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store
)))
846 /* Verify we can sink stores to the vectorized stmt insert location. */
847 stmt_vec_info last_store_info
= NULL
;
850 if (! vect_slp_analyze_node_dependences (vinfo
, store
, vNULL
, NULL
))
853 /* Mark stores in this instance and remember the last one. */
854 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
855 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
856 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
861 /* Verify we can sink loads to the vectorized stmt insert location,
862 special-casing stores of this instance. */
865 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
866 if (! vect_slp_analyze_node_dependences (vinfo
, load
,
868 ? SLP_TREE_SCALAR_STMTS (store
)
869 : vNULL
, last_store_info
))
875 /* Unset the visited flag. */
877 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
878 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
883 /* Record the base alignment guarantee given by DRB, which occurs
887 vect_record_base_alignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
888 innermost_loop_behavior
*drb
)
891 innermost_loop_behavior
*&entry
892 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
893 if (!existed
|| entry
->base_alignment
< drb
->base_alignment
)
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE
, vect_location
,
898 "recording new base alignment for %T\n"
900 " misalignment: %d\n"
904 drb
->base_misalignment
,
909 /* If the region we're going to vectorize is reached, all unconditional
910 data references occur at least once. We can therefore pool the base
911 alignment guarantees from each unconditional reference. Do this by
912 going through all the data references in VINFO and checking whether
913 the containing statement makes the reference unconditionally. If so,
914 record the alignment of the base address in VINFO so that it can be
915 used for all other references with the same base. */
918 vect_record_base_alignments (vec_info
*vinfo
)
920 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
921 class loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
924 FOR_EACH_VEC_ELT (vinfo
->shared
->datarefs
, i
, dr
)
926 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
927 stmt_vec_info stmt_info
= dr_info
->stmt
;
928 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
929 && STMT_VINFO_VECTORIZABLE (stmt_info
)
930 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
932 vect_record_base_alignment (vinfo
, stmt_info
, &DR_INNERMOST (dr
));
934 /* If DR is nested in the loop that is being vectorized, we can also
935 record the alignment of the base wrt the outer loop. */
936 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
937 vect_record_base_alignment
938 (vinfo
, stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
943 /* Return the target alignment for the vectorized form of DR_INFO. */
946 vect_calculate_target_alignment (dr_vec_info
*dr_info
)
948 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
949 return targetm
.vectorize
.preferred_vector_alignment (vectype
);
952 /* Function vect_compute_data_ref_alignment
954 Compute the misalignment of the data reference DR_INFO.
957 1. DR_MISALIGNMENT (DR_INFO) is defined.
959 FOR NOW: No analysis is actually performed. Misalignment is calculated
960 only for trivial cases. TODO. */
963 vect_compute_data_ref_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
)
965 stmt_vec_info stmt_info
= dr_info
->stmt
;
966 vec_base_alignments
*base_alignments
= &vinfo
->base_alignments
;
967 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
968 class loop
*loop
= NULL
;
969 tree ref
= DR_REF (dr_info
->dr
);
970 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
972 if (dump_enabled_p ())
973 dump_printf_loc (MSG_NOTE
, vect_location
,
974 "vect_compute_data_ref_alignment:\n");
977 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
979 /* Initialize misalignment to unknown. */
980 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
982 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
985 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
986 bool step_preserves_misalignment_p
;
988 poly_uint64 vector_alignment
989 = exact_div (vect_calculate_target_alignment (dr_info
), BITS_PER_UNIT
);
990 DR_TARGET_ALIGNMENT (dr_info
) = vector_alignment
;
992 /* If the main loop has peeled for alignment we have no way of knowing
993 whether the data accesses in the epilogues are aligned. We can't at
994 compile time answer the question whether we have entered the main loop or
995 not. Fixes PR 92351. */
998 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1000 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
) != 0)
1004 unsigned HOST_WIDE_INT vect_align_c
;
1005 if (!vector_alignment
.is_constant (&vect_align_c
))
1008 /* No step for BB vectorization. */
1011 gcc_assert (integer_zerop (drb
->step
));
1012 step_preserves_misalignment_p
= true;
1015 /* In case the dataref is in an inner-loop of the loop that is being
1016 vectorized (LOOP), we use the base and misalignment information
1017 relative to the outer-loop (LOOP). This is ok only if the misalignment
1018 stays the same throughout the execution of the inner-loop, which is why
1019 we have to check that the stride of the dataref in the inner-loop evenly
1020 divides by the vector alignment. */
1021 else if (nested_in_vect_loop_p (loop
, stmt_info
))
1023 step_preserves_misalignment_p
1024 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
1026 if (dump_enabled_p ())
1028 if (step_preserves_misalignment_p
)
1029 dump_printf_loc (MSG_NOTE
, vect_location
,
1030 "inner step divides the vector alignment.\n");
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1033 "inner step doesn't divide the vector"
1038 /* Similarly we can only use base and misalignment information relative to
1039 an innermost loop if the misalignment stays the same throughout the
1040 execution of the loop. As above, this is the case if the stride of
1041 the dataref evenly divides by the alignment. */
1044 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1045 step_preserves_misalignment_p
1046 = multiple_p (DR_STEP_ALIGNMENT (dr_info
->dr
) * vf
, vect_align_c
);
1048 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
1049 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1050 "step doesn't divide the vector alignment.\n");
1053 unsigned int base_alignment
= drb
->base_alignment
;
1054 unsigned int base_misalignment
= drb
->base_misalignment
;
1056 /* Calculate the maximum of the pooled base address alignment and the
1057 alignment that we can compute for DR itself. */
1058 innermost_loop_behavior
**entry
= base_alignments
->get (drb
->base_address
);
1059 if (entry
&& base_alignment
< (*entry
)->base_alignment
)
1061 base_alignment
= (*entry
)->base_alignment
;
1062 base_misalignment
= (*entry
)->base_misalignment
;
1065 if (drb
->offset_alignment
< vect_align_c
1066 || !step_preserves_misalignment_p
1067 /* We need to know whether the step wrt the vectorized loop is
1068 negative when computing the starting misalignment below. */
1069 || TREE_CODE (drb
->step
) != INTEGER_CST
)
1071 if (dump_enabled_p ())
1072 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1073 "Unknown alignment for access: %T\n", ref
);
1077 if (base_alignment
< vect_align_c
)
1079 unsigned int max_alignment
;
1080 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
1081 if (max_alignment
< vect_align_c
1082 || !vect_can_force_dr_alignment_p (base
,
1083 vect_align_c
* BITS_PER_UNIT
))
1085 if (dump_enabled_p ())
1086 dump_printf_loc (MSG_NOTE
, vect_location
,
1087 "can't force alignment of ref: %T\n", ref
);
1091 /* Force the alignment of the decl.
1092 NOTE: This is the only change to the code we make during
1093 the analysis phase, before deciding to vectorize the loop. */
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE
, vect_location
,
1096 "force alignment of %T\n", ref
);
1098 dr_info
->base_decl
= base
;
1099 dr_info
->base_misaligned
= true;
1100 base_misalignment
= 0;
1102 poly_int64 misalignment
1103 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
1105 /* If this is a backward running DR then first access in the larger
1106 vectype actually is N-1 elements before the address in the DR.
1107 Adjust misalign accordingly. */
1108 if (tree_int_cst_sgn (drb
->step
) < 0)
1109 /* PLUS because STEP is negative. */
1110 misalignment
+= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1111 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1113 unsigned int const_misalignment
;
1114 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1118 "Non-constant misalignment for access: %T\n", ref
);
1122 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
1124 if (dump_enabled_p ())
1125 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1126 "misalign = %d bytes of ref %T\n",
1127 DR_MISALIGNMENT (dr_info
), ref
);
1132 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1133 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1134 is made aligned via peeling. */
1137 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info
*dr_info
,
1138 dr_vec_info
*dr_peel_info
)
1140 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info
),
1141 DR_TARGET_ALIGNMENT (dr_info
)))
1143 poly_offset_int diff
1144 = (wi::to_poly_offset (DR_INIT (dr_peel_info
->dr
))
1145 - wi::to_poly_offset (DR_INIT (dr_info
->dr
)));
1146 if (known_eq (diff
, 0)
1147 || multiple_p (diff
, DR_TARGET_ALIGNMENT (dr_info
)))
1153 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1154 aligned via peeling. */
1157 vect_dr_aligned_if_peeled_dr_is (dr_vec_info
*dr_info
,
1158 dr_vec_info
*dr_peel_info
)
1160 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info
->dr
),
1161 DR_BASE_ADDRESS (dr_peel_info
->dr
), 0)
1162 || !operand_equal_p (DR_OFFSET (dr_info
->dr
),
1163 DR_OFFSET (dr_peel_info
->dr
), 0)
1164 || !operand_equal_p (DR_STEP (dr_info
->dr
),
1165 DR_STEP (dr_peel_info
->dr
), 0))
1168 return vect_dr_aligned_if_related_peeled_dr_is (dr_info
, dr_peel_info
);
1171 /* Function vect_update_misalignment_for_peel.
1172 Sets DR_INFO's misalignment
1173 - to 0 if it has the same alignment as DR_PEEL_INFO,
1174 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1175 - to -1 (unknown) otherwise.
1177 DR_INFO - the data reference whose misalignment is to be adjusted.
1178 DR_PEEL_INFO - the data reference whose misalignment is being made
1179 zero in the vector loop by the peel.
1180 NPEEL - the number of iterations in the peel loop if the misalignment
1181 of DR_PEEL_INFO is known at compile time. */
1184 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1185 dr_vec_info
*dr_peel_info
, int npeel
)
1187 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1188 it is aligned in the vector loop. */
1189 if (vect_dr_aligned_if_peeled_dr_is (dr_info
, dr_peel_info
))
1191 gcc_assert (!known_alignment_for_access_p (dr_info
)
1192 || !known_alignment_for_access_p (dr_peel_info
)
1193 || (DR_MISALIGNMENT (dr_info
)
1194 == DR_MISALIGNMENT (dr_peel_info
)));
1195 SET_DR_MISALIGNMENT (dr_info
, 0);
1199 unsigned HOST_WIDE_INT alignment
;
1200 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1201 && known_alignment_for_access_p (dr_info
)
1202 && known_alignment_for_access_p (dr_peel_info
))
1204 int misal
= DR_MISALIGNMENT (dr_info
);
1205 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1206 misal
&= alignment
- 1;
1207 SET_DR_MISALIGNMENT (dr_info
, misal
);
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1213 "to unknown (-1).\n");
1214 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1217 /* Return true if alignment is relevant for DR_INFO. */
1220 vect_relevant_for_alignment_p (dr_vec_info
*dr_info
)
1222 stmt_vec_info stmt_info
= dr_info
->stmt
;
1224 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1227 /* For interleaving, only the alignment of the first access matters. */
1228 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1229 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1232 /* Scatter-gather and invariant accesses continue to address individual
1233 scalars, so vector-level alignment is irrelevant. */
1234 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1235 || integer_zerop (DR_STEP (dr_info
->dr
)))
1238 /* Strided accesses perform only component accesses, alignment is
1239 irrelevant for them. */
1240 if (STMT_VINFO_STRIDED_P (stmt_info
)
1241 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1247 /* Given an memory reference EXP return whether its alignment is less
1251 not_size_aligned (tree exp
)
1253 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1256 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1257 > get_object_alignment (exp
));
1260 /* Function vector_alignment_reachable_p
1262 Return true if vector alignment for DR_INFO is reachable by peeling
1263 a few loop iterations. Return false otherwise. */
1266 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
1268 stmt_vec_info stmt_info
= dr_info
->stmt
;
1269 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1271 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1273 /* For interleaved access we peel only if number of iterations in
1274 the prolog loop ({VF - misalignment}), is a multiple of the
1275 number of the interleaved accesses. */
1276 int elem_size
, mis_in_elements
;
1278 /* FORNOW: handle only known alignment. */
1279 if (!known_alignment_for_access_p (dr_info
))
1282 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1283 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1284 elem_size
= vector_element_size (vector_size
, nelements
);
1285 mis_in_elements
= DR_MISALIGNMENT (dr_info
) / elem_size
;
1287 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1291 /* If misalignment is known at the compile time then allow peeling
1292 only if natural alignment is reachable through peeling. */
1293 if (known_alignment_for_access_p (dr_info
) && !aligned_access_p (dr_info
))
1295 HOST_WIDE_INT elmsize
=
1296 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1297 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_NOTE
, vect_location
,
1300 "data size = %wd. misalignment = %d.\n", elmsize
,
1301 DR_MISALIGNMENT (dr_info
));
1303 if (DR_MISALIGNMENT (dr_info
) % elmsize
)
1305 if (dump_enabled_p ())
1306 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1307 "data size does not divide the misalignment.\n");
1312 if (!known_alignment_for_access_p (dr_info
))
1314 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1315 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1316 if (dump_enabled_p ())
1317 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1318 "Unknown misalignment, %snaturally aligned\n",
1319 is_packed
? "not " : "");
1320 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1327 /* Calculate the cost of the memory access represented by DR_INFO. */
1330 vect_get_data_access_cost (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1331 unsigned int *inside_cost
,
1332 unsigned int *outside_cost
,
1333 stmt_vector_for_cost
*body_cost_vec
,
1334 stmt_vector_for_cost
*prologue_cost_vec
)
1336 stmt_vec_info stmt_info
= dr_info
->stmt
;
1337 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1340 if (PURE_SLP_STMT (stmt_info
))
1343 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1345 if (DR_IS_READ (dr_info
->dr
))
1346 vect_get_load_cost (vinfo
, stmt_info
, ncopies
, true, inside_cost
,
1347 outside_cost
, prologue_cost_vec
, body_cost_vec
, false);
1349 vect_get_store_cost (vinfo
,stmt_info
, ncopies
, inside_cost
, body_cost_vec
);
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE
, vect_location
,
1353 "vect_get_data_access_cost: inside_cost = %d, "
1354 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1358 typedef struct _vect_peel_info
1360 dr_vec_info
*dr_info
;
1365 typedef struct _vect_peel_extended_info
1368 struct _vect_peel_info peel_info
;
1369 unsigned int inside_cost
;
1370 unsigned int outside_cost
;
1371 } *vect_peel_extended_info
;
1374 /* Peeling hashtable helpers. */
1376 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1378 static inline hashval_t
hash (const _vect_peel_info
*);
1379 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1383 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1385 return (hashval_t
) peel_info
->npeel
;
1389 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1391 return (a
->npeel
== b
->npeel
);
1395 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1398 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1399 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1402 struct _vect_peel_info elem
, *slot
;
1403 _vect_peel_info
**new_slot
;
1404 bool supportable_dr_alignment
1405 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, true);
1408 slot
= peeling_htab
->find (&elem
);
1413 slot
= XNEW (struct _vect_peel_info
);
1414 slot
->npeel
= npeel
;
1415 slot
->dr_info
= dr_info
;
1417 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1421 if (!supportable_dr_alignment
1422 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1423 slot
->count
+= VECT_MAX_COST
;
1427 /* Traverse peeling hash table to find peeling option that aligns maximum
1428 number of data accesses. */
1431 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1432 _vect_peel_extended_info
*max
)
1434 vect_peel_info elem
= *slot
;
1436 if (elem
->count
> max
->peel_info
.count
1437 || (elem
->count
== max
->peel_info
.count
1438 && max
->peel_info
.npeel
> elem
->npeel
))
1440 max
->peel_info
.npeel
= elem
->npeel
;
1441 max
->peel_info
.count
= elem
->count
;
1442 max
->peel_info
.dr_info
= elem
->dr_info
;
1448 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1449 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1450 we assume DR0_INFO's misalignment will be zero after peeling. */
1453 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1454 dr_vec_info
*dr0_info
,
1455 unsigned int *inside_cost
,
1456 unsigned int *outside_cost
,
1457 stmt_vector_for_cost
*body_cost_vec
,
1458 stmt_vector_for_cost
*prologue_cost_vec
,
1460 bool unknown_misalignment
)
1462 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1466 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1468 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1469 if (!vect_relevant_for_alignment_p (dr_info
))
1472 int save_misalignment
;
1473 save_misalignment
= DR_MISALIGNMENT (dr_info
);
1476 else if (unknown_misalignment
&& dr_info
== dr0_info
)
1477 SET_DR_MISALIGNMENT (dr_info
, 0);
1479 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
1480 vect_get_data_access_cost (loop_vinfo
, dr_info
, inside_cost
, outside_cost
,
1481 body_cost_vec
, prologue_cost_vec
);
1482 SET_DR_MISALIGNMENT (dr_info
, save_misalignment
);
1486 /* Traverse peeling hash table and calculate cost for each peeling option.
1487 Find the one with the lowest cost. */
1490 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1491 _vect_peel_extended_info
*min
)
1493 vect_peel_info elem
= *slot
;
1495 unsigned int inside_cost
= 0, outside_cost
= 0;
1496 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (min
->vinfo
);
1497 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1500 prologue_cost_vec
.create (2);
1501 body_cost_vec
.create (2);
1502 epilogue_cost_vec
.create (2);
1504 vect_get_peeling_costs_all_drs (loop_vinfo
, elem
->dr_info
, &inside_cost
,
1505 &outside_cost
, &body_cost_vec
,
1506 &prologue_cost_vec
, elem
->npeel
, false);
1508 body_cost_vec
.release ();
1510 outside_cost
+= vect_get_known_peeling_cost
1511 (loop_vinfo
, elem
->npeel
, &dummy
,
1512 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1513 &prologue_cost_vec
, &epilogue_cost_vec
);
1515 /* Prologue and epilogue costs are added to the target model later.
1516 These costs depend only on the scalar iteration cost, the
1517 number of peeling iterations finally chosen, and the number of
1518 misaligned statements. So discard the information found here. */
1519 prologue_cost_vec
.release ();
1520 epilogue_cost_vec
.release ();
1522 if (inside_cost
< min
->inside_cost
1523 || (inside_cost
== min
->inside_cost
1524 && outside_cost
< min
->outside_cost
))
1526 min
->inside_cost
= inside_cost
;
1527 min
->outside_cost
= outside_cost
;
1528 min
->peel_info
.dr_info
= elem
->dr_info
;
1529 min
->peel_info
.npeel
= elem
->npeel
;
1530 min
->peel_info
.count
= elem
->count
;
1537 /* Choose best peeling option by traversing peeling hash table and either
1538 choosing an option with the lowest cost (if cost model is enabled) or the
1539 option that aligns as many accesses as possible. */
1541 static struct _vect_peel_extended_info
1542 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1543 loop_vec_info loop_vinfo
)
1545 struct _vect_peel_extended_info res
;
1547 res
.peel_info
.dr_info
= NULL
;
1548 res
.vinfo
= loop_vinfo
;
1550 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1552 res
.inside_cost
= INT_MAX
;
1553 res
.outside_cost
= INT_MAX
;
1554 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1555 vect_peeling_hash_get_lowest_cost
> (&res
);
1559 res
.peel_info
.count
= 0;
1560 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1561 vect_peeling_hash_get_most_frequent
> (&res
);
1562 res
.inside_cost
= 0;
1563 res
.outside_cost
= 0;
1569 /* Return true if the new peeling NPEEL is supported. */
1572 vect_peeling_supportable (loop_vec_info loop_vinfo
, dr_vec_info
*dr0_info
,
1576 struct data_reference
*dr
= NULL
;
1577 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1578 enum dr_alignment_support supportable_dr_alignment
;
1580 /* Ensure that all data refs can be vectorized after the peel. */
1581 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1583 int save_misalignment
;
1585 if (dr
== dr0_info
->dr
)
1588 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1589 if (!vect_relevant_for_alignment_p (dr_info
))
1592 save_misalignment
= DR_MISALIGNMENT (dr_info
);
1593 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
1594 supportable_dr_alignment
1595 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, false);
1596 SET_DR_MISALIGNMENT (dr_info
, save_misalignment
);
1598 if (!supportable_dr_alignment
)
1605 /* Compare two data-references DRA and DRB to group them into chunks
1606 with related alignment. */
1609 dr_align_group_sort_cmp (const void *dra_
, const void *drb_
)
1611 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
1612 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
1615 /* Stabilize sort. */
1619 /* Ordering of DRs according to base. */
1620 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
1621 DR_BASE_ADDRESS (drb
));
1625 /* And according to DR_OFFSET. */
1626 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
1630 /* And after step. */
1631 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
1635 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1636 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
1638 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
1642 /* Function vect_enhance_data_refs_alignment
1644 This pass will use loop versioning and loop peeling in order to enhance
1645 the alignment of data references in the loop.
1647 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1648 original loop is to be vectorized. Any other loops that are created by
1649 the transformations performed in this pass - are not supposed to be
1650 vectorized. This restriction will be relaxed.
1652 This pass will require a cost model to guide it whether to apply peeling
1653 or versioning or a combination of the two. For example, the scheme that
1654 intel uses when given a loop with several memory accesses, is as follows:
1655 choose one memory access ('p') which alignment you want to force by doing
1656 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1657 other accesses are not necessarily aligned, or (2) use loop versioning to
1658 generate one loop in which all accesses are aligned, and another loop in
1659 which only 'p' is necessarily aligned.
1661 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1662 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1663 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1665 Devising a cost model is the most critical aspect of this work. It will
1666 guide us on which access to peel for, whether to use loop versioning, how
1667 many versions to create, etc. The cost model will probably consist of
1668 generic considerations as well as target specific considerations (on
1669 powerpc for example, misaligned stores are more painful than misaligned
1672 Here are the general steps involved in alignment enhancements:
1674 -- original loop, before alignment analysis:
1675 for (i=0; i<N; i++){
1676 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1677 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1680 -- After vect_compute_data_refs_alignment:
1681 for (i=0; i<N; i++){
1682 x = q[i]; # DR_MISALIGNMENT(q) = 3
1683 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1686 -- Possibility 1: we do loop versioning:
1688 for (i=0; i<N; i++){ # loop 1A
1689 x = q[i]; # DR_MISALIGNMENT(q) = 3
1690 p[i] = y; # DR_MISALIGNMENT(p) = 0
1694 for (i=0; i<N; i++){ # loop 1B
1695 x = q[i]; # DR_MISALIGNMENT(q) = 3
1696 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1700 -- Possibility 2: we do loop peeling:
1701 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1705 for (i = 3; i < N; i++){ # loop 2A
1706 x = q[i]; # DR_MISALIGNMENT(q) = 0
1707 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1710 -- Possibility 3: combination of loop peeling and versioning:
1711 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1716 for (i = 3; i<N; i++){ # loop 3A
1717 x = q[i]; # DR_MISALIGNMENT(q) = 0
1718 p[i] = y; # DR_MISALIGNMENT(p) = 0
1722 for (i = 3; i<N; i++){ # loop 3B
1723 x = q[i]; # DR_MISALIGNMENT(q) = 0
1724 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1728 These loops are later passed to loop_transform to be vectorized. The
1729 vectorizer will use the alignment information to guide the transformation
1730 (whether to generate regular loads/stores, or with special handling for
1734 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1736 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1737 enum dr_alignment_support supportable_dr_alignment
;
1738 dr_vec_info
*first_store
= NULL
;
1739 dr_vec_info
*dr0_info
= NULL
;
1740 struct data_reference
*dr
;
1742 bool do_peeling
= false;
1743 bool do_versioning
= false;
1744 unsigned int npeel
= 0;
1745 bool one_misalignment_known
= false;
1746 bool one_misalignment_unknown
= false;
1747 bool one_dr_unsupportable
= false;
1748 dr_vec_info
*unsupportable_dr_info
= NULL
;
1749 unsigned int mis
, dr0_same_align_drs
= 0, first_store_same_align_drs
= 0;
1750 hash_table
<peel_info_hasher
> peeling_htab (1);
1752 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1754 /* Reset data so we can safely be called multiple times. */
1755 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1756 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1758 if (LOOP_VINFO_DATAREFS (loop_vinfo
).is_empty ())
1759 return opt_result::success ();
1761 /* Sort the vector of datarefs so DRs that have the same or dependent
1762 alignment are next to each other. */
1763 auto_vec
<data_reference_p
> datarefs
1764 = LOOP_VINFO_DATAREFS (loop_vinfo
).copy ();
1765 datarefs
.qsort (dr_align_group_sort_cmp
);
1767 /* Compute the number of DRs that become aligned when we peel
1768 a dataref so it becomes aligned. */
1769 auto_vec
<unsigned> n_same_align_refs (datarefs
.length ());
1770 n_same_align_refs
.quick_grow_cleared (datarefs
.length ());
1772 for (i0
= 0; i0
< datarefs
.length (); ++i0
)
1773 if (DR_BASE_ADDRESS (datarefs
[i0
]))
1775 for (i
= i0
+ 1; i
<= datarefs
.length (); ++i
)
1777 if (i
== datarefs
.length ()
1778 || !operand_equal_p (DR_BASE_ADDRESS (datarefs
[i0
]),
1779 DR_BASE_ADDRESS (datarefs
[i
]), 0)
1780 || !operand_equal_p (DR_OFFSET (datarefs
[i0
]),
1781 DR_OFFSET (datarefs
[i
]), 0)
1782 || !operand_equal_p (DR_STEP (datarefs
[i0
]),
1783 DR_STEP (datarefs
[i
]), 0))
1785 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1786 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1787 will get known misalignment if we align one of the refs
1788 with the largest DR_TARGET_ALIGNMENT. */
1789 for (unsigned j
= i0
; j
< i
; ++j
)
1791 dr_vec_info
*dr_infoj
= loop_vinfo
->lookup_dr (datarefs
[j
]);
1792 for (unsigned k
= i0
; k
< i
; ++k
)
1796 dr_vec_info
*dr_infok
= loop_vinfo
->lookup_dr (datarefs
[k
]);
1797 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok
,
1799 n_same_align_refs
[j
]++;
1806 /* While cost model enhancements are expected in the future, the high level
1807 view of the code at this time is as follows:
1809 A) If there is a misaligned access then see if peeling to align
1810 this access can make all data references satisfy
1811 vect_supportable_dr_alignment. If so, update data structures
1812 as needed and return true.
1814 B) If peeling wasn't possible and there is a data reference with an
1815 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1816 then see if loop versioning checks can be used to make all data
1817 references satisfy vect_supportable_dr_alignment. If so, update
1818 data structures as needed and return true.
1820 C) If neither peeling nor versioning were successful then return false if
1821 any data reference does not satisfy vect_supportable_dr_alignment.
1823 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1825 Note, Possibility 3 above (which is peeling and versioning together) is not
1826 being done at this time. */
1828 /* (1) Peeling to force alignment. */
1830 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1832 + How many accesses will become aligned due to the peeling
1833 - How many accesses will become unaligned due to the peeling,
1834 and the cost of misaligned accesses.
1835 - The cost of peeling (the extra runtime checks, the increase
1838 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1840 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1841 if (!vect_relevant_for_alignment_p (dr_info
))
1844 stmt_vec_info stmt_info
= dr_info
->stmt
;
1845 supportable_dr_alignment
1846 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, true);
1847 do_peeling
= vector_alignment_reachable_p (dr_info
);
1850 if (known_alignment_for_access_p (dr_info
))
1852 unsigned int npeel_tmp
= 0;
1853 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1854 size_zero_node
) < 0;
1856 /* If known_alignment_for_access_p then we have set
1857 DR_MISALIGNMENT which is only done if we know it at compiler
1858 time, so it is safe to assume target alignment is constant.
1860 unsigned int target_align
=
1861 DR_TARGET_ALIGNMENT (dr_info
).to_constant ();
1862 unsigned int dr_size
= vect_get_scalar_dr_size (dr_info
);
1864 ? DR_MISALIGNMENT (dr_info
)
1865 : -DR_MISALIGNMENT (dr_info
));
1866 if (DR_MISALIGNMENT (dr_info
) != 0)
1867 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1869 /* For multiple types, it is possible that the bigger type access
1870 will have more than one peeling option. E.g., a loop with two
1871 types: one of size (vector size / 4), and the other one of
1872 size (vector size / 8). Vectorization factor will 8. If both
1873 accesses are misaligned by 3, the first one needs one scalar
1874 iteration to be aligned, and the second one needs 5. But the
1875 first one will be aligned also by peeling 5 scalar
1876 iterations, and in that case both accesses will be aligned.
1877 Hence, except for the immediate peeling amount, we also want
1878 to try to add full vector size, while we don't exceed
1879 vectorization factor.
1880 We do this automatically for cost model, since we calculate
1881 cost for every peeling option. */
1882 poly_uint64 nscalars
= npeel_tmp
;
1883 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1885 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1886 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1887 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1890 /* Save info about DR in the hash table. Also include peeling
1891 amounts according to the explanation above. */
1892 while (known_le (npeel_tmp
, nscalars
))
1894 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1895 dr_info
, npeel_tmp
);
1896 npeel_tmp
+= MAX (1, target_align
/ dr_size
);
1899 one_misalignment_known
= true;
1903 /* If we don't know any misalignment values, we prefer
1904 peeling for data-ref that has the maximum number of data-refs
1905 with the same alignment, unless the target prefers to align
1906 stores over load. */
1907 unsigned same_align_drs
= n_same_align_refs
[i
];
1909 || dr0_same_align_drs
< same_align_drs
)
1911 dr0_same_align_drs
= same_align_drs
;
1914 /* For data-refs with the same number of related
1915 accesses prefer the one where the misalign
1916 computation will be invariant in the outermost loop. */
1917 else if (dr0_same_align_drs
== same_align_drs
)
1919 class loop
*ivloop0
, *ivloop
;
1920 ivloop0
= outermost_invariant_loop_for_expr
1921 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
1922 ivloop
= outermost_invariant_loop_for_expr
1923 (loop
, DR_BASE_ADDRESS (dr
));
1924 if ((ivloop
&& !ivloop0
)
1925 || (ivloop
&& ivloop0
1926 && flow_loop_nested_p (ivloop
, ivloop0
)))
1930 one_misalignment_unknown
= true;
1932 /* Check for data refs with unsupportable alignment that
1934 if (!supportable_dr_alignment
)
1936 one_dr_unsupportable
= true;
1937 unsupportable_dr_info
= dr_info
;
1940 if (!first_store
&& DR_IS_WRITE (dr
))
1942 first_store
= dr_info
;
1943 first_store_same_align_drs
= same_align_drs
;
1949 if (!aligned_access_p (dr_info
))
1951 if (dump_enabled_p ())
1952 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1953 "vector alignment may not be reachable\n");
1959 /* Check if we can possibly peel the loop. */
1960 if (!vect_can_advance_ivs_p (loop_vinfo
)
1961 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1965 struct _vect_peel_extended_info peel_for_known_alignment
;
1966 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1967 struct _vect_peel_extended_info best_peel
;
1969 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1970 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1971 peel_for_unknown_alignment
.peel_info
.count
= 0;
1974 && one_misalignment_unknown
)
1976 /* Check if the target requires to prefer stores over loads, i.e., if
1977 misaligned stores are more expensive than misaligned loads (taking
1978 drs with same alignment into account). */
1979 unsigned int load_inside_cost
= 0;
1980 unsigned int load_outside_cost
= 0;
1981 unsigned int store_inside_cost
= 0;
1982 unsigned int store_outside_cost
= 0;
1983 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
1985 stmt_vector_for_cost dummy
;
1987 vect_get_peeling_costs_all_drs (loop_vinfo
, dr0_info
,
1990 &dummy
, &dummy
, estimated_npeels
, true);
1996 vect_get_peeling_costs_all_drs (loop_vinfo
, first_store
,
1998 &store_outside_cost
,
2000 estimated_npeels
, true);
2005 store_inside_cost
= INT_MAX
;
2006 store_outside_cost
= INT_MAX
;
2009 if (load_inside_cost
> store_inside_cost
2010 || (load_inside_cost
== store_inside_cost
2011 && load_outside_cost
> store_outside_cost
))
2013 dr0_info
= first_store
;
2014 dr0_same_align_drs
= first_store_same_align_drs
;
2015 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
2016 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
2020 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
2021 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
2024 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2025 prologue_cost_vec
.create (2);
2026 epilogue_cost_vec
.create (2);
2029 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
2030 (loop_vinfo
, estimated_npeels
, &dummy2
,
2031 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2032 &prologue_cost_vec
, &epilogue_cost_vec
);
2034 prologue_cost_vec
.release ();
2035 epilogue_cost_vec
.release ();
2037 peel_for_unknown_alignment
.peel_info
.count
= dr0_same_align_drs
+ 1;
2040 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
2041 peel_for_unknown_alignment
.peel_info
.dr_info
= dr0_info
;
2043 best_peel
= peel_for_unknown_alignment
;
2045 peel_for_known_alignment
.inside_cost
= INT_MAX
;
2046 peel_for_known_alignment
.outside_cost
= INT_MAX
;
2047 peel_for_known_alignment
.peel_info
.count
= 0;
2048 peel_for_known_alignment
.peel_info
.dr_info
= NULL
;
2050 if (do_peeling
&& one_misalignment_known
)
2052 /* Peeling is possible, but there is no data access that is not supported
2053 unless aligned. So we try to choose the best possible peeling from
2055 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
2056 (&peeling_htab
, loop_vinfo
);
2059 /* Compare costs of peeling for known and unknown alignment. */
2060 if (peel_for_known_alignment
.peel_info
.dr_info
!= NULL
2061 && peel_for_unknown_alignment
.inside_cost
2062 >= peel_for_known_alignment
.inside_cost
)
2064 best_peel
= peel_for_known_alignment
;
2066 /* If the best peeling for known alignment has NPEEL == 0, perform no
2067 peeling at all except if there is an unsupportable dr that we can
2069 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
2073 /* If there is an unsupportable data ref, prefer this over all choices so far
2074 since we'd have to discard a chosen peeling except when it accidentally
2075 aligned the unsupportable data ref. */
2076 if (one_dr_unsupportable
)
2077 dr0_info
= unsupportable_dr_info
;
2078 else if (do_peeling
)
2080 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2081 TODO: Use nopeel_outside_cost or get rid of it? */
2082 unsigned nopeel_inside_cost
= 0;
2083 unsigned nopeel_outside_cost
= 0;
2085 stmt_vector_for_cost dummy
;
2087 vect_get_peeling_costs_all_drs (loop_vinfo
, NULL
, &nopeel_inside_cost
,
2088 &nopeel_outside_cost
, &dummy
, &dummy
,
2092 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2093 costs will be recorded. */
2094 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2095 prologue_cost_vec
.create (2);
2096 epilogue_cost_vec
.create (2);
2099 nopeel_outside_cost
+= vect_get_known_peeling_cost
2100 (loop_vinfo
, 0, &dummy2
,
2101 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2102 &prologue_cost_vec
, &epilogue_cost_vec
);
2104 prologue_cost_vec
.release ();
2105 epilogue_cost_vec
.release ();
2107 npeel
= best_peel
.peel_info
.npeel
;
2108 dr0_info
= best_peel
.peel_info
.dr_info
;
2110 /* If no peeling is not more expensive than the best peeling we
2111 have so far, don't perform any peeling. */
2112 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
2118 stmt_vec_info stmt_info
= dr0_info
->stmt
;
2119 if (known_alignment_for_access_p (dr0_info
))
2121 bool negative
= tree_int_cst_compare (DR_STEP (dr0_info
->dr
),
2122 size_zero_node
) < 0;
2125 /* Since it's known at compile time, compute the number of
2126 iterations in the peeled loop (the peeling factor) for use in
2127 updating DR_MISALIGNMENT values. The peeling factor is the
2128 vectorization factor minus the misalignment as an element
2131 ? DR_MISALIGNMENT (dr0_info
)
2132 : -DR_MISALIGNMENT (dr0_info
));
2133 /* If known_alignment_for_access_p then we have set
2134 DR_MISALIGNMENT which is only done if we know it at compiler
2135 time, so it is safe to assume target alignment is constant.
2137 unsigned int target_align
=
2138 DR_TARGET_ALIGNMENT (dr0_info
).to_constant ();
2139 npeel
= ((mis
& (target_align
- 1))
2140 / vect_get_scalar_dr_size (dr0_info
));
2143 /* For interleaved data access every iteration accesses all the
2144 members of the group, therefore we divide the number of iterations
2145 by the group size. */
2146 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2147 npeel
/= DR_GROUP_SIZE (stmt_info
);
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_NOTE
, vect_location
,
2151 "Try peeling by %d\n", npeel
);
2154 /* Ensure that all datarefs can be vectorized after the peel. */
2155 if (!vect_peeling_supportable (loop_vinfo
, dr0_info
, npeel
))
2158 /* Check if all datarefs are supportable and log. */
2159 if (do_peeling
&& known_alignment_for_access_p (dr0_info
) && npeel
== 0)
2160 return opt_result::success ();
2162 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2165 unsigned max_allowed_peel
2166 = param_vect_max_peeling_for_alignment
;
2167 if (flag_vect_cost_model
== VECT_COST_MODEL_CHEAP
)
2168 max_allowed_peel
= 0;
2169 if (max_allowed_peel
!= (unsigned)-1)
2171 unsigned max_peel
= npeel
;
2174 poly_uint64 target_align
= DR_TARGET_ALIGNMENT (dr0_info
);
2175 unsigned HOST_WIDE_INT target_align_c
;
2176 if (target_align
.is_constant (&target_align_c
))
2178 target_align_c
/ vect_get_scalar_dr_size (dr0_info
) - 1;
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE
, vect_location
,
2184 "Disable peeling, max peels set and vector"
2185 " alignment unknown\n");
2188 if (max_peel
> max_allowed_peel
)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_NOTE
, vect_location
,
2193 "Disable peeling, max peels reached: %d\n", max_peel
);
2198 /* Cost model #2 - if peeling may result in a remaining loop not
2199 iterating enough to be vectorized then do not peel. Since this
2200 is a cost heuristic rather than a correctness decision, use the
2201 most likely runtime value for variable vectorization factors. */
2203 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2205 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2206 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2207 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2208 < assumed_vf
+ max_peel
)
2214 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2215 If the misalignment of DR_i is identical to that of dr0 then set
2216 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2217 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2218 by the peeling factor times the element size of DR_i (MOD the
2219 vectorization factor times the size). Otherwise, the
2220 misalignment of DR_i must be set to unknown. */
2221 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2222 if (dr
!= dr0_info
->dr
)
2224 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2225 if (!vect_relevant_for_alignment_p (dr_info
))
2228 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
2231 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0_info
;
2233 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2235 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2236 = DR_MISALIGNMENT (dr0_info
);
2237 SET_DR_MISALIGNMENT (dr0_info
, 0);
2238 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_NOTE
, vect_location
,
2241 "Alignment of access forced using peeling.\n");
2242 dump_printf_loc (MSG_NOTE
, vect_location
,
2243 "Peeling for alignment will be applied.\n");
2246 /* The inside-loop cost will be accounted for in vectorizable_load
2247 and vectorizable_store correctly with adjusted alignments.
2248 Drop the body_cst_vec on the floor here. */
2249 return opt_result::success ();
2253 /* (2) Versioning to force alignment. */
2255 /* Try versioning if:
2256 1) optimize loop for speed and the cost-model is not cheap
2257 2) there is at least one unsupported misaligned data ref with an unknown
2259 3) all misaligned data refs with a known misalignment are supported, and
2260 4) the number of runtime alignment checks is within reason. */
2263 = (optimize_loop_nest_for_speed_p (loop
)
2264 && !loop
->inner
/* FORNOW */
2265 && flag_vect_cost_model
!= VECT_COST_MODEL_CHEAP
);
2269 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2271 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2272 if (aligned_access_p (dr_info
)
2273 || !vect_relevant_for_alignment_p (dr_info
))
2276 stmt_vec_info stmt_info
= dr_info
->stmt
;
2277 if (STMT_VINFO_STRIDED_P (stmt_info
))
2279 do_versioning
= false;
2283 supportable_dr_alignment
2284 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, false);
2286 if (!supportable_dr_alignment
)
2291 if (known_alignment_for_access_p (dr_info
)
2292 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2293 >= (unsigned) param_vect_max_version_for_alignment_checks
)
2295 do_versioning
= false;
2299 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2300 gcc_assert (vectype
);
2302 /* At present we don't support versioning for alignment
2303 with variable VF, since there's no guarantee that the
2304 VF is a power of two. We could relax this if we added
2305 a way of enforcing a power-of-two size. */
2306 unsigned HOST_WIDE_INT size
;
2307 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2309 do_versioning
= false;
2313 /* Forcing alignment in the first iteration is no good if
2314 we don't keep it across iterations. For now, just disable
2315 versioning in this case.
2316 ?? We could actually unroll the loop to achieve the required
2317 overall step alignment, and forcing the alignment could be
2318 done by doing some iterations of the non-vectorized loop. */
2319 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
2320 * DR_STEP_ALIGNMENT (dr
),
2321 DR_TARGET_ALIGNMENT (dr_info
)))
2323 do_versioning
= false;
2327 /* The rightmost bits of an aligned address must be zeros.
2328 Construct the mask needed for this test. For example,
2329 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2330 mask must be 15 = 0xf. */
2333 /* FORNOW: use the same mask to test all potentially unaligned
2334 references in the loop. */
2335 if (LOOP_VINFO_PTR_MASK (loop_vinfo
)
2336 && LOOP_VINFO_PTR_MASK (loop_vinfo
) != mask
)
2338 do_versioning
= false;
2342 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2343 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (stmt_info
);
2347 /* Versioning requires at least one misaligned data reference. */
2348 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2349 do_versioning
= false;
2350 else if (!do_versioning
)
2351 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2356 vec
<stmt_vec_info
> may_misalign_stmts
2357 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2358 stmt_vec_info stmt_info
;
2360 /* It can now be assumed that the data references in the statements
2361 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2362 of the loop being vectorized. */
2363 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt_info
)
2365 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
2366 SET_DR_MISALIGNMENT (dr_info
, 0);
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE
, vect_location
,
2369 "Alignment of access forced using versioning.\n");
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_NOTE
, vect_location
,
2374 "Versioning for alignment will be applied.\n");
2376 /* Peeling and versioning can't be done together at this time. */
2377 gcc_assert (! (do_peeling
&& do_versioning
));
2379 return opt_result::success ();
2382 /* This point is reached if neither peeling nor versioning is being done. */
2383 gcc_assert (! (do_peeling
|| do_versioning
));
2385 return opt_result::success ();
2389 /* Function vect_analyze_data_refs_alignment
2391 Analyze the alignment of the data-references in the loop.
2392 Return FALSE if a data reference is found that cannot be vectorized. */
2395 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
)
2397 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2399 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2400 struct data_reference
*dr
;
2403 vect_record_base_alignments (loop_vinfo
);
2404 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2406 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2407 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
))
2408 vect_compute_data_ref_alignment (loop_vinfo
, dr_info
);
2411 return opt_result::success ();
2415 /* Analyze alignment of DRs of stmts in NODE. */
2418 vect_slp_analyze_node_alignment (vec_info
*vinfo
, slp_tree node
)
2420 /* We vectorize from the first scalar stmt in the node unless
2421 the node is permuted in which case we start from the first
2422 element in the group. */
2423 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2424 dr_vec_info
*first_dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2425 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2426 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
2428 /* We need to commit to a vector type for the group now. */
2429 if (is_a
<bb_vec_info
> (vinfo
)
2430 && !vect_update_shared_vectype (first_stmt_info
, SLP_TREE_VECTYPE (node
)))
2433 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2434 vect_compute_data_ref_alignment (vinfo
, dr_info
);
2435 /* In several places we need alignment of the first element anyway. */
2436 if (dr_info
!= first_dr_info
)
2437 vect_compute_data_ref_alignment (vinfo
, first_dr_info
);
2439 /* For creating the data-ref pointer we need alignment of the
2440 first element as well. */
2441 first_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
2442 if (first_stmt_info
!= SLP_TREE_SCALAR_STMTS (node
)[0])
2444 first_dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2445 if (dr_info
!= first_dr_info
)
2446 vect_compute_data_ref_alignment (vinfo
, first_dr_info
);
2452 /* Function vect_slp_analyze_instance_alignment
2454 Analyze the alignment of the data-references in the SLP instance.
2455 Return FALSE if a data reference is found that cannot be vectorized. */
2458 vect_slp_analyze_instance_alignment (vec_info
*vinfo
,
2459 slp_instance instance
)
2461 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2465 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2466 if (! vect_slp_analyze_node_alignment (vinfo
, node
))
2469 node
= SLP_INSTANCE_TREE (instance
);
2470 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
2471 && ! vect_slp_analyze_node_alignment
2472 (vinfo
, SLP_INSTANCE_TREE (instance
)))
2479 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2480 accesses of legal size, step, etc. Detect gaps, single element
2481 interleaving, and other special cases. Set grouped access info.
2482 Collect groups of strided stores for further use in SLP analysis.
2483 Worker for vect_analyze_group_access. */
2486 vect_analyze_group_access_1 (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2488 data_reference
*dr
= dr_info
->dr
;
2489 tree step
= DR_STEP (dr
);
2490 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2491 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2492 stmt_vec_info stmt_info
= dr_info
->stmt
;
2493 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2494 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
2495 HOST_WIDE_INT dr_step
= -1;
2496 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2497 bool slp_impossible
= false;
2499 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2500 size of the interleaving group (including gaps). */
2501 if (tree_fits_shwi_p (step
))
2503 dr_step
= tree_to_shwi (step
);
2504 /* Check that STEP is a multiple of type size. Otherwise there is
2505 a non-element-sized gap at the end of the group which we
2506 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2507 ??? As we can handle non-constant step fine here we should
2508 simply remove uses of DR_GROUP_GAP between the last and first
2509 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2510 simply not include that gap. */
2511 if ((dr_step
% type_size
) != 0)
2513 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_NOTE
, vect_location
,
2515 "Step %T is not a multiple of the element size"
2520 groupsize
= absu_hwi (dr_step
) / type_size
;
2525 /* Not consecutive access is possible only if it is a part of interleaving. */
2526 if (!DR_GROUP_FIRST_ELEMENT (stmt_info
))
2528 /* Check if it this DR is a part of interleaving, and is a single
2529 element of the group that is accessed in the loop. */
2531 /* Gaps are supported only for loads. STEP must be a multiple of the type
2534 && (dr_step
% type_size
) == 0
2537 DR_GROUP_FIRST_ELEMENT (stmt_info
) = stmt_info
;
2538 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2539 DR_GROUP_GAP (stmt_info
) = groupsize
- 1;
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_NOTE
, vect_location
,
2542 "Detected single element interleaving %T"
2549 if (dump_enabled_p ())
2550 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2551 "not consecutive access %G", stmt_info
->stmt
);
2555 /* Mark the statement as unvectorizable. */
2556 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2562 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2566 if (DR_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
)
2568 /* First stmt in the interleaving chain. Check the chain. */
2569 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2570 struct data_reference
*data_ref
= dr
;
2571 unsigned int count
= 1;
2572 tree prev_init
= DR_INIT (data_ref
);
2573 HOST_WIDE_INT diff
, gaps
= 0;
2575 /* By construction, all group members have INTEGER_CST DR_INITs. */
2578 /* We never have the same DR multiple times. */
2579 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref
),
2580 DR_INIT (STMT_VINFO_DATA_REF (next
))) != 0);
2582 data_ref
= STMT_VINFO_DATA_REF (next
);
2584 /* All group members have the same STEP by construction. */
2585 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2587 /* Check that the distance between two accesses is equal to the type
2588 size. Otherwise, we have gaps. */
2589 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2590 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2593 /* FORNOW: SLP of accesses with gaps is not supported. */
2594 slp_impossible
= true;
2595 if (DR_IS_WRITE (data_ref
))
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2599 "interleaved store with gaps\n");
2606 last_accessed_element
+= diff
;
2608 /* Store the gap from the previous member of the group. If there is no
2609 gap in the access, DR_GROUP_GAP is always 1. */
2610 DR_GROUP_GAP (next
) = diff
;
2612 prev_init
= DR_INIT (data_ref
);
2613 next
= DR_GROUP_NEXT_ELEMENT (next
);
2614 /* Count the number of data-refs in the chain. */
2619 groupsize
= count
+ gaps
;
2621 /* This could be UINT_MAX but as we are generating code in a very
2622 inefficient way we have to cap earlier. See PR78699 for example. */
2623 if (groupsize
> 4096)
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2627 "group is too large\n");
2631 /* Check that the size of the interleaving is equal to count for stores,
2632 i.e., that there are no gaps. */
2633 if (groupsize
!= count
2634 && !DR_IS_READ (dr
))
2637 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2640 /* If there is a gap after the last load in the group it is the
2641 difference between the groupsize and the last accessed
2643 When there is no gap, this difference should be 0. */
2644 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
2646 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE
, vect_location
,
2650 "Detected interleaving ");
2651 if (DR_IS_READ (dr
))
2652 dump_printf (MSG_NOTE
, "load ");
2653 else if (STMT_VINFO_STRIDED_P (stmt_info
))
2654 dump_printf (MSG_NOTE
, "strided store ");
2656 dump_printf (MSG_NOTE
, "store ");
2657 dump_printf (MSG_NOTE
, "of size %u\n",
2658 (unsigned)groupsize
);
2659 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
2660 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2663 if (DR_GROUP_GAP (next
) != 1)
2664 dump_printf_loc (MSG_NOTE
, vect_location
,
2665 "\t<gap of %d elements>\n",
2666 DR_GROUP_GAP (next
) - 1);
2667 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
2668 next
= DR_GROUP_NEXT_ELEMENT (next
);
2670 if (DR_GROUP_GAP (stmt_info
) != 0)
2671 dump_printf_loc (MSG_NOTE
, vect_location
,
2672 "\t<gap of %d elements>\n",
2673 DR_GROUP_GAP (stmt_info
));
2676 /* SLP: create an SLP data structure for every interleaving group of
2677 stores for further analysis in vect_analyse_slp. */
2678 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2681 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
2683 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
2690 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2691 accesses of legal size, step, etc. Detect gaps, single element
2692 interleaving, and other special cases. Set grouped access info.
2693 Collect groups of strided stores for further use in SLP analysis. */
2696 vect_analyze_group_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2698 if (!vect_analyze_group_access_1 (vinfo
, dr_info
))
2700 /* Dissolve the group if present. */
2701 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
2704 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2705 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2706 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
2714 /* Analyze the access pattern of the data-reference DR_INFO.
2715 In case of non-consecutive accesses call vect_analyze_group_access() to
2716 analyze groups of accesses. */
2719 vect_analyze_data_ref_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2721 data_reference
*dr
= dr_info
->dr
;
2722 tree step
= DR_STEP (dr
);
2723 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2724 stmt_vec_info stmt_info
= dr_info
->stmt
;
2725 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2726 class loop
*loop
= NULL
;
2728 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2732 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2734 if (loop_vinfo
&& !step
)
2736 if (dump_enabled_p ())
2737 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2738 "bad data-ref access in loop\n");
2742 /* Allow loads with zero step in inner-loop vectorization. */
2743 if (loop_vinfo
&& integer_zerop (step
))
2745 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2746 if (!nested_in_vect_loop_p (loop
, stmt_info
))
2747 return DR_IS_READ (dr
);
2748 /* Allow references with zero step for outer loops marked
2749 with pragma omp simd only - it guarantees absence of
2750 loop-carried dependencies between inner loop iterations. */
2751 if (loop
->safelen
< 2)
2753 if (dump_enabled_p ())
2754 dump_printf_loc (MSG_NOTE
, vect_location
,
2755 "zero step in inner loop of nest\n");
2760 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2762 /* Interleaved accesses are not yet supported within outer-loop
2763 vectorization for references in the inner-loop. */
2764 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2766 /* For the rest of the analysis we use the outer-loop step. */
2767 step
= STMT_VINFO_DR_STEP (stmt_info
);
2768 if (integer_zerop (step
))
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE
, vect_location
,
2772 "zero step in outer loop.\n");
2773 return DR_IS_READ (dr
);
2778 if (TREE_CODE (step
) == INTEGER_CST
)
2780 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2781 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2783 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2785 /* Mark that it is not interleaving. */
2786 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2791 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2793 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE
, vect_location
,
2795 "grouped access in outer loop.\n");
2800 /* Assume this is a DR handled by non-constant strided load case. */
2801 if (TREE_CODE (step
) != INTEGER_CST
)
2802 return (STMT_VINFO_STRIDED_P (stmt_info
)
2803 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2804 || vect_analyze_group_access (vinfo
, dr_info
)));
2806 /* Not consecutive access - check if it's a part of interleaving group. */
2807 return vect_analyze_group_access (vinfo
, dr_info
);
2810 typedef std::pair
<data_reference_p
, int> data_ref_pair
;
2812 /* Compare two data-references DRA and DRB to group them into chunks
2813 suitable for grouping. */
2816 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2818 data_ref_pair dra_pair
= *(data_ref_pair
*)const_cast<void *>(dra_
);
2819 data_ref_pair drb_pair
= *(data_ref_pair
*)const_cast<void *>(drb_
);
2820 data_reference_p dra
= dra_pair
.first
;
2821 data_reference_p drb
= drb_pair
.first
;
2824 /* Stabilize sort. */
2828 /* DRs in different basic-blocks never belong to the same group. */
2829 int bb_index1
= gimple_bb (DR_STMT (dra
))->index
;
2830 int bb_index2
= gimple_bb (DR_STMT (drb
))->index
;
2831 if (bb_index1
!= bb_index2
)
2832 return bb_index1
< bb_index2
? -1 : 1;
2834 /* Different group IDs lead never belong to the same group. */
2835 if (dra_pair
.second
!= drb_pair
.second
)
2836 return dra_pair
.second
< drb_pair
.second
? -1 : 1;
2838 /* Ordering of DRs according to base. */
2839 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2840 DR_BASE_ADDRESS (drb
));
2844 /* And according to DR_OFFSET. */
2845 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2849 /* Put reads before writes. */
2850 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2851 return DR_IS_READ (dra
) ? -1 : 1;
2853 /* Then sort after access size. */
2854 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2855 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2859 /* And after step. */
2860 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2864 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2865 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2867 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2871 /* If OP is the result of a conversion, return the unconverted value,
2872 otherwise return null. */
2875 strip_conversion (tree op
)
2877 if (TREE_CODE (op
) != SSA_NAME
)
2879 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
2880 if (!is_gimple_assign (stmt
)
2881 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
2883 return gimple_assign_rhs1 (stmt
);
2886 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2887 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2888 be grouped in SLP mode. */
2891 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
,
2894 if (gimple_assign_single_p (stmt1_info
->stmt
))
2895 return gimple_assign_single_p (stmt2_info
->stmt
);
2897 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
2898 if (call1
&& gimple_call_internal_p (call1
))
2900 /* Check for two masked loads or two masked stores. */
2901 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
2902 if (!call2
|| !gimple_call_internal_p (call2
))
2904 internal_fn ifn
= gimple_call_internal_fn (call1
);
2905 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
2907 if (ifn
!= gimple_call_internal_fn (call2
))
2910 /* Check that the masks are the same. Cope with casts of masks,
2911 like those created by build_mask_conversion. */
2912 tree mask1
= gimple_call_arg (call1
, 2);
2913 tree mask2
= gimple_call_arg (call2
, 2);
2914 if (!operand_equal_p (mask1
, mask2
, 0)
2915 && (ifn
== IFN_MASK_STORE
|| !allow_slp_p
))
2917 mask1
= strip_conversion (mask1
);
2920 mask2
= strip_conversion (mask2
);
2923 if (!operand_equal_p (mask1
, mask2
, 0))
2932 /* Function vect_analyze_data_ref_accesses.
2934 Analyze the access pattern of all the data references in the loop.
2936 FORNOW: the only access pattern that is considered vectorizable is a
2937 simple step 1 (consecutive) access.
2939 FORNOW: handle only arrays and pointer accesses. */
2942 vect_analyze_data_ref_accesses (vec_info
*vinfo
,
2943 vec
<int> *dataref_groups
)
2946 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
2948 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2950 if (datarefs
.is_empty ())
2951 return opt_result::success ();
2953 /* Sort the array of datarefs to make building the interleaving chains
2954 linear. Don't modify the original vector's order, it is needed for
2955 determining what dependencies are reversed. */
2956 vec
<data_ref_pair
> datarefs_copy
;
2957 datarefs_copy
.create (datarefs
.length ());
2958 for (unsigned i
= 0; i
< datarefs
.length (); i
++)
2960 int group_id
= dataref_groups
? (*dataref_groups
)[i
] : 0;
2961 datarefs_copy
.quick_push (data_ref_pair (datarefs
[i
], group_id
));
2963 datarefs_copy
.qsort (dr_group_sort_cmp
);
2964 hash_set
<stmt_vec_info
> to_fixup
;
2966 /* Build the interleaving chains. */
2967 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2969 data_reference_p dra
= datarefs_copy
[i
].first
;
2970 int dra_group_id
= datarefs_copy
[i
].second
;
2971 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
2972 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
2973 stmt_vec_info lastinfo
= NULL
;
2974 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
2975 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
2980 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2982 data_reference_p drb
= datarefs_copy
[i
].first
;
2983 int drb_group_id
= datarefs_copy
[i
].second
;
2984 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
2985 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
2986 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
2987 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
2990 /* ??? Imperfect sorting (non-compatible types, non-modulo
2991 accesses, same accesses) can lead to a group to be artificially
2992 split here as we don't just skip over those. If it really
2993 matters we can push those to a worklist and re-iterate
2994 over them. The we can just skip ahead to the next DR here. */
2996 /* DRs in a different BBs should not be put into the same
2997 interleaving group. */
2998 int bb_index1
= gimple_bb (DR_STMT (dra
))->index
;
2999 int bb_index2
= gimple_bb (DR_STMT (drb
))->index
;
3000 if (bb_index1
!= bb_index2
)
3003 if (dra_group_id
!= drb_group_id
)
3006 /* Check that the data-refs have same first location (except init)
3007 and they are both either store or load (not load and store,
3008 not masked loads or stores). */
3009 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
3010 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
3011 DR_BASE_ADDRESS (drb
)) != 0
3012 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
3013 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3016 /* Check that the data-refs have the same constant size. */
3017 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3018 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3019 if (!tree_fits_uhwi_p (sza
)
3020 || !tree_fits_uhwi_p (szb
)
3021 || !tree_int_cst_equal (sza
, szb
))
3024 /* Check that the data-refs have the same step. */
3025 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3028 /* Check the types are compatible.
3029 ??? We don't distinguish this during sorting. */
3030 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3031 TREE_TYPE (DR_REF (drb
))))
3034 /* Check that the DR_INITs are compile-time constants. */
3035 if (TREE_CODE (DR_INIT (dra
)) != INTEGER_CST
3036 || TREE_CODE (DR_INIT (drb
)) != INTEGER_CST
)
3039 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3040 just hold extra information. */
3041 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3042 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3043 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3046 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3047 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3048 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3049 HOST_WIDE_INT init_prev
3050 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1].first
));
3051 gcc_assert (init_a
<= init_b
3052 && init_a
<= init_prev
3053 && init_prev
<= init_b
);
3055 /* Do not place the same access in the interleaving chain twice. */
3056 if (init_b
== init_prev
)
3058 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1].first
))
3059 < gimple_uid (DR_STMT (drb
)));
3060 /* Simply link in duplicates and fix up the chain below. */
3064 /* If init_b == init_a + the size of the type * k, we have an
3065 interleaving, and DRA is accessed before DRB. */
3066 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3067 if (type_size_a
== 0
3068 || (init_b
- init_a
) % type_size_a
!= 0)
3071 /* If we have a store, the accesses are adjacent. This splits
3072 groups into chunks we support (we don't support vectorization
3073 of stores with gaps). */
3074 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
3077 /* If the step (if not zero or non-constant) is smaller than the
3078 difference between data-refs' inits this splits groups into
3080 if (tree_fits_shwi_p (DR_STEP (dra
)))
3082 unsigned HOST_WIDE_INT step
3083 = absu_hwi (tree_to_shwi (DR_STEP (dra
)));
3085 && step
<= (unsigned HOST_WIDE_INT
)(init_b
- init_a
))
3090 if (dump_enabled_p ())
3091 dump_printf_loc (MSG_NOTE
, vect_location
,
3093 ? "Detected interleaving load %T and %T\n"
3094 : "Detected interleaving store %T and %T\n",
3095 DR_REF (dra
), DR_REF (drb
));
3097 /* Link the found element into the group list. */
3098 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3100 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3101 lastinfo
= stmtinfo_a
;
3103 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3104 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3105 lastinfo
= stmtinfo_b
;
3107 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3108 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3110 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3111 dump_printf_loc (MSG_NOTE
, vect_location
,
3112 "Load suitable for SLP vectorization only.\n");
3114 if (init_b
== init_prev
3115 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3116 && dump_enabled_p ())
3117 dump_printf_loc (MSG_NOTE
, vect_location
,
3118 "Queuing group with duplicate access for fixup\n");
3122 /* Fixup groups with duplicate entries by splitting it. */
3125 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3126 if (!(it
!= to_fixup
.end ()))
3128 stmt_vec_info grp
= *it
;
3129 to_fixup
.remove (grp
);
3131 /* Find the earliest duplicate group member. */
3132 unsigned first_duplicate
= -1u;
3133 stmt_vec_info next
, g
= grp
;
3134 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3136 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3137 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3138 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3139 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3142 if (first_duplicate
== -1U)
3145 /* Then move all stmts after the first duplicate to a new group.
3146 Note this is a heuristic but one with the property that *it
3147 is fixed up completely. */
3149 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3150 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3152 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3154 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3158 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3160 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3163 g
= DR_GROUP_NEXT_ELEMENT (g
);
3165 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3167 /* Fixup the new group which still may contain duplicates. */
3168 to_fixup
.add (newgroup
);
3171 data_ref_pair
*dr_pair
;
3172 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr_pair
)
3174 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr_pair
->first
);
3175 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3176 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3180 "not vectorized: complicated access pattern.\n");
3182 if (is_a
<bb_vec_info
> (vinfo
))
3184 /* Mark the statement as not vectorizable. */
3185 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3190 datarefs_copy
.release ();
3191 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3193 " complicated access pattern.\n");
3198 datarefs_copy
.release ();
3199 return opt_result::success ();
3202 /* Function vect_vfa_segment_size.
3205 DR_INFO: The data reference.
3206 LENGTH_FACTOR: segment length to consider.
3208 Return a value suitable for the dr_with_seg_len::seg_len field.
3209 This is the "distance travelled" by the pointer from the first
3210 iteration in the segment to the last. Note that it does not include
3211 the size of the access; in effect it only describes the first byte. */
3214 vect_vfa_segment_size (dr_vec_info
*dr_info
, tree length_factor
)
3216 length_factor
= size_binop (MINUS_EXPR
,
3217 fold_convert (sizetype
, length_factor
),
3219 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr_info
->dr
)),
3223 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3224 gives the worst-case number of bytes covered by the segment. */
3226 static unsigned HOST_WIDE_INT
3227 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3229 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3230 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3231 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3232 unsigned HOST_WIDE_INT access_size
= ref_size
;
3233 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3235 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3236 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3238 if (STMT_VINFO_VEC_STMTS (stmt_vinfo
).exists ()
3239 && (vect_supportable_dr_alignment (vinfo
, dr_info
, false)
3240 == dr_explicit_realign_optimized
))
3242 /* We might access a full vector's worth. */
3243 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3244 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3249 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3253 vect_vfa_align (dr_vec_info
*dr_info
)
3255 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info
->dr
)));
3258 /* Function vect_no_alias_p.
3260 Given data references A and B with equal base and offset, see whether
3261 the alias relation can be decided at compilation time. Return 1 if
3262 it can and the references alias, 0 if it can and the references do
3263 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3264 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3265 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3268 vect_compile_time_alias (dr_vec_info
*a
, dr_vec_info
*b
,
3269 tree segment_length_a
, tree segment_length_b
,
3270 unsigned HOST_WIDE_INT access_size_a
,
3271 unsigned HOST_WIDE_INT access_size_b
)
3273 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
->dr
));
3274 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
->dr
));
3275 poly_uint64 const_length_a
;
3276 poly_uint64 const_length_b
;
3278 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3279 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3281 if (tree_int_cst_compare (DR_STEP (a
->dr
), size_zero_node
) < 0)
3283 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3284 offset_a
-= const_length_a
;
3287 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3288 if (tree_int_cst_compare (DR_STEP (b
->dr
), size_zero_node
) < 0)
3290 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3291 offset_b
-= const_length_b
;
3294 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3296 const_length_a
+= access_size_a
;
3297 const_length_b
+= access_size_b
;
3299 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3300 offset_b
, const_length_b
))
3303 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3304 offset_b
, const_length_b
))
3310 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3314 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3315 unsigned int loop_depth
, poly_uint64 vf
)
3317 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3318 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3321 /* If the dependence is exact, we should have limited the VF instead. */
3322 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3325 lambda_vector dist_v
;
3326 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3328 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3330 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3331 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3335 if (dump_enabled_p ())
3336 dump_printf_loc (MSG_NOTE
, vect_location
,
3337 "dependence distance between %T and %T is >= VF\n",
3338 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3343 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3346 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3348 dump_printf (dump_kind
, "%s (%T) >= ",
3349 lower_bound
.unsigned_p
? "unsigned" : "abs",
3351 dump_dec (dump_kind
, lower_bound
.min_value
);
3354 /* Record that the vectorized loop requires the vec_lower_bound described
3355 by EXPR, UNSIGNED_P and MIN_VALUE. */
3358 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3359 poly_uint64 min_value
)
3361 vec
<vec_lower_bound
> lower_bounds
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3362 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3363 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3365 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3366 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3367 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3368 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3370 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3371 lower_bounds
[i
].min_value
= min_value
;
3372 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE
, vect_location
,
3375 "updating run-time check to ");
3376 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3377 dump_printf (MSG_NOTE
, "\n");
3383 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3384 if (dump_enabled_p ())
3386 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3387 dump_lower_bound (MSG_NOTE
, lower_bound
);
3388 dump_printf (MSG_NOTE
, "\n");
3390 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3393 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3394 will span fewer than GAP bytes. */
3397 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3400 stmt_vec_info stmt_info
= dr_info
->stmt
;
3402 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3403 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3404 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3405 return (estimated_poly_value (gap
)
3406 <= count
* vect_get_scalar_dr_size (dr_info
));
3409 /* Return true if we know that there is no alias between DR_INFO_A and
3410 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3411 When returning true, set *LOWER_BOUND_OUT to this N. */
3414 vectorizable_with_step_bound_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
,
3415 poly_uint64
*lower_bound_out
)
3417 /* Check that there is a constant gap of known sign between DR_A
3419 data_reference
*dr_a
= dr_info_a
->dr
;
3420 data_reference
*dr_b
= dr_info_b
->dr
;
3421 poly_int64 init_a
, init_b
;
3422 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3423 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3424 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3425 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3426 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3427 || !ordered_p (init_a
, init_b
))
3430 /* Sort DR_A and DR_B by the address they access. */
3431 if (maybe_lt (init_b
, init_a
))
3433 std::swap (init_a
, init_b
);
3434 std::swap (dr_info_a
, dr_info_b
);
3435 std::swap (dr_a
, dr_b
);
3438 /* If the two accesses could be dependent within a scalar iteration,
3439 make sure that we'd retain their order. */
3440 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_info_a
), init_b
)
3441 && !vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
3444 /* There is no alias if abs (DR_STEP) is greater than or equal to
3445 the bytes spanned by the combination of the two accesses. */
3446 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_info_b
) - init_a
;
3450 /* Function vect_prune_runtime_alias_test_list.
3452 Prune a list of ddrs to be tested at run-time by versioning for alias.
3453 Merge several alias checks into one if possible.
3454 Return FALSE if resulting list of ddrs is longer then allowed by
3455 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3458 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3460 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3461 hash_set
<tree_pair_hash
> compared_objects
;
3463 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3464 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3465 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3466 vec
<vec_object_pair
> &check_unequal_addrs
3467 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3468 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3469 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3475 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3477 /* Step values are irrelevant for aliasing if the number of vector
3478 iterations is equal to the number of scalar iterations (which can
3479 happen for fully-SLP loops). */
3480 bool ignore_step_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3484 /* Convert the checks for nonzero steps into bound tests. */
3486 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3487 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3490 if (may_alias_ddrs
.is_empty ())
3491 return opt_result::success ();
3493 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3495 unsigned int loop_depth
3496 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3497 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3499 /* First, we collect all data ref pairs for aliasing checks. */
3500 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3502 poly_uint64 lower_bound
;
3503 tree segment_length_a
, segment_length_b
;
3504 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3505 unsigned int align_a
, align_b
;
3507 /* Ignore the alias if the VF we chose ended up being no greater
3508 than the dependence distance. */
3509 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3512 if (DDR_OBJECT_A (ddr
))
3514 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3515 if (!compared_objects
.add (new_pair
))
3517 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE
, vect_location
,
3519 "checking that %T and %T"
3520 " have different addresses\n",
3521 new_pair
.first
, new_pair
.second
);
3522 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3527 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3528 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3530 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3531 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3533 bool preserves_scalar_order_p
3534 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3536 /* Skip the pair if inter-iteration dependencies are irrelevant
3537 and intra-iteration dependencies are guaranteed to be honored. */
3539 && (preserves_scalar_order_p
3540 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3543 if (dump_enabled_p ())
3544 dump_printf_loc (MSG_NOTE
, vect_location
,
3545 "no need for alias check between "
3546 "%T and %T when VF is 1\n",
3547 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3551 /* See whether we can handle the alias using a bounds check on
3552 the step, and whether that's likely to be the best approach.
3553 (It might not be, for example, if the minimum step is much larger
3554 than the number of bytes handled by one vector iteration.) */
3556 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3557 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3559 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3560 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3562 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3563 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3566 "%T and %T when the step %T is outside ",
3567 DR_REF (dr_info_a
->dr
),
3568 DR_REF (dr_info_b
->dr
),
3569 DR_STEP (dr_info_a
->dr
));
3571 dump_printf (MSG_NOTE
, "[0");
3574 dump_printf (MSG_NOTE
, "(");
3575 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3577 dump_printf (MSG_NOTE
, ", ");
3578 dump_dec (MSG_NOTE
, lower_bound
);
3579 dump_printf (MSG_NOTE
, ")\n");
3581 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
3582 unsigned_p
, lower_bound
);
3586 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
3587 if (dr_group_first_a
)
3589 stmt_info_a
= dr_group_first_a
;
3590 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
3593 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
3594 if (dr_group_first_b
)
3596 stmt_info_b
= dr_group_first_b
;
3597 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
3602 segment_length_a
= size_zero_node
;
3603 segment_length_b
= size_zero_node
;
3607 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
3608 DR_STEP (dr_info_b
->dr
), 0))
3609 length_factor
= scalar_loop_iters
;
3611 length_factor
= size_int (vect_factor
);
3612 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
3613 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
3615 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
3616 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
3617 align_a
= vect_vfa_align (dr_info_a
);
3618 align_b
= vect_vfa_align (dr_info_b
);
3620 /* See whether the alias is known at compilation time. */
3621 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
3622 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
3623 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
3624 DR_OFFSET (dr_info_b
->dr
), 0)
3625 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
3626 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
3627 && poly_int_tree_p (segment_length_a
)
3628 && poly_int_tree_p (segment_length_b
))
3630 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
3635 if (res
>= 0 && dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE
, vect_location
,
3638 "can tell at compile time that %T and %T",
3639 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3641 dump_printf (MSG_NOTE
, " do not alias\n");
3643 dump_printf (MSG_NOTE
, " alias\n");
3650 return opt_result::failure_at (stmt_info_b
->stmt
,
3652 " compilation time alias: %G%G",
3657 dr_with_seg_len
dr_a (dr_info_a
->dr
, segment_length_a
,
3658 access_size_a
, align_a
);
3659 dr_with_seg_len
dr_b (dr_info_b
->dr
, segment_length_b
,
3660 access_size_b
, align_b
);
3661 /* Canonicalize the order to be the one that's needed for accurate
3662 RAW, WAR and WAW flags, in cases where the data references are
3663 well-ordered. The order doesn't really matter otherwise,
3664 but we might as well be consistent. */
3665 if (get_later_stmt (stmt_info_a
, stmt_info_b
) == stmt_info_a
)
3666 std::swap (dr_a
, dr_b
);
3668 dr_with_seg_len_pair_t dr_with_seg_len_pair
3669 (dr_a
, dr_b
, (preserves_scalar_order_p
3670 ? dr_with_seg_len_pair_t::WELL_ORDERED
3671 : dr_with_seg_len_pair_t::REORDERED
));
3673 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3676 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
3678 unsigned int count
= (comp_alias_ddrs
.length ()
3679 + check_unequal_addrs
.length ());
3681 if (dump_enabled_p ())
3682 dump_printf_loc (MSG_NOTE
, vect_location
,
3683 "improved number of alias checks from %d to %d\n",
3684 may_alias_ddrs
.length (), count
);
3685 unsigned limit
= param_vect_max_version_for_alias_checks
;
3686 if (flag_simd_cost_model
== VECT_COST_MODEL_CHEAP
)
3687 limit
= param_vect_max_version_for_alias_checks
* 6 / 10;
3689 return opt_result::failure_at
3691 "number of versioning for alias run-time tests exceeds %d "
3692 "(--param vect-max-version-for-alias-checks)\n", limit
);
3694 return opt_result::success ();
3697 /* Check whether we can use an internal function for a gather load
3698 or scatter store. READ_P is true for loads and false for stores.
3699 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3700 the type of the memory elements being loaded or stored. OFFSET_TYPE
3701 is the type of the offset that is being applied to the invariant
3702 base address. SCALE is the amount by which the offset should
3703 be multiplied *after* it has been converted to address width.
3705 Return true if the function is supported, storing the function id in
3706 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3709 vect_gather_scatter_fn_p (vec_info
*vinfo
, bool read_p
, bool masked_p
,
3710 tree vectype
, tree memory_type
, tree offset_type
,
3711 int scale
, internal_fn
*ifn_out
,
3712 tree
*offset_vectype_out
)
3714 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
3715 unsigned int element_bits
= vector_element_bits (vectype
);
3716 if (element_bits
!= memory_bits
)
3717 /* For now the vector elements must be the same width as the
3721 /* Work out which function we need. */
3724 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
3726 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
3730 tree offset_vectype
= get_vectype_for_scalar_type (vinfo
, offset_type
);
3731 if (!offset_vectype
)
3734 /* Test whether the target supports this combination. */
3735 if (internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
3736 offset_vectype
, scale
))
3739 *offset_vectype_out
= offset_vectype
;
3743 if (TYPE_PRECISION (offset_type
) >= POINTER_SIZE
3744 && TYPE_PRECISION (offset_type
) >= element_bits
)
3747 offset_type
= build_nonstandard_integer_type
3748 (TYPE_PRECISION (offset_type
) * 2, TYPE_UNSIGNED (offset_type
));
3752 /* STMT_INFO is a call to an internal gather load or scatter store function.
3753 Describe the operation in INFO. */
3756 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
3757 gather_scatter_info
*info
)
3759 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
3760 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3761 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3763 info
->ifn
= gimple_call_internal_fn (call
);
3764 info
->decl
= NULL_TREE
;
3765 info
->base
= gimple_call_arg (call
, 0);
3766 info
->offset
= gimple_call_arg (call
, 1);
3767 info
->offset_dt
= vect_unknown_def_type
;
3768 info
->offset_vectype
= NULL_TREE
;
3769 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
3770 info
->element_type
= TREE_TYPE (vectype
);
3771 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
3774 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3775 gather load or scatter store. Describe the operation in *INFO if so. */
3778 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
3779 gather_scatter_info
*info
)
3781 HOST_WIDE_INT scale
= 1;
3782 poly_int64 pbitpos
, pbitsize
;
3783 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3784 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3785 tree offtype
= NULL_TREE
;
3786 tree decl
= NULL_TREE
, base
, off
;
3787 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3788 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3790 int punsignedp
, reversep
, pvolatilep
= 0;
3792 tree offset_vectype
;
3793 bool masked_p
= false;
3795 /* See whether this is already a call to a gather/scatter internal function.
3796 If not, see whether it's a masked load or store. */
3797 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
3798 if (call
&& gimple_call_internal_p (call
))
3800 ifn
= gimple_call_internal_fn (call
);
3801 if (internal_gather_scatter_fn_p (ifn
))
3803 vect_describe_gather_scatter_call (stmt_info
, info
);
3806 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
3809 /* True if we should aim to use internal functions rather than
3810 built-in functions. */
3811 bool use_ifn_p
= (DR_IS_READ (dr
)
3812 ? supports_vec_gather_load_p ()
3813 : supports_vec_scatter_store_p ());
3816 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3817 see if we can use the def stmt of the address. */
3819 && TREE_CODE (base
) == MEM_REF
3820 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3821 && integer_zerop (TREE_OPERAND (base
, 1))
3822 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3824 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3825 if (is_gimple_assign (def_stmt
)
3826 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3827 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3830 /* The gather and scatter builtins need address of the form
3831 loop_invariant + vector * {1, 2, 4, 8}
3833 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3834 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3835 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3836 multiplications and additions in it. To get a vector, we need
3837 a single SSA_NAME that will be defined in the loop and will
3838 contain everything that is not loop invariant and that can be
3839 vectorized. The following code attempts to find such a preexistng
3840 SSA_NAME OFF and put the loop invariants into a tree BASE
3841 that can be gimplified before the loop. */
3842 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3843 &punsignedp
, &reversep
, &pvolatilep
);
3847 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3849 if (TREE_CODE (base
) == MEM_REF
)
3851 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3853 if (off
== NULL_TREE
)
3854 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3856 off
= size_binop (PLUS_EXPR
, off
,
3857 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3859 base
= TREE_OPERAND (base
, 0);
3862 base
= build_fold_addr_expr (base
);
3864 if (off
== NULL_TREE
)
3865 off
= size_zero_node
;
3867 /* If base is not loop invariant, either off is 0, then we start with just
3868 the constant offset in the loop invariant BASE and continue with base
3869 as OFF, otherwise give up.
3870 We could handle that case by gimplifying the addition of base + off
3871 into some SSA_NAME and use that as off, but for now punt. */
3872 if (!expr_invariant_in_loop_p (loop
, base
))
3874 if (!integer_zerop (off
))
3877 base
= size_int (pbytepos
);
3879 /* Otherwise put base + constant offset into the loop invariant BASE
3880 and continue with OFF. */
3883 base
= fold_convert (sizetype
, base
);
3884 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3887 /* OFF at this point may be either a SSA_NAME or some tree expression
3888 from get_inner_reference. Try to peel off loop invariants from it
3889 into BASE as long as possible. */
3891 while (offtype
== NULL_TREE
)
3893 enum tree_code code
;
3894 tree op0
, op1
, add
= NULL_TREE
;
3896 if (TREE_CODE (off
) == SSA_NAME
)
3898 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3900 if (expr_invariant_in_loop_p (loop
, off
))
3903 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3906 op0
= gimple_assign_rhs1 (def_stmt
);
3907 code
= gimple_assign_rhs_code (def_stmt
);
3908 op1
= gimple_assign_rhs2 (def_stmt
);
3912 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3914 code
= TREE_CODE (off
);
3915 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3919 case POINTER_PLUS_EXPR
:
3921 if (expr_invariant_in_loop_p (loop
, op0
))
3926 add
= fold_convert (sizetype
, add
);
3928 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3929 base
= size_binop (PLUS_EXPR
, base
, add
);
3932 if (expr_invariant_in_loop_p (loop
, op1
))
3940 if (expr_invariant_in_loop_p (loop
, op1
))
3942 add
= fold_convert (sizetype
, op1
);
3943 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3949 if (scale
== 1 && tree_fits_shwi_p (op1
))
3951 int new_scale
= tree_to_shwi (op1
);
3952 /* Only treat this as a scaling operation if the target
3953 supports it for at least some offset type. */
3955 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3956 masked_p
, vectype
, memory_type
,
3957 signed_char_type_node
,
3960 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3961 masked_p
, vectype
, memory_type
,
3962 unsigned_char_type_node
,
3975 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3976 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3979 /* Don't include the conversion if the target is happy with
3980 the current offset type. */
3982 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3983 masked_p
, vectype
, memory_type
,
3984 TREE_TYPE (off
), scale
, &ifn
,
3988 if (TYPE_PRECISION (TREE_TYPE (op0
))
3989 == TYPE_PRECISION (TREE_TYPE (off
)))
3995 if (TYPE_PRECISION (TREE_TYPE (op0
))
3996 < TYPE_PRECISION (TREE_TYPE (off
)))
3999 offtype
= TREE_TYPE (off
);
4010 /* If at the end OFF still isn't a SSA_NAME or isn't
4011 defined in the loop, punt. */
4012 if (TREE_CODE (off
) != SSA_NAME
4013 || expr_invariant_in_loop_p (loop
, off
))
4016 if (offtype
== NULL_TREE
)
4017 offtype
= TREE_TYPE (off
);
4021 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4022 vectype
, memory_type
, offtype
, scale
,
4023 &ifn
, &offset_vectype
))
4028 if (DR_IS_READ (dr
))
4030 if (targetm
.vectorize
.builtin_gather
)
4031 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4035 if (targetm
.vectorize
.builtin_scatter
)
4036 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4043 /* The offset vector type will be read from DECL when needed. */
4044 offset_vectype
= NULL_TREE
;
4051 info
->offset_dt
= vect_unknown_def_type
;
4052 info
->offset_vectype
= offset_vectype
;
4053 info
->scale
= scale
;
4054 info
->element_type
= TREE_TYPE (vectype
);
4055 info
->memory_type
= memory_type
;
4059 /* Find the data references in STMT, analyze them with respect to LOOP and
4060 append them to DATAREFS. Return false if datarefs in this stmt cannot
4064 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4065 vec
<data_reference_p
> *datarefs
,
4066 vec
<int> *dataref_groups
, int group_id
)
4068 /* We can ignore clobbers for dataref analysis - they are removed during
4069 loop vectorization and BB vectorization checks dependences with a
4071 if (gimple_clobber_p (stmt
))
4072 return opt_result::success ();
4074 if (gimple_has_volatile_ops (stmt
))
4075 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4078 if (stmt_can_throw_internal (cfun
, stmt
))
4079 return opt_result::failure_at (stmt
,
4081 " statement can throw an exception: %G",
4084 auto_vec
<data_reference_p
, 2> refs
;
4085 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4089 if (refs
.is_empty ())
4090 return opt_result::success ();
4092 if (refs
.length () > 1)
4094 while (!refs
.is_empty ())
4095 free_data_ref (refs
.pop ());
4096 return opt_result::failure_at (stmt
,
4097 "not vectorized: more than one "
4098 "data ref in stmt: %G", stmt
);
4101 data_reference_p dr
= refs
.pop ();
4102 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4103 if (!gimple_call_internal_p (call
)
4104 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4105 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4108 return opt_result::failure_at (stmt
,
4109 "not vectorized: dr in a call %G", stmt
);
4112 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4113 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4116 return opt_result::failure_at (stmt
,
4118 " statement is bitfield access %G", stmt
);
4121 if (DR_BASE_ADDRESS (dr
)
4122 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4125 return opt_result::failure_at (stmt
,
4127 " base addr of dr is a constant\n");
4130 /* Check whether this may be a SIMD lane access and adjust the
4131 DR to make it easier for us to handle it. */
4134 && (!DR_BASE_ADDRESS (dr
)
4139 struct data_reference
*newdr
4140 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4141 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4142 if (DR_BASE_ADDRESS (newdr
)
4143 && DR_OFFSET (newdr
)
4146 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4147 && integer_zerop (DR_STEP (newdr
)))
4149 tree base_address
= DR_BASE_ADDRESS (newdr
);
4150 tree off
= DR_OFFSET (newdr
);
4151 tree step
= ssize_int (1);
4152 if (integer_zerop (off
)
4153 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4155 off
= TREE_OPERAND (base_address
, 1);
4156 base_address
= TREE_OPERAND (base_address
, 0);
4159 if (TREE_CODE (off
) == MULT_EXPR
4160 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4162 step
= TREE_OPERAND (off
, 1);
4163 off
= TREE_OPERAND (off
, 0);
4166 if (CONVERT_EXPR_P (off
)
4167 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4168 < TYPE_PRECISION (TREE_TYPE (off
))))
4169 off
= TREE_OPERAND (off
, 0);
4170 if (TREE_CODE (off
) == SSA_NAME
)
4172 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4173 /* Look through widening conversion. */
4174 if (is_gimple_assign (def
)
4175 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4177 tree rhs1
= gimple_assign_rhs1 (def
);
4178 if (TREE_CODE (rhs1
) == SSA_NAME
4179 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4180 && (TYPE_PRECISION (TREE_TYPE (off
))
4181 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4182 def
= SSA_NAME_DEF_STMT (rhs1
);
4184 if (is_gimple_call (def
)
4185 && gimple_call_internal_p (def
)
4186 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4188 tree arg
= gimple_call_arg (def
, 0);
4189 tree reft
= TREE_TYPE (DR_REF (newdr
));
4190 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4191 arg
= SSA_NAME_VAR (arg
);
4192 if (arg
== loop
->simduid
4194 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4196 DR_BASE_ADDRESS (newdr
) = base_address
;
4197 DR_OFFSET (newdr
) = ssize_int (0);
4198 DR_STEP (newdr
) = step
;
4199 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4200 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4201 /* Mark as simd-lane access. */
4202 tree arg2
= gimple_call_arg (def
, 1);
4203 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4205 datarefs
->safe_push (newdr
);
4207 dataref_groups
->safe_push (group_id
);
4208 return opt_result::success ();
4213 free_data_ref (newdr
);
4216 datarefs
->safe_push (dr
);
4218 dataref_groups
->safe_push (group_id
);
4219 return opt_result::success ();
4222 /* Function vect_analyze_data_refs.
4224 Find all the data references in the loop or basic block.
4226 The general structure of the analysis of data refs in the vectorizer is as
4228 1- vect_analyze_data_refs(loop/bb): call
4229 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4230 in the loop/bb and their dependences.
4231 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4232 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4233 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4238 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4240 class loop
*loop
= NULL
;
4242 struct data_reference
*dr
;
4245 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4247 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4248 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4250 /* Go through the data-refs, check that the analysis succeeded. Update
4251 pointer from stmt_vec_info struct to DR and vectype. */
4253 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4254 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4256 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4259 gcc_assert (DR_REF (dr
));
4260 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4261 gcc_assert (!stmt_info
->dr_aux
.dr
);
4262 stmt_info
->dr_aux
.dr
= dr
;
4263 stmt_info
->dr_aux
.stmt
= stmt_info
;
4265 /* Check that analysis of the data-ref succeeded. */
4266 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4271 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4272 && (targetm
.vectorize
.builtin_gather
!= NULL
4273 || supports_vec_gather_load_p ());
4276 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4277 && (targetm
.vectorize
.builtin_scatter
!= NULL
4278 || supports_vec_scatter_store_p ());
4280 /* If target supports vector gather loads or scatter stores,
4281 see if they can't be used. */
4282 if (is_a
<loop_vec_info
> (vinfo
)
4283 && !nested_in_vect_loop_p (loop
, stmt_info
))
4285 if (maybe_gather
|| maybe_scatter
)
4288 gatherscatter
= GATHER
;
4290 gatherscatter
= SCATTER
;
4294 if (gatherscatter
== SG_NONE
)
4296 if (dump_enabled_p ())
4297 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4298 "not vectorized: data ref analysis "
4299 "failed %G", stmt_info
->stmt
);
4300 if (is_a
<bb_vec_info
> (vinfo
))
4302 /* In BB vectorization the ref can still participate
4303 in dependence analysis, we just can't vectorize it. */
4304 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4307 return opt_result::failure_at (stmt_info
->stmt
,
4309 " data ref analysis failed: %G",
4314 /* See if this was detected as SIMD lane access. */
4315 if (dr
->aux
== (void *)-1
4316 || dr
->aux
== (void *)-2
4317 || dr
->aux
== (void *)-3
4318 || dr
->aux
== (void *)-4)
4320 if (nested_in_vect_loop_p (loop
, stmt_info
))
4321 return opt_result::failure_at (stmt_info
->stmt
,
4323 " data ref analysis failed: %G",
4325 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4326 = -(uintptr_t) dr
->aux
;
4329 tree base
= get_base_address (DR_REF (dr
));
4330 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4332 if (dump_enabled_p ())
4333 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4334 "not vectorized: base object not addressable "
4335 "for stmt: %G", stmt_info
->stmt
);
4336 if (is_a
<bb_vec_info
> (vinfo
))
4338 /* In BB vectorization the ref can still participate
4339 in dependence analysis, we just can't vectorize it. */
4340 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4343 return opt_result::failure_at (stmt_info
->stmt
,
4344 "not vectorized: base object not"
4345 " addressable for stmt: %G",
4349 if (is_a
<loop_vec_info
> (vinfo
)
4351 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4353 if (nested_in_vect_loop_p (loop
, stmt_info
))
4354 return opt_result::failure_at (stmt_info
->stmt
,
4356 "not suitable for strided load %G",
4358 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4361 /* Update DR field in stmt_vec_info struct. */
4363 /* If the dataref is in an inner-loop of the loop that is considered for
4364 for vectorization, we also want to analyze the access relative to
4365 the outer-loop (DR contains information only relative to the
4366 inner-most enclosing loop). We do that by building a reference to the
4367 first location accessed by the inner-loop, and analyze it relative to
4369 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4371 /* Build a reference to the first location accessed by the
4372 inner loop: *(BASE + INIT + OFFSET). By construction,
4373 this address must be invariant in the inner loop, so we
4374 can consider it as being used in the outer loop. */
4375 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4376 tree offset
= unshare_expr (DR_OFFSET (dr
));
4377 tree init
= unshare_expr (DR_INIT (dr
));
4378 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4380 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4381 tree init_ref
= build_fold_indirect_ref (init_addr
);
4383 if (dump_enabled_p ())
4384 dump_printf_loc (MSG_NOTE
, vect_location
,
4385 "analyze in outer loop: %T\n", init_ref
);
4388 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4389 init_ref
, loop
, stmt_info
->stmt
);
4391 /* dr_analyze_innermost already explained the failure. */
4394 if (dump_enabled_p ())
4395 dump_printf_loc (MSG_NOTE
, vect_location
,
4396 "\touter base_address: %T\n"
4397 "\touter offset from base address: %T\n"
4398 "\touter constant offset from base address: %T\n"
4399 "\touter step: %T\n"
4400 "\touter base alignment: %d\n\n"
4401 "\touter base misalignment: %d\n"
4402 "\touter offset alignment: %d\n"
4403 "\touter step alignment: %d\n",
4404 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4405 STMT_VINFO_DR_OFFSET (stmt_info
),
4406 STMT_VINFO_DR_INIT (stmt_info
),
4407 STMT_VINFO_DR_STEP (stmt_info
),
4408 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4409 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4410 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4411 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4414 /* Set vectype for STMT. */
4415 scalar_type
= TREE_TYPE (DR_REF (dr
));
4416 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4419 if (dump_enabled_p ())
4421 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4422 "not vectorized: no vectype for stmt: %G",
4424 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4425 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4427 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4430 if (is_a
<bb_vec_info
> (vinfo
))
4432 /* No vector type is fine, the ref can still participate
4433 in dependence analysis, we just can't vectorize it. */
4434 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4439 return opt_result::failure_at (stmt_info
->stmt
,
4441 " no vectype for stmt: %G"
4442 " scalar_type: %T\n",
4443 stmt_info
->stmt
, scalar_type
);
4447 if (dump_enabled_p ())
4448 dump_printf_loc (MSG_NOTE
, vect_location
,
4449 "got vectype for stmt: %G%T\n",
4450 stmt_info
->stmt
, vectype
);
4453 /* Adjust the minimal vectorization factor according to the
4455 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4456 *min_vf
= upper_bound (*min_vf
, vf
);
4458 /* Leave the BB vectorizer to pick the vector type later, based on
4459 the final dataref group size and SLP node size. */
4460 if (is_a
<loop_vec_info
> (vinfo
))
4461 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
4463 if (gatherscatter
!= SG_NONE
)
4465 gather_scatter_info gs_info
;
4466 if (!vect_check_gather_scatter (stmt_info
,
4467 as_a
<loop_vec_info
> (vinfo
),
4469 || !get_vectype_for_scalar_type (vinfo
,
4470 TREE_TYPE (gs_info
.offset
)))
4474 return opt_result::failure_at
4476 (gatherscatter
== GATHER
)
4477 ? "not vectorized: not suitable for gather load %G"
4478 : "not vectorized: not suitable for scatter store %G",
4481 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4485 /* We used to stop processing and prune the list here. Verify we no
4487 gcc_assert (i
== datarefs
.length ());
4489 return opt_result::success ();
4493 /* Function vect_get_new_vect_var.
4495 Returns a name for a new variable. The current naming scheme appends the
4496 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4497 the name of vectorizer generated variables, and appends that to NAME if
4501 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4508 case vect_simple_var
:
4511 case vect_scalar_var
:
4517 case vect_pointer_var
:
4526 char* tmp
= concat (prefix
, "_", name
, NULL
);
4527 new_vect_var
= create_tmp_reg (type
, tmp
);
4531 new_vect_var
= create_tmp_reg (type
, prefix
);
4533 return new_vect_var
;
4536 /* Like vect_get_new_vect_var but return an SSA name. */
4539 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4546 case vect_simple_var
:
4549 case vect_scalar_var
:
4552 case vect_pointer_var
:
4561 char* tmp
= concat (prefix
, "_", name
, NULL
);
4562 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4566 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4568 return new_vect_var
;
4571 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. */
4574 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
4576 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
4577 int misalign
= DR_MISALIGNMENT (dr_info
);
4578 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
4579 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4581 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
),
4582 known_alignment (DR_TARGET_ALIGNMENT (dr_info
)),
4586 /* Function vect_create_addr_base_for_vector_ref.
4588 Create an expression that computes the address of the first memory location
4589 that will be accessed for a data reference.
4592 STMT_INFO: The statement containing the data reference.
4593 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4594 OFFSET: Optional. If supplied, it is be added to the initial address.
4595 LOOP: Specify relative to which loop-nest should the address be computed.
4596 For example, when the dataref is in an inner-loop nested in an
4597 outer-loop that is now being vectorized, LOOP can be either the
4598 outer-loop, or the inner-loop. The first memory location accessed
4599 by the following dataref ('in' points to short):
4606 if LOOP=i_loop: &in (relative to i_loop)
4607 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4608 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4609 initial address. Unlike OFFSET, which is number of elements to
4610 be added, BYTE_OFFSET is measured in bytes.
4613 1. Return an SSA_NAME whose value is the address of the memory location of
4614 the first vector of the data reference.
4615 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4616 these statement(s) which define the returned SSA_NAME.
4618 FORNOW: We are only handling array accesses with step 1. */
4621 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4622 gimple_seq
*new_stmt_list
,
4626 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4627 struct data_reference
*dr
= dr_info
->dr
;
4628 const char *base_name
;
4631 gimple_seq seq
= NULL
;
4633 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4634 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4635 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
4637 tree data_ref_base
= unshare_expr (drb
->base_address
);
4638 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
4639 tree init
= unshare_expr (drb
->init
);
4642 base_name
= get_name (data_ref_base
);
4645 base_offset
= ssize_int (0);
4646 init
= ssize_int (0);
4647 base_name
= get_name (DR_REF (dr
));
4650 /* Create base_offset */
4651 base_offset
= size_binop (PLUS_EXPR
,
4652 fold_convert (sizetype
, base_offset
),
4653 fold_convert (sizetype
, init
));
4657 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4658 fold_convert (sizetype
, offset
), step
);
4659 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4660 base_offset
, offset
);
4664 byte_offset
= fold_convert (sizetype
, byte_offset
);
4665 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4666 base_offset
, byte_offset
);
4669 /* base + base_offset */
4671 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4674 addr_base
= build1 (ADDR_EXPR
,
4675 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4676 unshare_expr (DR_REF (dr
)));
4679 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4680 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4681 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4682 gimple_seq_add_seq (new_stmt_list
, seq
);
4684 if (DR_PTR_INFO (dr
)
4685 && TREE_CODE (addr_base
) == SSA_NAME
4686 && !SSA_NAME_PTR_INFO (addr_base
))
4688 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
4689 if (offset
|| byte_offset
)
4690 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4693 if (dump_enabled_p ())
4694 dump_printf_loc (MSG_NOTE
, vect_location
, "created %T\n", addr_base
);
4700 /* Function vect_create_data_ref_ptr.
4702 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4703 location accessed in the loop by STMT_INFO, along with the def-use update
4704 chain to appropriately advance the pointer through the loop iterations.
4705 Also set aliasing information for the pointer. This pointer is used by
4706 the callers to this function to create a memory reference expression for
4707 vector load/store access.
4710 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4711 GIMPLE_ASSIGN <name, data-ref> or
4712 GIMPLE_ASSIGN <data-ref, name>.
4713 2. AGGR_TYPE: the type of the reference, which should be either a vector
4715 3. AT_LOOP: the loop where the vector memref is to be created.
4716 4. OFFSET (optional): an offset to be added to the initial address accessed
4717 by the data-ref in STMT_INFO.
4718 5. BSI: location where the new stmts are to be placed if there is no loop
4719 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4720 pointing to the initial address.
4721 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4722 to the initial address accessed by the data-ref in STMT_INFO. This is
4723 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4725 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4726 to the IV during each iteration of the loop. NULL says to move
4727 by one copy of AGGR_TYPE up or down, depending on the step of the
4731 1. Declare a new ptr to vector_type, and have it point to the base of the
4732 data reference (initial addressed accessed by the data reference).
4733 For example, for vector of type V8HI, the following code is generated:
4736 ap = (v8hi *)initial_address;
4738 if OFFSET is not supplied:
4739 initial_address = &a[init];
4740 if OFFSET is supplied:
4741 initial_address = &a[init + OFFSET];
4742 if BYTE_OFFSET is supplied:
4743 initial_address = &a[init] + BYTE_OFFSET;
4745 Return the initial_address in INITIAL_ADDRESS.
4747 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4748 update the pointer in each iteration of the loop.
4750 Return the increment stmt that updates the pointer in PTR_INCR.
4752 3. Return the pointer. */
4755 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4756 tree aggr_type
, class loop
*at_loop
, tree offset
,
4757 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
4758 gimple
**ptr_incr
, bool only_init
,
4759 tree byte_offset
, tree iv_step
)
4761 const char *base_name
;
4762 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4763 class loop
*loop
= NULL
;
4764 bool nested_in_vect_loop
= false;
4765 class loop
*containing_loop
= NULL
;
4769 gimple_seq new_stmt_list
= NULL
;
4773 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4774 struct data_reference
*dr
= dr_info
->dr
;
4776 gimple_stmt_iterator incr_gsi
;
4778 tree indx_before_incr
, indx_after_incr
;
4780 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
4782 gcc_assert (iv_step
!= NULL_TREE
4783 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4784 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4788 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4789 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
4790 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
4791 pe
= loop_preheader_edge (loop
);
4795 gcc_assert (bb_vinfo
);
4800 /* Create an expression for the first address accessed by this load
4802 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4804 if (dump_enabled_p ())
4806 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4807 dump_printf_loc (MSG_NOTE
, vect_location
,
4808 "create %s-pointer variable to type: %T",
4809 get_tree_code_name (TREE_CODE (aggr_type
)),
4811 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4812 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4813 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4814 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4815 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4816 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4818 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4819 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
4822 /* (1) Create the new aggregate-pointer variable.
4823 Vector and array types inherit the alias set of their component
4824 type by default so we need to use a ref-all pointer if the data
4825 reference does not conflict with the created aggregated data
4826 reference because it is not addressable. */
4827 bool need_ref_all
= false;
4828 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4829 get_alias_set (DR_REF (dr
))))
4830 need_ref_all
= true;
4831 /* Likewise for any of the data references in the stmt group. */
4832 else if (DR_GROUP_SIZE (stmt_info
) > 1)
4834 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4837 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4838 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4839 get_alias_set (DR_REF (sdr
))))
4841 need_ref_all
= true;
4844 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
4848 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4850 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4853 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4854 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4855 def-use update cycles for the pointer: one relative to the outer-loop
4856 (LOOP), which is what steps (3) and (4) below do. The other is relative
4857 to the inner-loop (which is the inner-most loop containing the dataref),
4858 and this is done be step (5) below.
4860 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4861 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4862 redundant. Steps (3),(4) create the following:
4865 LOOP: vp1 = phi(vp0,vp2)
4871 If there is an inner-loop nested in loop, then step (5) will also be
4872 applied, and an additional update in the inner-loop will be created:
4875 LOOP: vp1 = phi(vp0,vp2)
4877 inner: vp3 = phi(vp1,vp4)
4878 vp4 = vp3 + inner_step
4884 /* (2) Calculate the initial address of the aggregate-pointer, and set
4885 the aggregate-pointer to point to it before the loop. */
4887 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4889 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
4890 stmt_info
, &new_stmt_list
,
4891 offset
, byte_offset
);
4896 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4897 gcc_assert (!new_bb
);
4900 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4903 *initial_address
= new_temp
;
4904 aggr_ptr_init
= new_temp
;
4906 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4907 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4908 inner-loop nested in LOOP (during outer-loop vectorization). */
4910 /* No update in loop is required. */
4911 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4912 aptr
= aggr_ptr_init
;
4915 /* Accesses to invariant addresses should be handled specially
4917 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
4918 gcc_assert (!integer_zerop (step
));
4920 if (iv_step
== NULL_TREE
)
4922 /* The step of the aggregate pointer is the type size,
4923 negated for downward accesses. */
4924 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4925 if (tree_int_cst_sgn (step
) == -1)
4926 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4929 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4931 create_iv (aggr_ptr_init
,
4932 fold_convert (aggr_ptr_type
, iv_step
),
4933 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4934 &indx_before_incr
, &indx_after_incr
);
4935 incr
= gsi_stmt (incr_gsi
);
4937 /* Copy the points-to information if it exists. */
4938 if (DR_PTR_INFO (dr
))
4940 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4941 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
4946 aptr
= indx_before_incr
;
4949 if (!nested_in_vect_loop
|| only_init
)
4953 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4954 nested in LOOP, if exists. */
4956 gcc_assert (nested_in_vect_loop
);
4959 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4961 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4962 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4964 incr
= gsi_stmt (incr_gsi
);
4966 /* Copy the points-to information if it exists. */
4967 if (DR_PTR_INFO (dr
))
4969 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4970 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
4975 return indx_before_incr
;
4982 /* Function bump_vector_ptr
4984 Increment a pointer (to a vector type) by vector-size. If requested,
4985 i.e. if PTR-INCR is given, then also connect the new increment stmt
4986 to the existing def-use update-chain of the pointer, by modifying
4987 the PTR_INCR as illustrated below:
4989 The pointer def-use update-chain before this function:
4990 DATAREF_PTR = phi (p_0, p_2)
4992 PTR_INCR: p_2 = DATAREF_PTR + step
4994 The pointer def-use update-chain after this function:
4995 DATAREF_PTR = phi (p_0, p_2)
4997 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4999 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5002 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5004 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5005 the loop. The increment amount across iterations is expected
5007 BSI - location where the new update stmt is to be placed.
5008 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5009 BUMP - optional. The offset by which to bump the pointer. If not given,
5010 the offset is assumed to be vector_size.
5012 Output: Return NEW_DATAREF_PTR as illustrated above.
5017 bump_vector_ptr (vec_info
*vinfo
,
5018 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
5019 stmt_vec_info stmt_info
, tree bump
)
5021 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5022 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5023 tree update
= TYPE_SIZE_UNIT (vectype
);
5026 use_operand_p use_p
;
5027 tree new_dataref_ptr
;
5032 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5033 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5035 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
5036 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5037 dataref_ptr
, update
);
5038 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
5040 /* Copy the points-to information if it exists. */
5041 if (DR_PTR_INFO (dr
))
5043 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5044 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5048 return new_dataref_ptr
;
5050 /* Update the vector-pointer's cross-iteration increment. */
5051 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5053 tree use
= USE_FROM_PTR (use_p
);
5055 if (use
== dataref_ptr
)
5056 SET_USE (use_p
, new_dataref_ptr
);
5058 gcc_assert (operand_equal_p (use
, update
, 0));
5061 return new_dataref_ptr
;
5065 /* Copy memory reference info such as base/clique from the SRC reference
5066 to the DEST MEM_REF. */
5069 vect_copy_ref_info (tree dest
, tree src
)
5071 if (TREE_CODE (dest
) != MEM_REF
)
5074 tree src_base
= src
;
5075 while (handled_component_p (src_base
))
5076 src_base
= TREE_OPERAND (src_base
, 0);
5077 if (TREE_CODE (src_base
) != MEM_REF
5078 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
5081 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
5082 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
5086 /* Function vect_create_destination_var.
5088 Create a new temporary of type VECTYPE. */
5091 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5097 enum vect_var_kind kind
;
5100 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5104 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5106 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5108 name
= get_name (scalar_dest
);
5110 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5112 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5113 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5119 /* Function vect_grouped_store_supported.
5121 Returns TRUE if interleave high and interleave low permutations
5122 are supported, and FALSE otherwise. */
5125 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5127 machine_mode mode
= TYPE_MODE (vectype
);
5129 /* vect_permute_store_chain requires the group size to be equal to 3 or
5130 be a power of two. */
5131 if (count
!= 3 && exact_log2 (count
) == -1)
5133 if (dump_enabled_p ())
5134 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5135 "the size of the group of accesses"
5136 " is not a power of 2 or not eqaul to 3\n");
5140 /* Check that the permutation is supported. */
5141 if (VECTOR_MODE_P (mode
))
5146 unsigned int j0
= 0, j1
= 0, j2
= 0;
5150 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5152 if (dump_enabled_p ())
5153 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5154 "cannot handle groups of 3 stores for"
5155 " variable-length vectors\n");
5159 vec_perm_builder
sel (nelt
, nelt
, 1);
5160 sel
.quick_grow (nelt
);
5161 vec_perm_indices indices
;
5162 for (j
= 0; j
< 3; j
++)
5164 int nelt0
= ((3 - j
) * nelt
) % 3;
5165 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5166 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5167 for (i
= 0; i
< nelt
; i
++)
5169 if (3 * i
+ nelt0
< nelt
)
5170 sel
[3 * i
+ nelt0
] = j0
++;
5171 if (3 * i
+ nelt1
< nelt
)
5172 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5173 if (3 * i
+ nelt2
< nelt
)
5174 sel
[3 * i
+ nelt2
] = 0;
5176 indices
.new_vector (sel
, 2, nelt
);
5177 if (!can_vec_perm_const_p (mode
, indices
))
5179 if (dump_enabled_p ())
5180 dump_printf (MSG_MISSED_OPTIMIZATION
,
5181 "permutation op not supported by target.\n");
5185 for (i
= 0; i
< nelt
; i
++)
5187 if (3 * i
+ nelt0
< nelt
)
5188 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5189 if (3 * i
+ nelt1
< nelt
)
5190 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5191 if (3 * i
+ nelt2
< nelt
)
5192 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5194 indices
.new_vector (sel
, 2, nelt
);
5195 if (!can_vec_perm_const_p (mode
, indices
))
5197 if (dump_enabled_p ())
5198 dump_printf (MSG_MISSED_OPTIMIZATION
,
5199 "permutation op not supported by target.\n");
5207 /* If length is not equal to 3 then only power of 2 is supported. */
5208 gcc_assert (pow2p_hwi (count
));
5209 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5211 /* The encoding has 2 interleaved stepped patterns. */
5212 vec_perm_builder
sel (nelt
, 2, 3);
5214 for (i
= 0; i
< 3; i
++)
5217 sel
[i
* 2 + 1] = i
+ nelt
;
5219 vec_perm_indices
indices (sel
, 2, nelt
);
5220 if (can_vec_perm_const_p (mode
, indices
))
5222 for (i
= 0; i
< 6; i
++)
5223 sel
[i
] += exact_div (nelt
, 2);
5224 indices
.new_vector (sel
, 2, nelt
);
5225 if (can_vec_perm_const_p (mode
, indices
))
5231 if (dump_enabled_p ())
5232 dump_printf (MSG_MISSED_OPTIMIZATION
,
5233 "permutation op not supported by target.\n");
5238 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5239 type VECTYPE. MASKED_P says whether the masked form is needed. */
5242 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5246 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5247 vec_mask_store_lanes_optab
,
5250 return vect_lanes_optab_supported_p ("vec_store_lanes",
5251 vec_store_lanes_optab
,
5256 /* Function vect_permute_store_chain.
5258 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5259 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5260 the data correctly for the stores. Return the final references for stores
5263 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5264 The input is 4 vectors each containing 8 elements. We assign a number to
5265 each element, the input sequence is:
5267 1st vec: 0 1 2 3 4 5 6 7
5268 2nd vec: 8 9 10 11 12 13 14 15
5269 3rd vec: 16 17 18 19 20 21 22 23
5270 4th vec: 24 25 26 27 28 29 30 31
5272 The output sequence should be:
5274 1st vec: 0 8 16 24 1 9 17 25
5275 2nd vec: 2 10 18 26 3 11 19 27
5276 3rd vec: 4 12 20 28 5 13 21 30
5277 4th vec: 6 14 22 30 7 15 23 31
5279 i.e., we interleave the contents of the four vectors in their order.
5281 We use interleave_high/low instructions to create such output. The input of
5282 each interleave_high/low operation is two vectors:
5285 the even elements of the result vector are obtained left-to-right from the
5286 high/low elements of the first vector. The odd elements of the result are
5287 obtained left-to-right from the high/low elements of the second vector.
5288 The output of interleave_high will be: 0 4 1 5
5289 and of interleave_low: 2 6 3 7
5292 The permutation is done in log LENGTH stages. In each stage interleave_high
5293 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5294 where the first argument is taken from the first half of DR_CHAIN and the
5295 second argument from it's second half.
5298 I1: interleave_high (1st vec, 3rd vec)
5299 I2: interleave_low (1st vec, 3rd vec)
5300 I3: interleave_high (2nd vec, 4th vec)
5301 I4: interleave_low (2nd vec, 4th vec)
5303 The output for the first stage is:
5305 I1: 0 16 1 17 2 18 3 19
5306 I2: 4 20 5 21 6 22 7 23
5307 I3: 8 24 9 25 10 26 11 27
5308 I4: 12 28 13 29 14 30 15 31
5310 The output of the second stage, i.e. the final result is:
5312 I1: 0 8 16 24 1 9 17 25
5313 I2: 2 10 18 26 3 11 19 27
5314 I3: 4 12 20 28 5 13 21 29
5315 I4: 6 14 22 30 7 15 23 31. */
5318 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
5319 unsigned int length
,
5320 stmt_vec_info stmt_info
,
5321 gimple_stmt_iterator
*gsi
,
5322 vec
<tree
> *result_chain
)
5324 tree vect1
, vect2
, high
, low
;
5326 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5327 tree perm_mask_low
, perm_mask_high
;
5329 tree perm3_mask_low
, perm3_mask_high
;
5330 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5332 result_chain
->quick_grow (length
);
5333 memcpy (result_chain
->address (), dr_chain
.address (),
5334 length
* sizeof (tree
));
5338 /* vect_grouped_store_supported ensures that this is constant. */
5339 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5340 unsigned int j0
= 0, j1
= 0, j2
= 0;
5342 vec_perm_builder
sel (nelt
, nelt
, 1);
5343 sel
.quick_grow (nelt
);
5344 vec_perm_indices indices
;
5345 for (j
= 0; j
< 3; j
++)
5347 int nelt0
= ((3 - j
) * nelt
) % 3;
5348 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5349 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5351 for (i
= 0; i
< nelt
; i
++)
5353 if (3 * i
+ nelt0
< nelt
)
5354 sel
[3 * i
+ nelt0
] = j0
++;
5355 if (3 * i
+ nelt1
< nelt
)
5356 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5357 if (3 * i
+ nelt2
< nelt
)
5358 sel
[3 * i
+ nelt2
] = 0;
5360 indices
.new_vector (sel
, 2, nelt
);
5361 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5363 for (i
= 0; i
< nelt
; i
++)
5365 if (3 * i
+ nelt0
< nelt
)
5366 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5367 if (3 * i
+ nelt1
< nelt
)
5368 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5369 if (3 * i
+ nelt2
< nelt
)
5370 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5372 indices
.new_vector (sel
, 2, nelt
);
5373 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5375 vect1
= dr_chain
[0];
5376 vect2
= dr_chain
[1];
5378 /* Create interleaving stmt:
5379 low = VEC_PERM_EXPR <vect1, vect2,
5380 {j, nelt, *, j + 1, nelt + j + 1, *,
5381 j + 2, nelt + j + 2, *, ...}> */
5382 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5383 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5384 vect2
, perm3_mask_low
);
5385 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5388 vect2
= dr_chain
[2];
5389 /* Create interleaving stmt:
5390 low = VEC_PERM_EXPR <vect1, vect2,
5391 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5392 6, 7, nelt + j + 2, ...}> */
5393 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5394 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5395 vect2
, perm3_mask_high
);
5396 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5397 (*result_chain
)[j
] = data_ref
;
5402 /* If length is not equal to 3 then only power of 2 is supported. */
5403 gcc_assert (pow2p_hwi (length
));
5405 /* The encoding has 2 interleaved stepped patterns. */
5406 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5407 vec_perm_builder
sel (nelt
, 2, 3);
5409 for (i
= 0; i
< 3; i
++)
5412 sel
[i
* 2 + 1] = i
+ nelt
;
5414 vec_perm_indices
indices (sel
, 2, nelt
);
5415 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5417 for (i
= 0; i
< 6; i
++)
5418 sel
[i
] += exact_div (nelt
, 2);
5419 indices
.new_vector (sel
, 2, nelt
);
5420 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5422 for (i
= 0, n
= log_length
; i
< n
; i
++)
5424 for (j
= 0; j
< length
/2; j
++)
5426 vect1
= dr_chain
[j
];
5427 vect2
= dr_chain
[j
+length
/2];
5429 /* Create interleaving stmt:
5430 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5432 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5433 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5434 vect2
, perm_mask_high
);
5435 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5436 (*result_chain
)[2*j
] = high
;
5438 /* Create interleaving stmt:
5439 low = VEC_PERM_EXPR <vect1, vect2,
5440 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5442 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5443 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5444 vect2
, perm_mask_low
);
5445 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5446 (*result_chain
)[2*j
+1] = low
;
5448 memcpy (dr_chain
.address (), result_chain
->address (),
5449 length
* sizeof (tree
));
5454 /* Function vect_setup_realignment
5456 This function is called when vectorizing an unaligned load using
5457 the dr_explicit_realign[_optimized] scheme.
5458 This function generates the following code at the loop prolog:
5461 x msq_init = *(floor(p)); # prolog load
5462 realignment_token = call target_builtin;
5464 x msq = phi (msq_init, ---)
5466 The stmts marked with x are generated only for the case of
5467 dr_explicit_realign_optimized.
5469 The code above sets up a new (vector) pointer, pointing to the first
5470 location accessed by STMT_INFO, and a "floor-aligned" load using that
5471 pointer. It also generates code to compute the "realignment-token"
5472 (if the relevant target hook was defined), and creates a phi-node at the
5473 loop-header bb whose arguments are the result of the prolog-load (created
5474 by this function) and the result of a load that takes place in the loop
5475 (to be created by the caller to this function).
5477 For the case of dr_explicit_realign_optimized:
5478 The caller to this function uses the phi-result (msq) to create the
5479 realignment code inside the loop, and sets up the missing phi argument,
5482 msq = phi (msq_init, lsq)
5483 lsq = *(floor(p')); # load in loop
5484 result = realign_load (msq, lsq, realignment_token);
5486 For the case of dr_explicit_realign:
5488 msq = *(floor(p)); # load in loop
5490 lsq = *(floor(p')); # load in loop
5491 result = realign_load (msq, lsq, realignment_token);
5494 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5495 a memory location that may be unaligned.
5496 BSI - place where new code is to be inserted.
5497 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5501 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5502 target hook, if defined.
5503 Return value - the result of the loop-header phi node. */
5506 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5507 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
5508 enum dr_alignment_support alignment_support_scheme
,
5510 class loop
**at_loop
)
5512 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5513 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5514 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5515 struct data_reference
*dr
= dr_info
->dr
;
5516 class loop
*loop
= NULL
;
5518 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
5524 tree msq_init
= NULL_TREE
;
5527 tree msq
= NULL_TREE
;
5528 gimple_seq stmts
= NULL
;
5529 bool compute_in_loop
= false;
5530 bool nested_in_vect_loop
= false;
5531 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5532 class loop
*loop_for_initial_load
= NULL
;
5536 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5537 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5540 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5541 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5543 /* We need to generate three things:
5544 1. the misalignment computation
5545 2. the extra vector load (for the optimized realignment scheme).
5546 3. the phi node for the two vectors from which the realignment is
5547 done (for the optimized realignment scheme). */
5549 /* 1. Determine where to generate the misalignment computation.
5551 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5552 calculation will be generated by this function, outside the loop (in the
5553 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5554 caller, inside the loop.
5556 Background: If the misalignment remains fixed throughout the iterations of
5557 the loop, then both realignment schemes are applicable, and also the
5558 misalignment computation can be done outside LOOP. This is because we are
5559 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5560 are a multiple of VS (the Vector Size), and therefore the misalignment in
5561 different vectorized LOOP iterations is always the same.
5562 The problem arises only if the memory access is in an inner-loop nested
5563 inside LOOP, which is now being vectorized using outer-loop vectorization.
5564 This is the only case when the misalignment of the memory access may not
5565 remain fixed throughout the iterations of the inner-loop (as explained in
5566 detail in vect_supportable_dr_alignment). In this case, not only is the
5567 optimized realignment scheme not applicable, but also the misalignment
5568 computation (and generation of the realignment token that is passed to
5569 REALIGN_LOAD) have to be done inside the loop.
5571 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5572 or not, which in turn determines if the misalignment is computed inside
5573 the inner-loop, or outside LOOP. */
5575 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5577 compute_in_loop
= true;
5578 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5582 /* 2. Determine where to generate the extra vector load.
5584 For the optimized realignment scheme, instead of generating two vector
5585 loads in each iteration, we generate a single extra vector load in the
5586 preheader of the loop, and in each iteration reuse the result of the
5587 vector load from the previous iteration. In case the memory access is in
5588 an inner-loop nested inside LOOP, which is now being vectorized using
5589 outer-loop vectorization, we need to determine whether this initial vector
5590 load should be generated at the preheader of the inner-loop, or can be
5591 generated at the preheader of LOOP. If the memory access has no evolution
5592 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5593 to be generated inside LOOP (in the preheader of the inner-loop). */
5595 if (nested_in_vect_loop
)
5597 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5598 bool invariant_in_outerloop
=
5599 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5600 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5603 loop_for_initial_load
= loop
;
5605 *at_loop
= loop_for_initial_load
;
5607 if (loop_for_initial_load
)
5608 pe
= loop_preheader_edge (loop_for_initial_load
);
5610 /* 3. For the case of the optimized realignment, create the first vector
5611 load at the loop preheader. */
5613 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5615 /* Create msq_init = *(floor(p1)) in the loop preheader */
5618 gcc_assert (!compute_in_loop
);
5619 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5620 ptr
= vect_create_data_ref_ptr (vinfo
, stmt_info
, vectype
,
5621 loop_for_initial_load
, NULL_TREE
,
5622 &init_addr
, NULL
, &inc
, true);
5623 if (TREE_CODE (ptr
) == SSA_NAME
)
5624 new_temp
= copy_ssa_name (ptr
);
5626 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5627 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
5628 tree type
= TREE_TYPE (ptr
);
5629 new_stmt
= gimple_build_assign
5630 (new_temp
, BIT_AND_EXPR
, ptr
,
5631 fold_build2 (MINUS_EXPR
, type
,
5632 build_int_cst (type
, 0),
5633 build_int_cst (type
, align
)));
5634 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5635 gcc_assert (!new_bb
);
5637 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5638 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5639 vect_copy_ref_info (data_ref
, DR_REF (dr
));
5640 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5641 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5642 gimple_assign_set_lhs (new_stmt
, new_temp
);
5645 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5646 gcc_assert (!new_bb
);
5649 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5651 msq_init
= gimple_assign_lhs (new_stmt
);
5654 /* 4. Create realignment token using a target builtin, if available.
5655 It is done either inside the containing loop, or before LOOP (as
5656 determined above). */
5658 if (targetm
.vectorize
.builtin_mask_for_load
)
5663 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5666 /* Generate the INIT_ADDR computation outside LOOP. */
5667 init_addr
= vect_create_addr_base_for_vector_ref (vinfo
,
5672 pe
= loop_preheader_edge (loop
);
5673 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5674 gcc_assert (!new_bb
);
5677 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5680 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5681 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5683 vect_create_destination_var (scalar_dest
,
5684 gimple_call_return_type (new_stmt
));
5685 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5686 gimple_call_set_lhs (new_stmt
, new_temp
);
5688 if (compute_in_loop
)
5689 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5692 /* Generate the misalignment computation outside LOOP. */
5693 pe
= loop_preheader_edge (loop
);
5694 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5695 gcc_assert (!new_bb
);
5698 *realignment_token
= gimple_call_lhs (new_stmt
);
5700 /* The result of the CALL_EXPR to this builtin is determined from
5701 the value of the parameter and no global variables are touched
5702 which makes the builtin a "const" function. Requiring the
5703 builtin to have the "const" attribute makes it unnecessary
5704 to call mark_call_clobbered. */
5705 gcc_assert (TREE_READONLY (builtin_decl
));
5708 if (alignment_support_scheme
== dr_explicit_realign
)
5711 gcc_assert (!compute_in_loop
);
5712 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5715 /* 5. Create msq = phi <msq_init, lsq> in loop */
5717 pe
= loop_preheader_edge (containing_loop
);
5718 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5719 msq
= make_ssa_name (vec_dest
);
5720 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5721 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5727 /* Function vect_grouped_load_supported.
5729 COUNT is the size of the load group (the number of statements plus the
5730 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5731 only one statement, with a gap of COUNT - 1.
5733 Returns true if a suitable permute exists. */
5736 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5737 unsigned HOST_WIDE_INT count
)
5739 machine_mode mode
= TYPE_MODE (vectype
);
5741 /* If this is single-element interleaving with an element distance
5742 that leaves unused vector loads around punt - we at least create
5743 very sub-optimal code in that case (and blow up memory,
5745 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
5747 if (dump_enabled_p ())
5748 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5749 "single-element interleaving not supported "
5750 "for not adjacent vector loads\n");
5754 /* vect_permute_load_chain requires the group size to be equal to 3 or
5755 be a power of two. */
5756 if (count
!= 3 && exact_log2 (count
) == -1)
5758 if (dump_enabled_p ())
5759 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5760 "the size of the group of accesses"
5761 " is not a power of 2 or not equal to 3\n");
5765 /* Check that the permutation is supported. */
5766 if (VECTOR_MODE_P (mode
))
5772 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5774 if (dump_enabled_p ())
5775 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5776 "cannot handle groups of 3 loads for"
5777 " variable-length vectors\n");
5781 vec_perm_builder
sel (nelt
, nelt
, 1);
5782 sel
.quick_grow (nelt
);
5783 vec_perm_indices indices
;
5785 for (k
= 0; k
< 3; k
++)
5787 for (i
= 0; i
< nelt
; i
++)
5788 if (3 * i
+ k
< 2 * nelt
)
5792 indices
.new_vector (sel
, 2, nelt
);
5793 if (!can_vec_perm_const_p (mode
, indices
))
5795 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5797 "shuffle of 3 loads is not supported by"
5801 for (i
= 0, j
= 0; i
< nelt
; i
++)
5802 if (3 * i
+ k
< 2 * nelt
)
5805 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5806 indices
.new_vector (sel
, 2, nelt
);
5807 if (!can_vec_perm_const_p (mode
, indices
))
5809 if (dump_enabled_p ())
5810 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5811 "shuffle of 3 loads is not supported by"
5820 /* If length is not equal to 3 then only power of 2 is supported. */
5821 gcc_assert (pow2p_hwi (count
));
5822 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5824 /* The encoding has a single stepped pattern. */
5825 vec_perm_builder
sel (nelt
, 1, 3);
5827 for (i
= 0; i
< 3; i
++)
5829 vec_perm_indices
indices (sel
, 2, nelt
);
5830 if (can_vec_perm_const_p (mode
, indices
))
5832 for (i
= 0; i
< 3; i
++)
5834 indices
.new_vector (sel
, 2, nelt
);
5835 if (can_vec_perm_const_p (mode
, indices
))
5841 if (dump_enabled_p ())
5842 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5843 "extract even/odd not supported by target\n");
5847 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5848 type VECTYPE. MASKED_P says whether the masked form is needed. */
5851 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5855 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5856 vec_mask_load_lanes_optab
,
5859 return vect_lanes_optab_supported_p ("vec_load_lanes",
5860 vec_load_lanes_optab
,
5864 /* Function vect_permute_load_chain.
5866 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5867 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5868 the input data correctly. Return the final references for loads in
5871 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5872 The input is 4 vectors each containing 8 elements. We assign a number to each
5873 element, the input sequence is:
5875 1st vec: 0 1 2 3 4 5 6 7
5876 2nd vec: 8 9 10 11 12 13 14 15
5877 3rd vec: 16 17 18 19 20 21 22 23
5878 4th vec: 24 25 26 27 28 29 30 31
5880 The output sequence should be:
5882 1st vec: 0 4 8 12 16 20 24 28
5883 2nd vec: 1 5 9 13 17 21 25 29
5884 3rd vec: 2 6 10 14 18 22 26 30
5885 4th vec: 3 7 11 15 19 23 27 31
5887 i.e., the first output vector should contain the first elements of each
5888 interleaving group, etc.
5890 We use extract_even/odd instructions to create such output. The input of
5891 each extract_even/odd operation is two vectors
5895 and the output is the vector of extracted even/odd elements. The output of
5896 extract_even will be: 0 2 4 6
5897 and of extract_odd: 1 3 5 7
5900 The permutation is done in log LENGTH stages. In each stage extract_even
5901 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5902 their order. In our example,
5904 E1: extract_even (1st vec, 2nd vec)
5905 E2: extract_odd (1st vec, 2nd vec)
5906 E3: extract_even (3rd vec, 4th vec)
5907 E4: extract_odd (3rd vec, 4th vec)
5909 The output for the first stage will be:
5911 E1: 0 2 4 6 8 10 12 14
5912 E2: 1 3 5 7 9 11 13 15
5913 E3: 16 18 20 22 24 26 28 30
5914 E4: 17 19 21 23 25 27 29 31
5916 In order to proceed and create the correct sequence for the next stage (or
5917 for the correct output, if the second stage is the last one, as in our
5918 example), we first put the output of extract_even operation and then the
5919 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5920 The input for the second stage is:
5922 1st vec (E1): 0 2 4 6 8 10 12 14
5923 2nd vec (E3): 16 18 20 22 24 26 28 30
5924 3rd vec (E2): 1 3 5 7 9 11 13 15
5925 4th vec (E4): 17 19 21 23 25 27 29 31
5927 The output of the second stage:
5929 E1: 0 4 8 12 16 20 24 28
5930 E2: 2 6 10 14 18 22 26 30
5931 E3: 1 5 9 13 17 21 25 29
5932 E4: 3 7 11 15 19 23 27 31
5934 And RESULT_CHAIN after reordering:
5936 1st vec (E1): 0 4 8 12 16 20 24 28
5937 2nd vec (E3): 1 5 9 13 17 21 25 29
5938 3rd vec (E2): 2 6 10 14 18 22 26 30
5939 4th vec (E4): 3 7 11 15 19 23 27 31. */
5942 vect_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
5943 unsigned int length
,
5944 stmt_vec_info stmt_info
,
5945 gimple_stmt_iterator
*gsi
,
5946 vec
<tree
> *result_chain
)
5948 tree data_ref
, first_vect
, second_vect
;
5949 tree perm_mask_even
, perm_mask_odd
;
5950 tree perm3_mask_low
, perm3_mask_high
;
5952 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5953 unsigned int i
, j
, log_length
= exact_log2 (length
);
5955 result_chain
->quick_grow (length
);
5956 memcpy (result_chain
->address (), dr_chain
.address (),
5957 length
* sizeof (tree
));
5961 /* vect_grouped_load_supported ensures that this is constant. */
5962 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5965 vec_perm_builder
sel (nelt
, nelt
, 1);
5966 sel
.quick_grow (nelt
);
5967 vec_perm_indices indices
;
5968 for (k
= 0; k
< 3; k
++)
5970 for (i
= 0; i
< nelt
; i
++)
5971 if (3 * i
+ k
< 2 * nelt
)
5975 indices
.new_vector (sel
, 2, nelt
);
5976 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5978 for (i
= 0, j
= 0; i
< nelt
; i
++)
5979 if (3 * i
+ k
< 2 * nelt
)
5982 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5983 indices
.new_vector (sel
, 2, nelt
);
5984 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5986 first_vect
= dr_chain
[0];
5987 second_vect
= dr_chain
[1];
5989 /* Create interleaving stmt (low part of):
5990 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5992 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5993 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5994 second_vect
, perm3_mask_low
);
5995 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5997 /* Create interleaving stmt (high part of):
5998 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6000 first_vect
= data_ref
;
6001 second_vect
= dr_chain
[2];
6002 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6003 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6004 second_vect
, perm3_mask_high
);
6005 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6006 (*result_chain
)[k
] = data_ref
;
6011 /* If length is not equal to 3 then only power of 2 is supported. */
6012 gcc_assert (pow2p_hwi (length
));
6014 /* The encoding has a single stepped pattern. */
6015 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6016 vec_perm_builder
sel (nelt
, 1, 3);
6018 for (i
= 0; i
< 3; ++i
)
6020 vec_perm_indices
indices (sel
, 2, nelt
);
6021 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
6023 for (i
= 0; i
< 3; ++i
)
6025 indices
.new_vector (sel
, 2, nelt
);
6026 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
6028 for (i
= 0; i
< log_length
; i
++)
6030 for (j
= 0; j
< length
; j
+= 2)
6032 first_vect
= dr_chain
[j
];
6033 second_vect
= dr_chain
[j
+1];
6035 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6036 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
6037 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6038 first_vect
, second_vect
,
6040 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6041 (*result_chain
)[j
/2] = data_ref
;
6043 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6044 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
6045 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6046 first_vect
, second_vect
,
6048 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6049 (*result_chain
)[j
/2+length
/2] = data_ref
;
6051 memcpy (dr_chain
.address (), result_chain
->address (),
6052 length
* sizeof (tree
));
6057 /* Function vect_shift_permute_load_chain.
6059 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6060 sequence of stmts to reorder the input data accordingly.
6061 Return the final references for loads in RESULT_CHAIN.
6062 Return true if succeeded, false otherwise.
6064 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6065 The input is 3 vectors each containing 8 elements. We assign a
6066 number to each element, the input sequence is:
6068 1st vec: 0 1 2 3 4 5 6 7
6069 2nd vec: 8 9 10 11 12 13 14 15
6070 3rd vec: 16 17 18 19 20 21 22 23
6072 The output sequence should be:
6074 1st vec: 0 3 6 9 12 15 18 21
6075 2nd vec: 1 4 7 10 13 16 19 22
6076 3rd vec: 2 5 8 11 14 17 20 23
6078 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6080 First we shuffle all 3 vectors to get correct elements order:
6082 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6083 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6084 3rd vec: (16 19 22) (17 20 23) (18 21)
6086 Next we unite and shift vector 3 times:
6089 shift right by 6 the concatenation of:
6090 "1st vec" and "2nd vec"
6091 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6092 "2nd vec" and "3rd vec"
6093 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6094 "3rd vec" and "1st vec"
6095 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6098 So that now new vectors are:
6100 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6101 2nd vec: (10 13) (16 19 22) (17 20 23)
6102 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6105 shift right by 5 the concatenation of:
6106 "1st vec" and "3rd vec"
6107 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6108 "2nd vec" and "1st vec"
6109 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6110 "3rd vec" and "2nd vec"
6111 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6114 So that now new vectors are:
6116 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6117 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6118 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6121 shift right by 5 the concatenation of:
6122 "1st vec" and "1st vec"
6123 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6124 shift right by 3 the concatenation of:
6125 "2nd vec" and "2nd vec"
6126 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6129 So that now all vectors are READY:
6130 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6131 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6132 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6134 This algorithm is faster than one in vect_permute_load_chain if:
6135 1. "shift of a concatenation" is faster than general permutation.
6137 2. The TARGET machine can't execute vector instructions in parallel.
6138 This is because each step of the algorithm depends on previous.
6139 The algorithm in vect_permute_load_chain is much more parallel.
6141 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6145 vect_shift_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6146 unsigned int length
,
6147 stmt_vec_info stmt_info
,
6148 gimple_stmt_iterator
*gsi
,
6149 vec
<tree
> *result_chain
)
6151 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6152 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6153 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6156 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6158 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6160 unsigned HOST_WIDE_INT nelt
, vf
;
6161 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6162 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6163 /* Not supported for variable-length vectors. */
6166 vec_perm_builder
sel (nelt
, nelt
, 1);
6167 sel
.quick_grow (nelt
);
6169 result_chain
->quick_grow (length
);
6170 memcpy (result_chain
->address (), dr_chain
.address (),
6171 length
* sizeof (tree
));
6173 if (pow2p_hwi (length
) && vf
> 4)
6175 unsigned int j
, log_length
= exact_log2 (length
);
6176 for (i
= 0; i
< nelt
/ 2; ++i
)
6178 for (i
= 0; i
< nelt
/ 2; ++i
)
6179 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6180 vec_perm_indices
indices (sel
, 2, nelt
);
6181 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6183 if (dump_enabled_p ())
6184 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6185 "shuffle of 2 fields structure is not \
6186 supported by target\n");
6189 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6191 for (i
= 0; i
< nelt
/ 2; ++i
)
6193 for (i
= 0; i
< nelt
/ 2; ++i
)
6194 sel
[nelt
/ 2 + i
] = i
* 2;
6195 indices
.new_vector (sel
, 2, nelt
);
6196 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6198 if (dump_enabled_p ())
6199 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6200 "shuffle of 2 fields structure is not \
6201 supported by target\n");
6204 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6206 /* Generating permutation constant to shift all elements.
6207 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6208 for (i
= 0; i
< nelt
; i
++)
6209 sel
[i
] = nelt
/ 2 + i
;
6210 indices
.new_vector (sel
, 2, nelt
);
6211 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6215 "shift permutation is not supported by target\n");
6218 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6220 /* Generating permutation constant to select vector from 2.
6221 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6222 for (i
= 0; i
< nelt
/ 2; i
++)
6224 for (i
= nelt
/ 2; i
< nelt
; i
++)
6226 indices
.new_vector (sel
, 2, nelt
);
6227 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6229 if (dump_enabled_p ())
6230 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6231 "select is not supported by target\n");
6234 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6236 for (i
= 0; i
< log_length
; i
++)
6238 for (j
= 0; j
< length
; j
+= 2)
6240 first_vect
= dr_chain
[j
];
6241 second_vect
= dr_chain
[j
+ 1];
6243 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6244 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6245 first_vect
, first_vect
,
6247 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6250 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6251 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6252 second_vect
, second_vect
,
6254 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6257 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6258 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6259 vect
[0], vect
[1], shift1_mask
);
6260 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6261 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6263 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6264 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6265 vect
[0], vect
[1], select_mask
);
6266 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6267 (*result_chain
)[j
/2] = data_ref
;
6269 memcpy (dr_chain
.address (), result_chain
->address (),
6270 length
* sizeof (tree
));
6274 if (length
== 3 && vf
> 2)
6276 unsigned int k
= 0, l
= 0;
6278 /* Generating permutation constant to get all elements in right order.
6279 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6280 for (i
= 0; i
< nelt
; i
++)
6282 if (3 * k
+ (l
% 3) >= nelt
)
6285 l
+= (3 - (nelt
% 3));
6287 sel
[i
] = 3 * k
+ (l
% 3);
6290 vec_perm_indices
indices (sel
, 2, nelt
);
6291 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6293 if (dump_enabled_p ())
6294 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6295 "shuffle of 3 fields structure is not \
6296 supported by target\n");
6299 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6301 /* Generating permutation constant to shift all elements.
6302 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6303 for (i
= 0; i
< nelt
; i
++)
6304 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6305 indices
.new_vector (sel
, 2, nelt
);
6306 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6308 if (dump_enabled_p ())
6309 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6310 "shift permutation is not supported by target\n");
6313 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6315 /* Generating permutation constant to shift all elements.
6316 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6317 for (i
= 0; i
< nelt
; i
++)
6318 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6319 indices
.new_vector (sel
, 2, nelt
);
6320 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6324 "shift permutation is not supported by target\n");
6327 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6329 /* Generating permutation constant to shift all elements.
6330 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6331 for (i
= 0; i
< nelt
; i
++)
6332 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6333 indices
.new_vector (sel
, 2, nelt
);
6334 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6336 if (dump_enabled_p ())
6337 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6338 "shift permutation is not supported by target\n");
6341 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6343 /* Generating permutation constant to shift all elements.
6344 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6345 for (i
= 0; i
< nelt
; i
++)
6346 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6347 indices
.new_vector (sel
, 2, nelt
);
6348 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6350 if (dump_enabled_p ())
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6352 "shift permutation is not supported by target\n");
6355 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6357 for (k
= 0; k
< 3; k
++)
6359 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6360 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6361 dr_chain
[k
], dr_chain
[k
],
6363 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6367 for (k
= 0; k
< 3; k
++)
6369 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6370 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6371 vect
[k
% 3], vect
[(k
+ 1) % 3],
6373 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6374 vect_shift
[k
] = data_ref
;
6377 for (k
= 0; k
< 3; k
++)
6379 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6380 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6381 vect_shift
[(4 - k
) % 3],
6382 vect_shift
[(3 - k
) % 3],
6384 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6388 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6390 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6391 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6392 vect
[0], shift3_mask
);
6393 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6394 (*result_chain
)[nelt
% 3] = data_ref
;
6396 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6397 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6398 vect
[1], shift4_mask
);
6399 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6400 (*result_chain
)[0] = data_ref
;
6406 /* Function vect_transform_grouped_load.
6408 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6409 to perform their permutation and ascribe the result vectorized statements to
6410 the scalar statements.
/* VINFO is the enclosing vectorization info; STMT_INFO is a statement of
   the load group; SIZE is the number of vectors in the chain
   (RESULT_CHAIN is created with that capacity below); GSI is presumably
   the insertion point for the generated permute statements -- confirm in
   vect_permute_load_chain.  */
6414 vect_transform_grouped_load (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6416 int size
, gimple_stmt_iterator
*gsi
)
/* Local vector collecting the permuted results; released before return.  */
6419 vec
<tree
> result_chain
= vNULL
;
6421 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6422 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6423 vectors, that are ready for vector computation. */
6424 result_chain
.create (size
);
6426 /* If reassociation width for vector type is 2 or greater target machine can
6427 execute 2 or more vector instructions in parallel. Otherwise try to
6428 get chain for loads group using vect_shift_permute_load_chain. */
6429 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
/* The shift-based scheme is attempted only when the target cannot issue
   VEC_PERM_EXPRs in parallel (reassociation width <= 1); if it is
   unavailable or fails, fall back to the generic permutation.  */
6430 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6432 || !vect_shift_permute_load_chain (vinfo
, dr_chain
, size
, stmt_info
,
6433 gsi
, &result_chain
))
6434 vect_permute_load_chain (vinfo
, dr_chain
,
6435 size
, stmt_info
, gsi
, &result_chain
);
/* Attach each permuted vector to its corresponding scalar statement.  */
6436 vect_record_grouped_load_vectors (vinfo
, stmt_info
, result_chain
);
/* Free the temporary chain; the recorded statements keep the results.  */
6437 result_chain
.release ();
6440 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6441 generated as part of the vectorization of STMT_INFO. Assign the statement
6442 for each vector to the associated scalar statement. */
/* The vec_info parameter is deliberately unnamed: it is unused here.  */
6445 vect_record_grouped_load_vectors (vec_info
*, stmt_vec_info stmt_info
,
6446 vec
<tree
> result_chain
)
6448 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6449 unsigned int i
, gap_count
;
6452 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6453 Since we scan the chain starting from its first node, their order
6454 corresponds the order of data-refs in RESULT_CHAIN. */
6455 stmt_vec_info next_stmt_info
= first_stmt_info
;
/* Walk RESULT_CHAIN and the group's statement chain in lockstep.  */
6457 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
/* Stop once the statement chain is exhausted.  */
6459 if (!next_stmt_info
)
6462 /* Skip the gaps. Loads created for the gaps will be removed by dead
6463 code elimination pass later. No need to check for the first stmt in
6464 the group, since it always exists.
6465 DR_GROUP_GAP is the number of steps in elements from the previous
6466 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6467 correspond to the gaps. */
/* NOTE(review): the body of this branch (the gap_count bookkeeping and
   continue) is not visible in this view -- confirm against the full
   source.  */
6468 if (next_stmt_info
!= first_stmt_info
6469 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
6475 /* ??? The following needs cleanup after the removal of
6476 DR_GROUP_SAME_DR_STMT. */
/* TMP_DATA_REF is an SSA name; record its defining statement as a
   vector statement of the scalar stmt NEXT_STMT_INFO.  */
6479 gimple
*new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
6480 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6481 copies, and we put the new vector statement last. */
6482 STMT_VINFO_VEC_STMTS (next_stmt_info
).safe_push (new_stmt
);
/* Advance to the next statement of the interleaving group.  */
6484 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
6490 /* Function vect_can_force_dr_alignment_p.
6492 Returns whether the alignment of a DECL can be forced to be aligned
6493 on ALIGNMENT bit boundary. */
6496 vect_can_force_dr_alignment_p (const_tree decl
, poly_uint64 alignment
)
/* Symtab-backed decls may veto an alignment increase via
   can_increase_alignment_p.  */
6501 if (decl_in_symtab_p (decl
)
6502 && !symtab_node::get (decl
)->can_increase_alignment_p ())
/* Static storage is bounded by the object-file format's maximum
   alignment; anything else by the maximum stack alignment.  */
6505 if (TREE_STATIC (decl
))
6506 return (known_le (alignment
,
6507 (unsigned HOST_WIDE_INT
) MAX_OFILE_ALIGNMENT
));
6509 return (known_le (alignment
, (unsigned HOST_WIDE_INT
) MAX_STACK_ALIGNMENT
));
6513 /* Return whether the data reference DR_INFO is supported with respect to its
6515 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6516 it is aligned, i.e., check if it is possible to vectorize it with different
6519 enum dr_alignment_support
6520 vect_supportable_dr_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
6521 bool check_aligned_accesses
)
/* Pull out the data reference, its statement, and the vector type/mode
   the decision is made for.  */
6523 data_reference
*dr
= dr_info
->dr
;
6524 stmt_vec_info stmt_info
= dr_info
->stmt
;
6525 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6526 machine_mode mode
= TYPE_MODE (vectype
);
6527 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6528 class loop
*vect_loop
= NULL
;
6529 bool nested_in_vect_loop
= false;
/* Already-aligned accesses need no special handling unless the caller
   explicitly asked to re-check aligned accesses too.  */
6531 if (aligned_access_p (dr_info
) && !check_aligned_accesses
)
6534 /* For now assume all conditional loads/stores support unaligned
6535 access without any special code. */
6536 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
6537 if (gimple_call_internal_p (stmt
)
6538 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6539 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6540 return dr_unaligned_supported
;
/* NOTE(review): presumably guarded by loop_vinfo != NULL (guard line not
   visible here) -- LOOP_VINFO_LOOP is only meaningful for loop
   vectorization.  */
6544 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6545 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
6548 /* Possibly unaligned access. */
6550 /* We can choose between using the implicit realignment scheme (generating
6551 a misaligned_move stmt) and the explicit realignment scheme (generating
6552 aligned loads with a REALIGN_LOAD). There are two variants to the
6553 explicit realignment scheme: optimized, and unoptimized.
6554 We can optimize the realignment only if the step between consecutive
6555 vector loads is equal to the vector size. Since the vector memory
6556 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6557 is guaranteed that the misalignment amount remains the same throughout the
6558 execution of the vectorized loop. Therefore, we can create the
6559 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6560 at the loop preheader.
6562 However, in the case of outer-loop vectorization, when vectorizing a
6563 memory access in the inner-loop nested within the LOOP that is now being
6564 vectorized, while it is guaranteed that the misalignment of the
6565 vectorized memory access will remain the same in different outer-loop
6566 iterations, it is *not* guaranteed that is will remain the same throughout
6567 the execution of the inner-loop. This is because the inner-loop advances
6568 with the original scalar step (and not in steps of VS). If the inner-loop
6569 step happens to be a multiple of VS, then the misalignment remains fixed
6570 and we can use the optimized realignment scheme. For example:
6576 When vectorizing the i-loop in the above example, the step between
6577 consecutive vector loads is 1, and so the misalignment does not remain
6578 fixed across the execution of the inner-loop, and the realignment cannot
6579 be optimized (as illustrated in the following pseudo vectorized loop):
6581 for (i=0; i<N; i+=4)
6582 for (j=0; j<M; j++){
6583 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6584 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6585 // (assuming that we start from an aligned address).
6588 We therefore have to use the unoptimized realignment scheme:
6590 for (i=0; i<N; i+=4)
6591 for (j=k; j<M; j+=4)
6592 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6593 // that the misalignment of the initial address is
6596 The loop can then be vectorized as follows:
6598 for (k=0; k<4; k++){
6599 rt = get_realignment_token (&vp[k]);
6600 for (i=0; i<N; i+=4){
6602 for (j=k; j<M; j+=4){
6604 va = REALIGN_LOAD <v1,v2,rt>;
/* Read path: only loads can use the explicit REALIGN_LOAD scheme
   checked below.  */
6611 if (DR_IS_READ (dr
))
6613 bool is_packed
= false;
6614 tree type
= (TREE_TYPE (DR_REF (dr
)));
/* Explicit realignment is usable only when the target implements
   vec_realign_load and, if it declares builtin_mask_for_load, actually
   provides one.  */
6616 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6617 && (!targetm
.vectorize
.builtin_mask_for_load
6618 || targetm
.vectorize
.builtin_mask_for_load ()))
6620 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6622 /* If we are doing SLP then the accesses need not have the
6623 same alignment, instead it depends on the SLP group size. */
6625 && STMT_SLP_TYPE (stmt_info
)
6626 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6628 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
6629 TYPE_VECTOR_SUBPARTS (vectype
)))
/* Without a vectorized loop, or when the inner-loop step differs from
   the vector size, the misalignment is not invariant, so only the
   unoptimized explicit scheme applies (see the big comment above).  */
6631 else if (!loop_vinfo
6632 || (nested_in_vect_loop
6633 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
6634 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6635 return dr_explicit_realign
;
6637 return dr_explicit_realign_optimized
;
/* No realignment scheme: ask the target whether plain misaligned
   vector loads are supported.  Unknown alignment of a not-size-aligned
   ref is treated as packed.  */
6639 if (!known_alignment_for_access_p (dr_info
))
6640 is_packed
= not_size_aligned (DR_REF (dr
));
6642 if (targetm
.vectorize
.support_vector_misalignment
6643 (mode
, type
, DR_MISALIGNMENT (dr_info
), is_packed
))
6644 /* Can't software pipeline the loads, but can at least do them. */
6645 return dr_unaligned_supported
;
/* Non-read (store) path: no realignment scheme exists; rely solely on
   the target's misalignment support hook.  */
6649 bool is_packed
= false;
6650 tree type
= (TREE_TYPE (DR_REF (dr
)));
6652 if (!known_alignment_for_access_p (dr_info
))
6653 is_packed
= not_size_aligned (DR_REF (dr
));
6655 if (targetm
.vectorize
.support_vector_misalignment
6656 (mode
, type
, DR_MISALIGNMENT (dr_info
), is_packed
))
6657 return dr_unaligned_supported
;
/* Neither realignment nor target misalignment support applies.  */
6661 return dr_unaligned_unsupported
;