1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
62 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
63 tree vectype
, unsigned HOST_WIDE_INT count
)
65 machine_mode mode
, array_mode
;
68 mode
= TYPE_MODE (vectype
);
69 if (!targetm
.array_mode (mode
, count
).exists (&array_mode
))
71 poly_uint64 bits
= count
* GET_MODE_BITSIZE (mode
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 if (!int_mode_for_size (bits
, limit_p
).exists (&array_mode
))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode
), count
);
83 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
87 "cannot use %s<%s><%s>\n", name
,
88 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE
, vect_location
,
94 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
95 GET_MODE_NAME (mode
));
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info
,
120 HOST_WIDE_INT
*lhs_size_unit
,
121 HOST_WIDE_INT
*rhs_size_unit
)
123 tree scalar_type
= gimple_expr_type (stmt_info
->stmt
);
124 HOST_WIDE_INT lhs
, rhs
;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
131 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
133 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
135 && (gimple_assign_cast_p (assign
)
136 || gimple_assign_rhs_code (assign
) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign
) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign
) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign
) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign
) == FLOAT_EXPR
))
142 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
144 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
146 scalar_type
= rhs_type
;
149 *lhs_size_unit
= lhs
;
150 *rhs_size_unit
= rhs
;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
160 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
162 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
165 return opt_result::failure_at (vect_location
,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
171 = runtime_alias_check_p (ddr
, loop
,
172 optimize_loop_nest_for_speed_p (loop
));
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
177 return opt_result::success ();
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
183 vect_check_nonzero_value (loop_vec_info loop_vinfo
, tree value
)
185 vec
<tree
> checks
= LOOP_VINFO_CHECK_NONZERO (loop_vinfo
);
186 for (unsigned int i
= 0; i
< checks
.length(); ++i
)
187 if (checks
[i
] == value
)
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE
, vect_location
,
192 "need run-time check that %T is nonzero\n",
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo
).safe_push (value
);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
202 vect_preserves_scalar_order_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
)
204 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
205 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the last scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Compute that position and check whether the resulting order matches
217 stmt_vec_info last_a
= DR_GROUP_FIRST_ELEMENT (stmtinfo_a
);
219 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (last_a
); s
;
220 s
= DR_GROUP_NEXT_ELEMENT (s
))
221 last_a
= get_later_stmt (last_a
, s
);
224 stmt_vec_info last_b
= DR_GROUP_FIRST_ELEMENT (stmtinfo_b
);
226 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (last_b
); s
;
227 s
= DR_GROUP_NEXT_ELEMENT (s
))
228 last_b
= get_later_stmt (last_b
, s
);
231 return ((get_later_stmt (last_a
, last_b
) == last_a
)
232 == (get_later_stmt (stmtinfo_a
, stmtinfo_b
) == stmtinfo_a
));
235 /* A subroutine of vect_analyze_data_ref_dependence. Handle
236 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
237 distances. These distances are conservatively correct but they don't
238 reflect a guaranteed dependence.
240 Return true if this function does all the work necessary to avoid
241 an alias or false if the caller should use the dependence distances
242 to limit the vectorization factor in the usual way. LOOP_DEPTH is
243 the depth of the loop described by LOOP_VINFO and the other arguments
244 are as for vect_analyze_data_ref_dependence. */
247 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
248 loop_vec_info loop_vinfo
,
249 int loop_depth
, unsigned int *max_vf
)
251 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
252 lambda_vector dist_v
;
254 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
256 int dist
= dist_v
[loop_depth
];
257 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
259 /* If the user asserted safelen >= DIST consecutive iterations
260 can be executed concurrently, assume independence.
262 ??? An alternative would be to add the alias check even
263 in this case, and vectorize the fallback loop with the
264 maximum VF set to safelen. However, if the user has
265 explicitly given a length, it's less likely that that
267 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
269 if ((unsigned int) loop
->safelen
< *max_vf
)
270 *max_vf
= loop
->safelen
;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
275 /* For dependence distances of 2 or more, we have the option
276 of limiting VF or checking for an alias at runtime.
277 Prefer to check at runtime if we can, to avoid limiting
278 the VF unnecessarily when the bases are in fact independent.
280 Note that the alias checks will be removed if the VF ends up
281 being small enough. */
282 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
283 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
284 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a
->stmt
)
285 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b
->stmt
)
286 && vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
));
293 /* Function vect_analyze_data_ref_dependence.
295 FIXME: I needed to change the sense of the returned flag.
297 Return FALSE if there (might) exist a dependence between a memory-reference
298 DRA and a memory-reference DRB. When versioning for alias may check a
299 dependence at run-time, return TRUE. Adjust *MAX_VF according to
300 the data dependence. */
303 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
304 loop_vec_info loop_vinfo
,
305 unsigned int *max_vf
)
308 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
309 struct data_reference
*dra
= DDR_A (ddr
);
310 struct data_reference
*drb
= DDR_B (ddr
);
311 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (dra
);
312 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (drb
);
313 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
314 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
315 lambda_vector dist_v
;
316 unsigned int loop_depth
;
318 /* In loop analysis all data references should be vectorizable. */
319 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
320 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
323 /* Independent data accesses. */
324 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
325 return opt_result::success ();
328 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
329 return opt_result::success ();
331 /* We do not have to consider dependences between accesses that belong
332 to the same group, unless the stride could be smaller than the
334 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
335 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
336 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
337 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
338 return opt_result::success ();
340 /* Even if we have an anti-dependence then, as the vectorized loop covers at
341 least two scalar iterations, there is always also a true dependence.
342 As the vectorizer does not re-order loads and stores we can ignore
343 the anti-dependence if TBAA can disambiguate both DRs similar to the
344 case with known negative distance anti-dependences (positive
345 distance anti-dependences would violate TBAA constraints). */
346 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
347 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
348 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
349 get_alias_set (DR_REF (drb
))))
350 return opt_result::success ();
352 /* Unknown data dependence. */
353 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
355 /* If user asserted safelen consecutive iterations can be
356 executed concurrently, assume independence. */
357 if (loop
->safelen
>= 2)
359 if ((unsigned int) loop
->safelen
< *max_vf
)
360 *max_vf
= loop
->safelen
;
361 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
362 return opt_result::success ();
365 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
366 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
367 return opt_result::failure_at
369 "versioning for alias not supported for: "
370 "can't determine dependence between %T and %T\n",
371 DR_REF (dra
), DR_REF (drb
));
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
375 "versioning for alias required: "
376 "can't determine dependence between %T and %T\n",
377 DR_REF (dra
), DR_REF (drb
));
379 /* Add to list of ddrs that need to be tested at run-time. */
380 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
383 /* Known data dependence. */
384 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
386 /* If user asserted safelen consecutive iterations can be
387 executed concurrently, assume independence. */
388 if (loop
->safelen
>= 2)
390 if ((unsigned int) loop
->safelen
< *max_vf
)
391 *max_vf
= loop
->safelen
;
392 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
393 return opt_result::success ();
396 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
397 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
398 return opt_result::failure_at
400 "versioning for alias not supported for: "
401 "bad dist vector for %T and %T\n",
402 DR_REF (dra
), DR_REF (drb
));
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
406 "versioning for alias required: "
407 "bad dist vector for %T and %T\n",
408 DR_REF (dra
), DR_REF (drb
));
409 /* Add to list of ddrs that need to be tested at run-time. */
410 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
413 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
415 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
416 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
418 return opt_result::success ();
420 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
422 int dist
= dist_v
[loop_depth
];
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE
, vect_location
,
426 "dependence distance = %d.\n", dist
);
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_NOTE
, vect_location
,
432 "dependence distance == 0 between %T and %T\n",
433 DR_REF (dra
), DR_REF (drb
));
435 /* When we perform grouped accesses and perform implicit CSE
436 by detecting equal accesses and doing disambiguation with
437 runtime alias tests like for
445 where we will end up loading { a[i], a[i+1] } once, make
446 sure that inserting group loads before the first load and
447 stores after the last store will do the right thing.
448 Similar for groups like
452 where loads from the group interleave with the store. */
453 if (!vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
454 return opt_result::failure_at (stmtinfo_a
->stmt
,
455 "READ_WRITE dependence"
456 " in interleaving.\n");
458 if (loop
->safelen
< 2)
460 tree indicator
= dr_zero_step_indicator (dra
);
461 if (!indicator
|| integer_zerop (indicator
))
462 return opt_result::failure_at (stmtinfo_a
->stmt
,
463 "access also has a zero step\n");
464 else if (TREE_CODE (indicator
) != INTEGER_CST
)
465 vect_check_nonzero_value (loop_vinfo
, indicator
);
470 if (dist
> 0 && DDR_REVERSED_P (ddr
))
472 /* If DDR_REVERSED_P the order of the data-refs in DDR was
473 reversed (to make distance vector positive), and the actual
474 distance is negative. */
475 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
477 "dependence distance negative.\n");
478 /* Record a negative dependence distance to later limit the
479 amount of stmt copying / unrolling we can perform.
480 Only need to handle read-after-write dependence. */
482 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
483 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
484 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
488 unsigned int abs_dist
= abs (dist
);
489 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
491 /* The dependence distance requires reduction of the maximal
492 vectorization factor. */
493 *max_vf
= abs (dist
);
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE
, vect_location
,
496 "adjusting maximal vectorization factor to %i\n",
500 if (abs_dist
>= *max_vf
)
502 /* Dependence distance does not create dependence, as far as
503 vectorization is concerned, in this case. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE
, vect_location
,
506 "dependence distance >= VF.\n");
510 return opt_result::failure_at (stmtinfo_a
->stmt
,
511 "not vectorized, possible dependence "
512 "between data-refs %T and %T\n",
513 DR_REF (dra
), DR_REF (drb
));
516 return opt_result::success ();
519 /* Function vect_analyze_data_ref_dependences.
521 Examine all the data references in the loop, and make sure there do not
522 exist any data dependences between them. Set *MAX_VF according to
523 the maximum vectorization factor the data dependences allow. */
526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
527 unsigned int *max_vf
)
530 struct data_dependence_relation
*ddr
;
532 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
534 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
536 LOOP_VINFO_DDRS (loop_vinfo
)
537 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
538 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
539 /* We need read-read dependences to compute
540 STMT_VINFO_SAME_ALIGN_REFS. */
541 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
542 &LOOP_VINFO_DDRS (loop_vinfo
),
543 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
548 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
550 /* For epilogues we either have no aliases or alias versioning
551 was applied to original loop. Therefore we may just get max_vf
552 using VF of original loop. */
553 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
554 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
556 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
559 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
564 return opt_result::success ();
568 /* Function vect_slp_analyze_data_ref_dependence.
570 Return TRUE if there (might) exist a dependence between a memory-reference
571 DRA and a memory-reference DRB for VINFO. When versioning for alias
572 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
573 according to the data dependence. */
576 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
577 struct data_dependence_relation
*ddr
)
579 struct data_reference
*dra
= DDR_A (ddr
);
580 struct data_reference
*drb
= DDR_B (ddr
);
581 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
582 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
584 /* We need to check dependences of statements marked as unvectorizable
585 as well, they still can prohibit vectorization. */
587 /* Independent data accesses. */
588 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
594 /* Read-read is OK. */
595 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
598 /* If dra and drb are part of the same interleaving chain consider
600 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
601 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
602 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
605 /* Unknown data dependence. */
606 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
610 "can't determine dependence between %T and %T\n",
611 DR_REF (dra
), DR_REF (drb
));
613 else if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE
, vect_location
,
615 "determined dependence between %T and %T\n",
616 DR_REF (dra
), DR_REF (drb
));
622 /* Analyze dependences involved in the transform of SLP NODE. STORES
623 contain the vector of scalar stores of this instance if we are
624 disambiguating the loads. */
627 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
628 vec
<stmt_vec_info
> stores
,
629 stmt_vec_info last_store_info
)
631 /* This walks over all stmts involved in the SLP load/store done
632 in NODE verifying we can sink them up to the last stmt in the
634 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
635 vec_info
*vinfo
= last_access_info
->vinfo
;
636 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
638 stmt_vec_info access_info
= SLP_TREE_SCALAR_STMTS (node
)[k
];
639 if (access_info
== last_access_info
)
641 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
643 bool ref_initialized_p
= false;
644 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
645 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
647 gimple
*stmt
= gsi_stmt (gsi
);
648 if (! gimple_vuse (stmt
)
649 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
652 /* If we couldn't record a (single) data reference for this
653 stmt we have to resort to the alias oracle. */
654 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
655 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
658 /* We are moving a store or sinking a load - this means
659 we cannot use TBAA for disambiguation. */
660 if (!ref_initialized_p
)
661 ao_ref_init (&ref
, DR_REF (dr_a
));
662 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
663 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
668 bool dependent
= false;
669 /* If we run into a store of this same instance (we've just
670 marked those) then delay dependence checking until we run
671 into the last store because this is where it will have
672 been sunk to (and we verify if we can do that as well). */
673 if (gimple_visited_p (stmt
))
675 if (stmt_info
!= last_store_info
)
678 stmt_vec_info store_info
;
679 FOR_EACH_VEC_ELT (stores
, i
, store_info
)
681 data_reference
*store_dr
= STMT_VINFO_DATA_REF (store_info
);
682 ddr_p ddr
= initialize_data_dependence_relation
683 (dr_a
, store_dr
, vNULL
);
685 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
686 free_dependence_relation (ddr
);
693 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
695 dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
696 free_dependence_relation (ddr
);
706 /* Function vect_analyze_data_ref_dependences.
708 Examine all the data references in the basic-block, and make sure there
709 do not exist any data dependences between them. Set *MAX_VF according to
710 the maximum vectorization factor the data dependences allow. */
713 vect_slp_analyze_instance_dependence (slp_instance instance
)
715 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
717 /* The stores of this instance are at the root of the SLP tree. */
718 slp_tree store
= SLP_INSTANCE_TREE (instance
);
719 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store
)[0]))
722 /* Verify we can sink stores to the vectorized stmt insert location. */
723 stmt_vec_info last_store_info
= NULL
;
726 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
729 /* Mark stores in this instance and remember the last one. */
730 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
731 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
732 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
737 /* Verify we can sink loads to the vectorized stmt insert location,
738 special-casing stores of this instance. */
741 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
742 if (! vect_slp_analyze_node_dependences (instance
, load
,
744 ? SLP_TREE_SCALAR_STMTS (store
)
745 : vNULL
, last_store_info
))
751 /* Unset the visited flag. */
753 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
754 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
759 /* Record the base alignment guarantee given by DRB, which occurs
763 vect_record_base_alignment (stmt_vec_info stmt_info
,
764 innermost_loop_behavior
*drb
)
766 vec_info
*vinfo
= stmt_info
->vinfo
;
768 innermost_loop_behavior
*&entry
769 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
770 if (!existed
|| entry
->base_alignment
< drb
->base_alignment
)
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE
, vect_location
,
775 "recording new base alignment for %T\n"
777 " misalignment: %d\n"
781 drb
->base_misalignment
,
786 /* If the region we're going to vectorize is reached, all unconditional
787 data references occur at least once. We can therefore pool the base
788 alignment guarantees from each unconditional reference. Do this by
789 going through all the data references in VINFO and checking whether
790 the containing statement makes the reference unconditionally. If so,
791 record the alignment of the base address in VINFO so that it can be
792 used for all other references with the same base. */
795 vect_record_base_alignments (vec_info
*vinfo
)
797 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
798 struct loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
801 FOR_EACH_VEC_ELT (vinfo
->shared
->datarefs
, i
, dr
)
803 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
804 stmt_vec_info stmt_info
= dr_info
->stmt
;
805 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
806 && STMT_VINFO_VECTORIZABLE (stmt_info
)
807 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
809 vect_record_base_alignment (stmt_info
, &DR_INNERMOST (dr
));
811 /* If DR is nested in the loop that is being vectorized, we can also
812 record the alignment of the base wrt the outer loop. */
813 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
814 vect_record_base_alignment
815 (stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
820 /* Return the target alignment for the vectorized form of DR_INFO. */
823 vect_calculate_target_alignment (dr_vec_info
*dr_info
)
825 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
826 return targetm
.vectorize
.preferred_vector_alignment (vectype
);
829 /* Function vect_compute_data_ref_alignment
831 Compute the misalignment of the data reference DR_INFO.
834 1. DR_MISALIGNMENT (DR_INFO) is defined.
836 FOR NOW: No analysis is actually performed. Misalignment is calculated
837 only for trivial cases. TODO. */
840 vect_compute_data_ref_alignment (dr_vec_info
*dr_info
)
842 stmt_vec_info stmt_info
= dr_info
->stmt
;
843 vec_base_alignments
*base_alignments
= &stmt_info
->vinfo
->base_alignments
;
844 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
845 struct loop
*loop
= NULL
;
846 tree ref
= DR_REF (dr_info
->dr
);
847 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE
, vect_location
,
851 "vect_compute_data_ref_alignment:\n");
854 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
856 /* Initialize misalignment to unknown. */
857 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
859 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
862 innermost_loop_behavior
*drb
= vect_dr_behavior (dr_info
);
863 bool step_preserves_misalignment_p
;
865 poly_uint64 vector_alignment
866 = exact_div (vect_calculate_target_alignment (dr_info
), BITS_PER_UNIT
);
867 DR_TARGET_ALIGNMENT (dr_info
) = vector_alignment
;
869 unsigned HOST_WIDE_INT vect_align_c
;
870 if (!vector_alignment
.is_constant (&vect_align_c
))
873 /* No step for BB vectorization. */
876 gcc_assert (integer_zerop (drb
->step
));
877 step_preserves_misalignment_p
= true;
880 /* In case the dataref is in an inner-loop of the loop that is being
881 vectorized (LOOP), we use the base and misalignment information
882 relative to the outer-loop (LOOP). This is ok only if the misalignment
883 stays the same throughout the execution of the inner-loop, which is why
884 we have to check that the stride of the dataref in the inner-loop evenly
885 divides by the vector alignment. */
886 else if (nested_in_vect_loop_p (loop
, stmt_info
))
888 step_preserves_misalignment_p
889 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
891 if (dump_enabled_p ())
893 if (step_preserves_misalignment_p
)
894 dump_printf_loc (MSG_NOTE
, vect_location
,
895 "inner step divides the vector alignment.\n");
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
898 "inner step doesn't divide the vector"
903 /* Similarly we can only use base and misalignment information relative to
904 an innermost loop if the misalignment stays the same throughout the
905 execution of the loop. As above, this is the case if the stride of
906 the dataref evenly divides by the alignment. */
909 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
910 step_preserves_misalignment_p
911 = multiple_p (DR_STEP_ALIGNMENT (dr_info
->dr
) * vf
, vect_align_c
);
913 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
915 "step doesn't divide the vector alignment.\n");
918 unsigned int base_alignment
= drb
->base_alignment
;
919 unsigned int base_misalignment
= drb
->base_misalignment
;
921 /* Calculate the maximum of the pooled base address alignment and the
922 alignment that we can compute for DR itself. */
923 innermost_loop_behavior
**entry
= base_alignments
->get (drb
->base_address
);
924 if (entry
&& base_alignment
< (*entry
)->base_alignment
)
926 base_alignment
= (*entry
)->base_alignment
;
927 base_misalignment
= (*entry
)->base_misalignment
;
930 if (drb
->offset_alignment
< vect_align_c
931 || !step_preserves_misalignment_p
932 /* We need to know whether the step wrt the vectorized loop is
933 negative when computing the starting misalignment below. */
934 || TREE_CODE (drb
->step
) != INTEGER_CST
)
936 if (dump_enabled_p ())
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
938 "Unknown alignment for access: %T\n", ref
);
942 if (base_alignment
< vect_align_c
)
944 unsigned int max_alignment
;
945 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
946 if (max_alignment
< vect_align_c
947 || !vect_can_force_dr_alignment_p (base
,
948 vect_align_c
* BITS_PER_UNIT
))
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE
, vect_location
,
952 "can't force alignment of ref: %T\n", ref
);
956 /* Force the alignment of the decl.
957 NOTE: This is the only change to the code we make during
958 the analysis phase, before deciding to vectorize the loop. */
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE
, vect_location
,
961 "force alignment of %T\n", ref
);
963 dr_info
->base_decl
= base
;
964 dr_info
->base_misaligned
= true;
965 base_misalignment
= 0;
967 poly_int64 misalignment
968 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
970 /* If this is a backward running DR then first access in the larger
971 vectype actually is N-1 elements before the address in the DR.
972 Adjust misalign accordingly. */
973 if (tree_int_cst_sgn (drb
->step
) < 0)
974 /* PLUS because STEP is negative. */
975 misalignment
+= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
976 * TREE_INT_CST_LOW (drb
->step
));
978 unsigned int const_misalignment
;
979 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
983 "Non-constant misalignment for access: %T\n", ref
);
987 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
989 if (dump_enabled_p ())
990 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
991 "misalign = %d bytes of ref %T\n",
992 DR_MISALIGNMENT (dr_info
), ref
);
997 /* Function vect_update_misalignment_for_peel.
998 Sets DR_INFO's misalignment
999 - to 0 if it has the same alignment as DR_PEEL_INFO,
1000 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1001 - to -1 (unknown) otherwise.
1003 DR_INFO - the data reference whose misalignment is to be adjusted.
1004 DR_PEEL_INFO - the data reference whose misalignment is being made
1005 zero in the vector loop by the peel.
1006 NPEEL - the number of iterations in the peel loop if the misalignment
1007 of DR_PEEL_INFO is known at compile time. */
1010 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1011 dr_vec_info
*dr_peel_info
, int npeel
)
1014 vec
<dr_p
> same_aligned_drs
;
1015 struct data_reference
*current_dr
;
1016 stmt_vec_info peel_stmt_info
= dr_peel_info
->stmt
;
1018 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1019 it is aligned in the vector loop. */
1020 same_aligned_drs
= STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info
);
1021 FOR_EACH_VEC_ELT (same_aligned_drs
, i
, current_dr
)
1023 if (current_dr
!= dr_info
->dr
)
1025 gcc_assert (!known_alignment_for_access_p (dr_info
)
1026 || !known_alignment_for_access_p (dr_peel_info
)
1027 || (DR_MISALIGNMENT (dr_info
)
1028 == DR_MISALIGNMENT (dr_peel_info
)));
1029 SET_DR_MISALIGNMENT (dr_info
, 0);
1033 unsigned HOST_WIDE_INT alignment
;
1034 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1035 && known_alignment_for_access_p (dr_info
)
1036 && known_alignment_for_access_p (dr_peel_info
))
1038 int misal
= DR_MISALIGNMENT (dr_info
);
1039 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1040 misal
&= alignment
- 1;
1041 SET_DR_MISALIGNMENT (dr_info
, misal
);
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1047 "to unknown (-1).\n");
1048 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1052 /* Function verify_data_ref_alignment
1054 Return TRUE if DR_INFO can be handled with respect to alignment. */
1057 verify_data_ref_alignment (dr_vec_info
*dr_info
)
1059 enum dr_alignment_support supportable_dr_alignment
1060 = vect_supportable_dr_alignment (dr_info
, false);
1061 if (!supportable_dr_alignment
)
1062 return opt_result::failure_at
1063 (dr_info
->stmt
->stmt
,
1064 DR_IS_READ (dr_info
->dr
)
1065 ? "not vectorized: unsupported unaligned load: %T\n"
1066 : "not vectorized: unsupported unaligned store: %T\n",
1067 DR_REF (dr_info
->dr
));
1069 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
1070 dump_printf_loc (MSG_NOTE
, vect_location
,
1071 "Vectorizing an unaligned access.\n");
1073 return opt_result::success ();
1076 /* Function vect_verify_datarefs_alignment
1078 Return TRUE if all data references in the loop can be
1079 handled with respect to alignment. */
1082 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
1084 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
1085 struct data_reference
*dr
;
1088 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1090 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
1091 stmt_vec_info stmt_info
= dr_info
->stmt
;
1093 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1096 /* For interleaving, only the alignment of the first access matters. */
1097 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1098 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1101 /* Strided accesses perform only component accesses, alignment is
1102 irrelevant for them. */
1103 if (STMT_VINFO_STRIDED_P (stmt_info
)
1104 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1107 opt_result res
= verify_data_ref_alignment (dr_info
);
1112 return opt_result::success ();
1115 /* Given a memory reference EXP return whether its alignment is less
1119 not_size_aligned (tree exp
)
1121 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1124 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1125 > get_object_alignment (exp
));
1128 /* Function vector_alignment_reachable_p
1130 Return true if vector alignment for DR_INFO is reachable by peeling
1131 a few loop iterations. Return false otherwise. */
1134 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
1136 stmt_vec_info stmt_info
= dr_info
->stmt
;
1137 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1139 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1141 /* For interleaved access we peel only if number of iterations in
1142 the prolog loop ({VF - misalignment}), is a multiple of the
1143 number of the interleaved accesses. */
1144 int elem_size
, mis_in_elements
;
1146 /* FORNOW: handle only known alignment. */
1147 if (!known_alignment_for_access_p (dr_info
))
1150 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1151 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1152 elem_size
= vector_element_size (vector_size
, nelements
);
1153 mis_in_elements
= DR_MISALIGNMENT (dr_info
) / elem_size
;
1155 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1159 /* If misalignment is known at the compile time then allow peeling
1160 only if natural alignment is reachable through peeling. */
1161 if (known_alignment_for_access_p (dr_info
) && !aligned_access_p (dr_info
))
1163 HOST_WIDE_INT elmsize
=
1164 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1165 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_NOTE
, vect_location
,
1168 "data size = %wd. misalignment = %d.\n", elmsize
,
1169 DR_MISALIGNMENT (dr_info
));
1171 if (DR_MISALIGNMENT (dr_info
) % elmsize
)
1173 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1175 "data size does not divide the misalignment.\n");
1180 if (!known_alignment_for_access_p (dr_info
))
1182 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1183 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1186 "Unknown misalignment, %snaturally aligned\n",
1187 is_packed
? "not " : "");
1188 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1195 /* Calculate the cost of the memory access represented by DR_INFO. */
1198 vect_get_data_access_cost (dr_vec_info
*dr_info
,
1199 unsigned int *inside_cost
,
1200 unsigned int *outside_cost
,
1201 stmt_vector_for_cost
*body_cost_vec
,
1202 stmt_vector_for_cost
*prologue_cost_vec
)
1204 stmt_vec_info stmt_info
= dr_info
->stmt
;
1205 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1208 if (PURE_SLP_STMT (stmt_info
))
1211 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1213 if (DR_IS_READ (dr_info
->dr
))
1214 vect_get_load_cost (stmt_info
, ncopies
, true, inside_cost
, outside_cost
,
1215 prologue_cost_vec
, body_cost_vec
, false);
1217 vect_get_store_cost (stmt_info
, ncopies
, inside_cost
, body_cost_vec
);
1219 if (dump_enabled_p ())
1220 dump_printf_loc (MSG_NOTE
, vect_location
,
1221 "vect_get_data_access_cost: inside_cost = %d, "
1222 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1226 typedef struct _vect_peel_info
1228 dr_vec_info
*dr_info
;
1233 typedef struct _vect_peel_extended_info
1235 struct _vect_peel_info peel_info
;
1236 unsigned int inside_cost
;
1237 unsigned int outside_cost
;
1238 } *vect_peel_extended_info
;
1241 /* Peeling hashtable helpers. */
1243 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1245 static inline hashval_t
hash (const _vect_peel_info
*);
1246 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1250 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1252 return (hashval_t
) peel_info
->npeel
;
1256 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1258 return (a
->npeel
== b
->npeel
);
1262 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1265 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1266 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1269 struct _vect_peel_info elem
, *slot
;
1270 _vect_peel_info
**new_slot
;
1271 bool supportable_dr_alignment
1272 = vect_supportable_dr_alignment (dr_info
, true);
1275 slot
= peeling_htab
->find (&elem
);
1280 slot
= XNEW (struct _vect_peel_info
);
1281 slot
->npeel
= npeel
;
1282 slot
->dr_info
= dr_info
;
1284 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1288 if (!supportable_dr_alignment
1289 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1290 slot
->count
+= VECT_MAX_COST
;
1294 /* Traverse peeling hash table to find peeling option that aligns maximum
1295 number of data accesses. */
1298 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1299 _vect_peel_extended_info
*max
)
1301 vect_peel_info elem
= *slot
;
1303 if (elem
->count
> max
->peel_info
.count
1304 || (elem
->count
== max
->peel_info
.count
1305 && max
->peel_info
.npeel
> elem
->npeel
))
1307 max
->peel_info
.npeel
= elem
->npeel
;
1308 max
->peel_info
.count
= elem
->count
;
1309 max
->peel_info
.dr_info
= elem
->dr_info
;
1315 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1316 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1317 we assume DR0_INFO's misalignment will be zero after peeling. */
1320 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1321 dr_vec_info
*dr0_info
,
1322 unsigned int *inside_cost
,
1323 unsigned int *outside_cost
,
1324 stmt_vector_for_cost
*body_cost_vec
,
1325 stmt_vector_for_cost
*prologue_cost_vec
,
1327 bool unknown_misalignment
)
1329 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1333 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1335 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1336 stmt_vec_info stmt_info
= dr_info
->stmt
;
1337 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1340 /* For interleaving, only the alignment of the first access
1342 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1343 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1346 /* Strided accesses perform only component accesses, alignment is
1347 irrelevant for them. */
1348 if (STMT_VINFO_STRIDED_P (stmt_info
)
1349 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1352 int save_misalignment
;
1353 save_misalignment
= DR_MISALIGNMENT (dr_info
);
1356 else if (unknown_misalignment
&& dr_info
== dr0_info
)
1357 SET_DR_MISALIGNMENT (dr_info
, 0);
1359 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
1360 vect_get_data_access_cost (dr_info
, inside_cost
, outside_cost
,
1361 body_cost_vec
, prologue_cost_vec
);
1362 SET_DR_MISALIGNMENT (dr_info
, save_misalignment
);
1366 /* Traverse peeling hash table and calculate cost for each peeling option.
1367 Find the one with the lowest cost. */
1370 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1371 _vect_peel_extended_info
*min
)
1373 vect_peel_info elem
= *slot
;
1375 unsigned int inside_cost
= 0, outside_cost
= 0;
1376 stmt_vec_info stmt_info
= elem
->dr_info
->stmt
;
1377 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1378 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1381 prologue_cost_vec
.create (2);
1382 body_cost_vec
.create (2);
1383 epilogue_cost_vec
.create (2);
1385 vect_get_peeling_costs_all_drs (loop_vinfo
, elem
->dr_info
, &inside_cost
,
1386 &outside_cost
, &body_cost_vec
,
1387 &prologue_cost_vec
, elem
->npeel
, false);
1389 body_cost_vec
.release ();
1391 outside_cost
+= vect_get_known_peeling_cost
1392 (loop_vinfo
, elem
->npeel
, &dummy
,
1393 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1394 &prologue_cost_vec
, &epilogue_cost_vec
);
1396 /* Prologue and epilogue costs are added to the target model later.
1397 These costs depend only on the scalar iteration cost, the
1398 number of peeling iterations finally chosen, and the number of
1399 misaligned statements. So discard the information found here. */
1400 prologue_cost_vec
.release ();
1401 epilogue_cost_vec
.release ();
1403 if (inside_cost
< min
->inside_cost
1404 || (inside_cost
== min
->inside_cost
1405 && outside_cost
< min
->outside_cost
))
1407 min
->inside_cost
= inside_cost
;
1408 min
->outside_cost
= outside_cost
;
1409 min
->peel_info
.dr_info
= elem
->dr_info
;
1410 min
->peel_info
.npeel
= elem
->npeel
;
1411 min
->peel_info
.count
= elem
->count
;
1418 /* Choose best peeling option by traversing peeling hash table and either
1419 choosing an option with the lowest cost (if cost model is enabled) or the
1420 option that aligns as many accesses as possible. */
1422 static struct _vect_peel_extended_info
1423 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1424 loop_vec_info loop_vinfo
)
1426 struct _vect_peel_extended_info res
;
1428 res
.peel_info
.dr_info
= NULL
;
1430 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1432 res
.inside_cost
= INT_MAX
;
1433 res
.outside_cost
= INT_MAX
;
1434 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1435 vect_peeling_hash_get_lowest_cost
> (&res
);
1439 res
.peel_info
.count
= 0;
1440 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1441 vect_peeling_hash_get_most_frequent
> (&res
);
1442 res
.inside_cost
= 0;
1443 res
.outside_cost
= 0;
1449 /* Return true if the new peeling NPEEL is supported. */
1452 vect_peeling_supportable (loop_vec_info loop_vinfo
, dr_vec_info
*dr0_info
,
1456 struct data_reference
*dr
= NULL
;
1457 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1458 enum dr_alignment_support supportable_dr_alignment
;
1460 /* Ensure that all data refs can be vectorized after the peel. */
1461 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1463 int save_misalignment
;
1465 if (dr
== dr0_info
->dr
)
1468 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1469 stmt_vec_info stmt_info
= dr_info
->stmt
;
1470 /* For interleaving, only the alignment of the first access
1472 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1473 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1476 /* Strided accesses perform only component accesses, alignment is
1477 irrelevant for them. */
1478 if (STMT_VINFO_STRIDED_P (stmt_info
)
1479 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1482 save_misalignment
= DR_MISALIGNMENT (dr_info
);
1483 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
1484 supportable_dr_alignment
1485 = vect_supportable_dr_alignment (dr_info
, false);
1486 SET_DR_MISALIGNMENT (dr_info
, save_misalignment
);
1488 if (!supportable_dr_alignment
)
1495 /* Function vect_enhance_data_refs_alignment
1497 This pass will use loop versioning and loop peeling in order to enhance
1498 the alignment of data references in the loop.
1500 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1501 original loop is to be vectorized. Any other loops that are created by
1502 the transformations performed in this pass - are not supposed to be
1503 vectorized. This restriction will be relaxed.
1505 This pass will require a cost model to guide it whether to apply peeling
1506 or versioning or a combination of the two. For example, the scheme that
1507 intel uses when given a loop with several memory accesses, is as follows:
1508 choose one memory access ('p') which alignment you want to force by doing
1509 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1510 other accesses are not necessarily aligned, or (2) use loop versioning to
1511 generate one loop in which all accesses are aligned, and another loop in
1512 which only 'p' is necessarily aligned.
1514 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1515 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1516 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1518 Devising a cost model is the most critical aspect of this work. It will
1519 guide us on which access to peel for, whether to use loop versioning, how
1520 many versions to create, etc. The cost model will probably consist of
1521 generic considerations as well as target specific considerations (on
1522 powerpc for example, misaligned stores are more painful than misaligned
1525 Here are the general steps involved in alignment enhancements:
1527 -- original loop, before alignment analysis:
1528 for (i=0; i<N; i++){
1529 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1530 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1533 -- After vect_compute_data_refs_alignment:
1534 for (i=0; i<N; i++){
1535 x = q[i]; # DR_MISALIGNMENT(q) = 3
1536 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1539 -- Possibility 1: we do loop versioning:
1541 for (i=0; i<N; i++){ # loop 1A
1542 x = q[i]; # DR_MISALIGNMENT(q) = 3
1543 p[i] = y; # DR_MISALIGNMENT(p) = 0
1547 for (i=0; i<N; i++){ # loop 1B
1548 x = q[i]; # DR_MISALIGNMENT(q) = 3
1549 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1553 -- Possibility 2: we do loop peeling:
1554 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1558 for (i = 3; i < N; i++){ # loop 2A
1559 x = q[i]; # DR_MISALIGNMENT(q) = 0
1560 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1563 -- Possibility 3: combination of loop peeling and versioning:
1564 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1569 for (i = 3; i<N; i++){ # loop 3A
1570 x = q[i]; # DR_MISALIGNMENT(q) = 0
1571 p[i] = y; # DR_MISALIGNMENT(p) = 0
1575 for (i = 3; i<N; i++){ # loop 3B
1576 x = q[i]; # DR_MISALIGNMENT(q) = 0
1577 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1581 These loops are later passed to loop_transform to be vectorized. The
1582 vectorizer will use the alignment information to guide the transformation
1583 (whether to generate regular loads/stores, or with special handling for
1587 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1589 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1590 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1591 enum dr_alignment_support supportable_dr_alignment
;
1592 dr_vec_info
*first_store
= NULL
;
1593 dr_vec_info
*dr0_info
= NULL
;
1594 struct data_reference
*dr
;
1596 bool do_peeling
= false;
1597 bool do_versioning
= false;
1598 unsigned int npeel
= 0;
1599 bool one_misalignment_known
= false;
1600 bool one_misalignment_unknown
= false;
1601 bool one_dr_unsupportable
= false;
1602 dr_vec_info
*unsupportable_dr_info
= NULL
;
1603 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1604 unsigned possible_npeel_number
= 1;
1606 unsigned int mis
, same_align_drs_max
= 0;
1607 hash_table
<peel_info_hasher
> peeling_htab (1);
1609 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1611 /* Reset data so we can safely be called multiple times. */
1612 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1613 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1615 /* While cost model enhancements are expected in the future, the high level
1616 view of the code at this time is as follows:
1618 A) If there is a misaligned access then see if peeling to align
1619 this access can make all data references satisfy
1620 vect_supportable_dr_alignment. If so, update data structures
1621 as needed and return true.
1623 B) If peeling wasn't possible and there is a data reference with an
1624 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1625 then see if loop versioning checks can be used to make all data
1626 references satisfy vect_supportable_dr_alignment. If so, update
1627 data structures as needed and return true.
1629 C) If neither peeling nor versioning were successful then return false if
1630 any data reference does not satisfy vect_supportable_dr_alignment.
1632 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1634 Note, Possibility 3 above (which is peeling and versioning together) is not
1635 being done at this time. */
1637 /* (1) Peeling to force alignment. */
1639 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1641 + How many accesses will become aligned due to the peeling
1642 - How many accesses will become unaligned due to the peeling,
1643 and the cost of misaligned accesses.
1644 - The cost of peeling (the extra runtime checks, the increase
1647 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1649 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1650 stmt_vec_info stmt_info
= dr_info
->stmt
;
1652 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1655 /* For interleaving, only the alignment of the first access
1657 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1658 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1661 /* For scatter-gather or invariant accesses there is nothing
1663 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1664 || integer_zerop (DR_STEP (dr
)))
1667 /* Strided accesses perform only component accesses, alignment is
1668 irrelevant for them. */
1669 if (STMT_VINFO_STRIDED_P (stmt_info
)
1670 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1673 supportable_dr_alignment
= vect_supportable_dr_alignment (dr_info
, true);
1674 do_peeling
= vector_alignment_reachable_p (dr_info
);
1677 if (known_alignment_for_access_p (dr_info
))
1679 unsigned int npeel_tmp
= 0;
1680 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1681 size_zero_node
) < 0;
1683 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1684 /* If known_alignment_for_access_p then we have set
1685 DR_MISALIGNMENT which is only done if we know it at compiler
1686 time, so it is safe to assume target alignment is constant.
1688 unsigned int target_align
=
1689 DR_TARGET_ALIGNMENT (dr_info
).to_constant ();
1690 unsigned int dr_size
= vect_get_scalar_dr_size (dr_info
);
1692 ? DR_MISALIGNMENT (dr_info
)
1693 : -DR_MISALIGNMENT (dr_info
));
1694 if (DR_MISALIGNMENT (dr_info
) != 0)
1695 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1697 /* For multiple types, it is possible that the bigger type access
1698 will have more than one peeling option. E.g., a loop with two
1699 types: one of size (vector size / 4), and the other one of
1700 size (vector size / 8). Vectorization factor will 8. If both
1701 accesses are misaligned by 3, the first one needs one scalar
1702 iteration to be aligned, and the second one needs 5. But the
1703 first one will be aligned also by peeling 5 scalar
1704 iterations, and in that case both accesses will be aligned.
1705 Hence, except for the immediate peeling amount, we also want
1706 to try to add full vector size, while we don't exceed
1707 vectorization factor.
1708 We do this automatically for cost model, since we calculate
1709 cost for every peeling option. */
1710 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1712 poly_uint64 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1713 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1714 possible_npeel_number
1715 = vect_get_num_vectors (nscalars
, vectype
);
1717 /* NPEEL_TMP is 0 when there is no misalignment, but also
1718 allow peeling NELEMENTS. */
1719 if (DR_MISALIGNMENT (dr_info
) == 0)
1720 possible_npeel_number
++;
1723 /* Save info about DR in the hash table. Also include peeling
1724 amounts according to the explanation above. */
1725 for (j
= 0; j
< possible_npeel_number
; j
++)
1727 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1728 dr_info
, npeel_tmp
);
1729 npeel_tmp
+= target_align
/ dr_size
;
1732 one_misalignment_known
= true;
1736 /* If we don't know any misalignment values, we prefer
1737 peeling for data-ref that has the maximum number of data-refs
1738 with the same alignment, unless the target prefers to align
1739 stores over load. */
1740 unsigned same_align_drs
1741 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1743 || same_align_drs_max
< same_align_drs
)
1745 same_align_drs_max
= same_align_drs
;
1748 /* For data-refs with the same number of related
1749 accesses prefer the one where the misalign
1750 computation will be invariant in the outermost loop. */
1751 else if (same_align_drs_max
== same_align_drs
)
1753 struct loop
*ivloop0
, *ivloop
;
1754 ivloop0
= outermost_invariant_loop_for_expr
1755 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
1756 ivloop
= outermost_invariant_loop_for_expr
1757 (loop
, DR_BASE_ADDRESS (dr
));
1758 if ((ivloop
&& !ivloop0
)
1759 || (ivloop
&& ivloop0
1760 && flow_loop_nested_p (ivloop
, ivloop0
)))
1764 one_misalignment_unknown
= true;
1766 /* Check for data refs with unsupportable alignment that
1768 if (!supportable_dr_alignment
)
1770 one_dr_unsupportable
= true;
1771 unsupportable_dr_info
= dr_info
;
1774 if (!first_store
&& DR_IS_WRITE (dr
))
1775 first_store
= dr_info
;
1780 if (!aligned_access_p (dr_info
))
1782 if (dump_enabled_p ())
1783 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1784 "vector alignment may not be reachable\n");
1790 /* Check if we can possibly peel the loop. */
1791 if (!vect_can_advance_ivs_p (loop_vinfo
)
1792 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1796 struct _vect_peel_extended_info peel_for_known_alignment
;
1797 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1798 struct _vect_peel_extended_info best_peel
;
1800 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1801 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1802 peel_for_unknown_alignment
.peel_info
.count
= 0;
1805 && one_misalignment_unknown
)
1807 /* Check if the target requires to prefer stores over loads, i.e., if
1808 misaligned stores are more expensive than misaligned loads (taking
1809 drs with same alignment into account). */
1810 unsigned int load_inside_cost
= 0;
1811 unsigned int load_outside_cost
= 0;
1812 unsigned int store_inside_cost
= 0;
1813 unsigned int store_outside_cost
= 0;
1814 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
1816 stmt_vector_for_cost dummy
;
1818 vect_get_peeling_costs_all_drs (loop_vinfo
, dr0_info
,
1821 &dummy
, &dummy
, estimated_npeels
, true);
1827 vect_get_peeling_costs_all_drs (loop_vinfo
, first_store
,
1829 &store_outside_cost
,
1831 estimated_npeels
, true);
1836 store_inside_cost
= INT_MAX
;
1837 store_outside_cost
= INT_MAX
;
1840 if (load_inside_cost
> store_inside_cost
1841 || (load_inside_cost
== store_inside_cost
1842 && load_outside_cost
> store_outside_cost
))
1844 dr0_info
= first_store
;
1845 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
1846 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
1850 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
1851 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
1854 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1855 prologue_cost_vec
.create (2);
1856 epilogue_cost_vec
.create (2);
1859 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
1860 (loop_vinfo
, estimated_npeels
, &dummy2
,
1861 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1862 &prologue_cost_vec
, &epilogue_cost_vec
);
1864 prologue_cost_vec
.release ();
1865 epilogue_cost_vec
.release ();
1867 peel_for_unknown_alignment
.peel_info
.count
= 1
1868 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info
->stmt
).length ();
1871 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
1872 peel_for_unknown_alignment
.peel_info
.dr_info
= dr0_info
;
1874 best_peel
= peel_for_unknown_alignment
;
1876 peel_for_known_alignment
.inside_cost
= INT_MAX
;
1877 peel_for_known_alignment
.outside_cost
= INT_MAX
;
1878 peel_for_known_alignment
.peel_info
.count
= 0;
1879 peel_for_known_alignment
.peel_info
.dr_info
= NULL
;
1881 if (do_peeling
&& one_misalignment_known
)
1883 /* Peeling is possible, but there is no data access that is not supported
1884 unless aligned. So we try to choose the best possible peeling from
1886 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
1887 (&peeling_htab
, loop_vinfo
);
1890 /* Compare costs of peeling for known and unknown alignment. */
1891 if (peel_for_known_alignment
.peel_info
.dr_info
!= NULL
1892 && peel_for_unknown_alignment
.inside_cost
1893 >= peel_for_known_alignment
.inside_cost
)
1895 best_peel
= peel_for_known_alignment
;
1897 /* If the best peeling for known alignment has NPEEL == 0, perform no
1898 peeling at all except if there is an unsupportable dr that we can
1900 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
1904 /* If there is an unsupportable data ref, prefer this over all choices so far
1905 since we'd have to discard a chosen peeling except when it accidentally
1906 aligned the unsupportable data ref. */
1907 if (one_dr_unsupportable
)
1908 dr0_info
= unsupportable_dr_info
;
1909 else if (do_peeling
)
1911 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1912 TODO: Use nopeel_outside_cost or get rid of it? */
1913 unsigned nopeel_inside_cost
= 0;
1914 unsigned nopeel_outside_cost
= 0;
1916 stmt_vector_for_cost dummy
;
1918 vect_get_peeling_costs_all_drs (loop_vinfo
, NULL
, &nopeel_inside_cost
,
1919 &nopeel_outside_cost
, &dummy
, &dummy
,
1923 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1924 costs will be recorded. */
1925 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1926 prologue_cost_vec
.create (2);
1927 epilogue_cost_vec
.create (2);
1930 nopeel_outside_cost
+= vect_get_known_peeling_cost
1931 (loop_vinfo
, 0, &dummy2
,
1932 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1933 &prologue_cost_vec
, &epilogue_cost_vec
);
1935 prologue_cost_vec
.release ();
1936 epilogue_cost_vec
.release ();
1938 npeel
= best_peel
.peel_info
.npeel
;
1939 dr0_info
= best_peel
.peel_info
.dr_info
;
1941 /* If no peeling is not more expensive than the best peeling we
1942 have so far, don't perform any peeling. */
1943 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
1949 stmt_vec_info stmt_info
= dr0_info
->stmt
;
1950 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1952 if (known_alignment_for_access_p (dr0_info
))
1954 bool negative
= tree_int_cst_compare (DR_STEP (dr0_info
->dr
),
1955 size_zero_node
) < 0;
1958 /* Since it's known at compile time, compute the number of
1959 iterations in the peeled loop (the peeling factor) for use in
1960 updating DR_MISALIGNMENT values. The peeling factor is the
1961 vectorization factor minus the misalignment as an element
1964 ? DR_MISALIGNMENT (dr0_info
)
1965 : -DR_MISALIGNMENT (dr0_info
));
1966 /* If known_alignment_for_access_p then we have set
1967 DR_MISALIGNMENT which is only done if we know it at compiler
1968 time, so it is safe to assume target alignment is constant.
1970 unsigned int target_align
=
1971 DR_TARGET_ALIGNMENT (dr0_info
).to_constant ();
1972 npeel
= ((mis
& (target_align
- 1))
1973 / vect_get_scalar_dr_size (dr0_info
));
1976 /* For interleaved data access every iteration accesses all the
1977 members of the group, therefore we divide the number of iterations
1978 by the group size. */
1979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1980 npeel
/= DR_GROUP_SIZE (stmt_info
);
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE
, vect_location
,
1984 "Try peeling by %d\n", npeel
);
1987 /* Ensure that all datarefs can be vectorized after the peel. */
1988 if (!vect_peeling_supportable (loop_vinfo
, dr0_info
, npeel
))
1991 /* Check if all datarefs are supportable and log. */
1992 if (do_peeling
&& known_alignment_for_access_p (dr0_info
) && npeel
== 0)
1994 opt_result stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2001 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2004 unsigned max_allowed_peel
2005 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
2006 if (max_allowed_peel
!= (unsigned)-1)
2008 unsigned max_peel
= npeel
;
2011 poly_uint64 target_align
= DR_TARGET_ALIGNMENT (dr0_info
);
2012 unsigned HOST_WIDE_INT target_align_c
;
2013 if (target_align
.is_constant (&target_align_c
))
2015 target_align_c
/ vect_get_scalar_dr_size (dr0_info
) - 1;
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE
, vect_location
,
2021 "Disable peeling, max peels set and vector"
2022 " alignment unknown\n");
2025 if (max_peel
> max_allowed_peel
)
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_NOTE
, vect_location
,
2030 "Disable peeling, max peels reached: %d\n", max_peel
);
2035 /* Cost model #2 - if peeling may result in a remaining loop not
2036 iterating enough to be vectorized then do not peel. Since this
2037 is a cost heuristic rather than a correctness decision, use the
2038 most likely runtime value for variable vectorization factors. */
2040 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2042 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2043 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2044 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2045 < assumed_vf
+ max_peel
)
2051 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2052 If the misalignment of DR_i is identical to that of dr0 then set
2053 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2054 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2055 by the peeling factor times the element size of DR_i (MOD the
2056 vectorization factor times the size). Otherwise, the
2057 misalignment of DR_i must be set to unknown. */
2058 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2059 if (dr
!= dr0_info
->dr
)
2061 /* Strided accesses perform only component accesses, alignment
2062 is irrelevant for them. */
2063 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2064 stmt_info
= dr_info
->stmt
;
2065 if (STMT_VINFO_STRIDED_P (stmt_info
)
2066 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2069 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
2072 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0_info
;
2074 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2076 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2077 = DR_MISALIGNMENT (dr0_info
);
2078 SET_DR_MISALIGNMENT (dr0_info
, 0);
2079 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_NOTE
, vect_location
,
2082 "Alignment of access forced using peeling.\n");
2083 dump_printf_loc (MSG_NOTE
, vect_location
,
2084 "Peeling for alignment will be applied.\n");
2087 /* The inside-loop cost will be accounted for in vectorizable_load
2088 and vectorizable_store correctly with adjusted alignments.
2089 Drop the body_cst_vec on the floor here. */
2090 opt_result stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2096 /* (2) Versioning to force alignment. */
2098 /* Try versioning if:
2099 1) optimize loop for speed
2100 2) there is at least one unsupported misaligned data ref with an unknown
2102 3) all misaligned data refs with a known misalignment are supported, and
2103 4) the number of runtime alignment checks is within reason. */
2106 optimize_loop_nest_for_speed_p (loop
)
2107 && (!loop
->inner
); /* FORNOW */
2111 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2113 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2114 stmt_vec_info stmt_info
= dr_info
->stmt
;
2116 /* For interleaving, only the alignment of the first access
2118 if (aligned_access_p (dr_info
)
2119 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2120 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
))
2123 if (STMT_VINFO_STRIDED_P (stmt_info
))
2125 /* Strided loads perform only component accesses, alignment is
2126 irrelevant for them. */
2127 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2129 do_versioning
= false;
2133 supportable_dr_alignment
2134 = vect_supportable_dr_alignment (dr_info
, false);
2136 if (!supportable_dr_alignment
)
2141 if (known_alignment_for_access_p (dr_info
)
2142 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2143 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
2145 do_versioning
= false;
2149 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2150 gcc_assert (vectype
);
2152 /* At present we don't support versioning for alignment
2153 with variable VF, since there's no guarantee that the
2154 VF is a power of two. We could relax this if we added
2155 a way of enforcing a power-of-two size. */
2156 unsigned HOST_WIDE_INT size
;
2157 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2159 do_versioning
= false;
2163 /* Forcing alignment in the first iteration is no good if
2164 we don't keep it across iterations. For now, just disable
2165 versioning in this case.
2166 ?? We could actually unroll the loop to achieve the required
2167 overall step alignment, and forcing the alignment could be
2168 done by doing some iterations of the non-vectorized loop. */
2169 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
2170 * DR_STEP_ALIGNMENT (dr
),
2171 DR_TARGET_ALIGNMENT (dr_info
)))
2173 do_versioning
= false;
2177 /* The rightmost bits of an aligned address must be zeros.
2178 Construct the mask needed for this test. For example,
2179 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2180 mask must be 15 = 0xf. */
2183 /* FORNOW: use the same mask to test all potentially unaligned
2184 references in the loop. The vectorizer currently supports
2185 a single vector size, see the reference to
2186 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2187 vectorization factor is computed. */
2188 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
2189 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
2190 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2191 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (stmt_info
);
2195 /* Versioning requires at least one misaligned data reference. */
2196 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2197 do_versioning
= false;
2198 else if (!do_versioning
)
2199 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2204 vec
<stmt_vec_info
> may_misalign_stmts
2205 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2206 stmt_vec_info stmt_info
;
2208 /* It can now be assumed that the data references in the statements
2209 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2210 of the loop being vectorized. */
2211 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt_info
)
2213 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
2214 SET_DR_MISALIGNMENT (dr_info
, 0);
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_NOTE
, vect_location
,
2217 "Alignment of access forced using versioning.\n");
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_NOTE
, vect_location
,
2222 "Versioning for alignment will be applied.\n");
2224 /* Peeling and versioning can't be done together at this time. */
2225 gcc_assert (! (do_peeling
&& do_versioning
));
2227 opt_result stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2232 /* This point is reached if neither peeling nor versioning is being done. */
2233 gcc_assert (! (do_peeling
|| do_versioning
));
2235 opt_result stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2240 /* Function vect_find_same_alignment_drs.
2242 Update group and alignment relations in VINFO according to the chosen
2243 vectorization factor. */
2246 vect_find_same_alignment_drs (vec_info
*vinfo
, data_dependence_relation
*ddr
)
2248 struct data_reference
*dra
= DDR_A (ddr
);
2249 struct data_reference
*drb
= DDR_B (ddr
);
2250 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
2251 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
2252 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
2253 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
2255 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2261 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
2262 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
2265 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0)
2266 || !operand_equal_p (DR_OFFSET (dra
), DR_OFFSET (drb
), 0)
2267 || !operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2270 /* Two references with distance zero have the same alignment. */
2271 poly_offset_int diff
= (wi::to_poly_offset (DR_INIT (dra
))
2272 - wi::to_poly_offset (DR_INIT (drb
)));
2273 if (maybe_ne (diff
, 0))
2275 /* Get the wider of the two alignments. */
2276 poly_uint64 align_a
=
2277 exact_div (vect_calculate_target_alignment (dr_info_a
),
2279 poly_uint64 align_b
=
2280 exact_div (vect_calculate_target_alignment (dr_info_b
),
2282 unsigned HOST_WIDE_INT align_a_c
, align_b_c
;
2283 if (!align_a
.is_constant (&align_a_c
)
2284 || !align_b
.is_constant (&align_b_c
))
2287 unsigned HOST_WIDE_INT max_align
= MAX (align_a_c
, align_b_c
);
2289 /* Require the gap to be a multiple of the larger vector alignment. */
2290 if (!multiple_p (diff
, max_align
))
2294 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2295 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE
, vect_location
,
2298 "accesses have the same alignment: %T and %T\n",
2299 DR_REF (dra
), DR_REF (drb
));
2303 /* Function vect_analyze_data_refs_alignment
2305 Analyze the alignment of the data-references in the loop.
2306 Return FALSE if a data reference is found that cannot be vectorized. */
2309 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2311 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2313 /* Mark groups of data references with same alignment using
2314 data dependence information. */
2315 vec
<ddr_p
> ddrs
= vinfo
->shared
->ddrs
;
2316 struct data_dependence_relation
*ddr
;
2319 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2320 vect_find_same_alignment_drs (vinfo
, ddr
);
2322 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
2323 struct data_reference
*dr
;
2325 vect_record_base_alignments (vinfo
);
2326 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2328 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
2329 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
))
2330 vect_compute_data_ref_alignment (dr_info
);
2333 return opt_result::success ();
2337 /* Analyze alignment of DRs of stmts in NODE. */
2340 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2342 /* We vectorize from the first scalar stmt in the node unless
2343 the node is permuted in which case we start from the first
2344 element in the group. */
2345 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2346 dr_vec_info
*first_dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2347 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2348 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
2350 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2351 vect_compute_data_ref_alignment (dr_info
);
2352 /* For creating the data-ref pointer we need alignment of the
2353 first element anyway. */
2354 if (dr_info
!= first_dr_info
)
2355 vect_compute_data_ref_alignment (first_dr_info
);
2356 if (! verify_data_ref_alignment (dr_info
))
2358 if (dump_enabled_p ())
2359 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2360 "not vectorized: bad data alignment in basic "
2368 /* Function vect_slp_analyze_instance_alignment
2370 Analyze the alignment of the data-references in the SLP instance.
2371 Return FALSE if a data reference is found that cannot be vectorized. */
2374 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2376 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2380 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2381 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2384 node
= SLP_INSTANCE_TREE (instance
);
2385 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node
)[0])
2386 && ! vect_slp_analyze_and_verify_node_alignment
2387 (SLP_INSTANCE_TREE (instance
)))
2394 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2395 accesses of legal size, step, etc. Detect gaps, single element
2396 interleaving, and other special cases. Set grouped access info.
2397 Collect groups of strided stores for further use in SLP analysis.
2398 Worker for vect_analyze_group_access. */
2401 vect_analyze_group_access_1 (dr_vec_info
*dr_info
)
2403 data_reference
*dr
= dr_info
->dr
;
2404 tree step
= DR_STEP (dr
);
2405 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2406 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2407 stmt_vec_info stmt_info
= dr_info
->stmt
;
2408 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2409 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2410 HOST_WIDE_INT dr_step
= -1;
2411 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2412 bool slp_impossible
= false;
2414 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2415 size of the interleaving group (including gaps). */
2416 if (tree_fits_shwi_p (step
))
2418 dr_step
= tree_to_shwi (step
);
2419 /* Check that STEP is a multiple of type size. Otherwise there is
2420 a non-element-sized gap at the end of the group which we
2421 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2422 ??? As we can handle non-constant step fine here we should
2423 simply remove uses of DR_GROUP_GAP between the last and first
2424 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2425 simply not include that gap. */
2426 if ((dr_step
% type_size
) != 0)
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE
, vect_location
,
2430 "Step %T is not a multiple of the element size"
2435 groupsize
= absu_hwi (dr_step
) / type_size
;
2440 /* Not consecutive access is possible only if it is a part of interleaving. */
2441 if (!DR_GROUP_FIRST_ELEMENT (stmt_info
))
2443 /* Check if it this DR is a part of interleaving, and is a single
2444 element of the group that is accessed in the loop. */
2446 /* Gaps are supported only for loads. STEP must be a multiple of the type
2449 && (dr_step
% type_size
) == 0
2452 DR_GROUP_FIRST_ELEMENT (stmt_info
) = stmt_info
;
2453 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2454 DR_GROUP_GAP (stmt_info
) = groupsize
- 1;
2455 if (dump_enabled_p ())
2456 dump_printf_loc (MSG_NOTE
, vect_location
,
2457 "Detected single element interleaving %T"
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2466 "not consecutive access %G", stmt_info
->stmt
);
2470 /* Mark the statement as unvectorizable. */
2471 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2477 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2481 if (DR_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
)
2483 /* First stmt in the interleaving chain. Check the chain. */
2484 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2485 struct data_reference
*data_ref
= dr
;
2486 unsigned int count
= 1;
2487 tree prev_init
= DR_INIT (data_ref
);
2488 stmt_vec_info prev
= stmt_info
;
2489 HOST_WIDE_INT diff
, gaps
= 0;
2491 /* By construction, all group members have INTEGER_CST DR_INITs. */
2494 /* Skip same data-refs. In case that two or more stmts share
2495 data-ref (supported only for loads), we vectorize only the first
2496 stmt, and the rest get their vectorized loads from the first
2498 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2499 DR_INIT (STMT_VINFO_DATA_REF (next
))))
2501 if (DR_IS_WRITE (data_ref
))
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2505 "Two store stmts share the same dr.\n");
2509 if (dump_enabled_p ())
2510 dump_printf_loc (MSG_NOTE
, vect_location
,
2511 "Two or more load stmts share the same dr.\n");
2513 /* For load use the same data-ref load. */
2514 DR_GROUP_SAME_DR_STMT (next
) = prev
;
2517 next
= DR_GROUP_NEXT_ELEMENT (next
);
2522 data_ref
= STMT_VINFO_DATA_REF (next
);
2524 /* All group members have the same STEP by construction. */
2525 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2527 /* Check that the distance between two accesses is equal to the type
2528 size. Otherwise, we have gaps. */
2529 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2530 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2533 /* FORNOW: SLP of accesses with gaps is not supported. */
2534 slp_impossible
= true;
2535 if (DR_IS_WRITE (data_ref
))
2537 if (dump_enabled_p ())
2538 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2539 "interleaved store with gaps\n");
2546 last_accessed_element
+= diff
;
2548 /* Store the gap from the previous member of the group. If there is no
2549 gap in the access, DR_GROUP_GAP is always 1. */
2550 DR_GROUP_GAP (next
) = diff
;
2552 prev_init
= DR_INIT (data_ref
);
2553 next
= DR_GROUP_NEXT_ELEMENT (next
);
2554 /* Count the number of data-refs in the chain. */
2559 groupsize
= count
+ gaps
;
2561 /* This could be UINT_MAX but as we are generating code in a very
2562 inefficient way we have to cap earlier. See PR78699 for example. */
2563 if (groupsize
> 4096)
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2567 "group is too large\n");
2571 /* Check that the size of the interleaving is equal to count for stores,
2572 i.e., that there are no gaps. */
2573 if (groupsize
!= count
2574 && !DR_IS_READ (dr
))
2577 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2580 /* If there is a gap after the last load in the group it is the
2581 difference between the groupsize and the last accessed
2583 When there is no gap, this difference should be 0. */
2584 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
2586 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2587 if (dump_enabled_p ())
2589 dump_printf_loc (MSG_NOTE
, vect_location
,
2590 "Detected interleaving ");
2591 if (DR_IS_READ (dr
))
2592 dump_printf (MSG_NOTE
, "load ");
2593 else if (STMT_VINFO_STRIDED_P (stmt_info
))
2594 dump_printf (MSG_NOTE
, "strided store ");
2596 dump_printf (MSG_NOTE
, "store ");
2597 dump_printf (MSG_NOTE
, "of size %u\n",
2598 (unsigned)groupsize
);
2599 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
2600 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2603 if (DR_GROUP_GAP (next
) != 1)
2604 dump_printf_loc (MSG_NOTE
, vect_location
,
2605 "\t<gap of %d elements>\n",
2606 DR_GROUP_GAP (next
) - 1);
2607 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
2608 next
= DR_GROUP_NEXT_ELEMENT (next
);
2610 if (DR_GROUP_GAP (stmt_info
) != 0)
2611 dump_printf_loc (MSG_NOTE
, vect_location
,
2612 "\t<gap of %d elements>\n",
2613 DR_GROUP_GAP (stmt_info
));
2616 /* SLP: create an SLP data structure for every interleaving group of
2617 stores for further analysis in vect_analyse_slp. */
2618 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2621 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
2623 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
2630 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2631 accesses of legal size, step, etc. Detect gaps, single element
2632 interleaving, and other special cases. Set grouped access info.
2633 Collect groups of strided stores for further use in SLP analysis. */
2636 vect_analyze_group_access (dr_vec_info
*dr_info
)
2638 if (!vect_analyze_group_access_1 (dr_info
))
2640 /* Dissolve the group if present. */
2641 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
2644 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2645 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2646 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
2654 /* Analyze the access pattern of the data-reference DR_INFO.
2655 In case of non-consecutive accesses call vect_analyze_group_access() to
2656 analyze groups of accesses. */
2659 vect_analyze_data_ref_access (dr_vec_info
*dr_info
)
2661 data_reference
*dr
= dr_info
->dr
;
2662 tree step
= DR_STEP (dr
);
2663 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2664 stmt_vec_info stmt_info
= dr_info
->stmt
;
2665 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2666 struct loop
*loop
= NULL
;
2668 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2672 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2674 if (loop_vinfo
&& !step
)
2676 if (dump_enabled_p ())
2677 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2678 "bad data-ref access in loop\n");
2682 /* Allow loads with zero step in inner-loop vectorization. */
2683 if (loop_vinfo
&& integer_zerop (step
))
2685 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2686 if (!nested_in_vect_loop_p (loop
, stmt_info
))
2687 return DR_IS_READ (dr
);
2688 /* Allow references with zero step for outer loops marked
2689 with pragma omp simd only - it guarantees absence of
2690 loop-carried dependencies between inner loop iterations. */
2691 if (loop
->safelen
< 2)
2693 if (dump_enabled_p ())
2694 dump_printf_loc (MSG_NOTE
, vect_location
,
2695 "zero step in inner loop of nest\n");
2700 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2702 /* Interleaved accesses are not yet supported within outer-loop
2703 vectorization for references in the inner-loop. */
2704 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2706 /* For the rest of the analysis we use the outer-loop step. */
2707 step
= STMT_VINFO_DR_STEP (stmt_info
);
2708 if (integer_zerop (step
))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE
, vect_location
,
2712 "zero step in outer loop.\n");
2713 return DR_IS_READ (dr
);
2718 if (TREE_CODE (step
) == INTEGER_CST
)
2720 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2721 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2723 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2725 /* Mark that it is not interleaving. */
2726 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2731 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2733 if (dump_enabled_p ())
2734 dump_printf_loc (MSG_NOTE
, vect_location
,
2735 "grouped access in outer loop.\n");
2740 /* Assume this is a DR handled by non-constant strided load case. */
2741 if (TREE_CODE (step
) != INTEGER_CST
)
2742 return (STMT_VINFO_STRIDED_P (stmt_info
)
2743 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2744 || vect_analyze_group_access (dr_info
)));
2746 /* Not consecutive access - check if it's a part of interleaving group. */
2747 return vect_analyze_group_access (dr_info
);
2750 /* Compare two data-references DRA and DRB to group them into chunks
2751 suitable for grouping. */
2754 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2756 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2757 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2760 /* Stabilize sort. */
2764 /* DRs in different loops never belong to the same group. */
2765 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2766 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2768 return loopa
->num
< loopb
->num
? -1 : 1;
2770 /* Ordering of DRs according to base. */
2771 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2772 DR_BASE_ADDRESS (drb
));
2776 /* And according to DR_OFFSET. */
2777 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2781 /* Put reads before writes. */
2782 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2783 return DR_IS_READ (dra
) ? -1 : 1;
2785 /* Then sort after access size. */
2786 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2787 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2791 /* And after step. */
2792 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2796 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2797 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2799 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2803 /* If OP is the result of a conversion, return the unconverted value,
2804 otherwise return null. */
2807 strip_conversion (tree op
)
2809 if (TREE_CODE (op
) != SSA_NAME
)
2811 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
2812 if (!is_gimple_assign (stmt
)
2813 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
2815 return gimple_assign_rhs1 (stmt
);
2818 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2819 and STMT2_INFO being in a single group. */
2822 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
)
2824 if (gimple_assign_single_p (stmt1_info
->stmt
))
2825 return gimple_assign_single_p (stmt2_info
->stmt
);
2827 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
2828 if (call1
&& gimple_call_internal_p (call1
))
2830 /* Check for two masked loads or two masked stores. */
2831 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
2832 if (!call2
|| !gimple_call_internal_p (call2
))
2834 internal_fn ifn
= gimple_call_internal_fn (call1
);
2835 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
2837 if (ifn
!= gimple_call_internal_fn (call2
))
2840 /* Check that the masks are the same. Cope with casts of masks,
2841 like those created by build_mask_conversion. */
2842 tree mask1
= gimple_call_arg (call1
, 2);
2843 tree mask2
= gimple_call_arg (call2
, 2);
2844 if (!operand_equal_p (mask1
, mask2
, 0))
2846 mask1
= strip_conversion (mask1
);
2849 mask2
= strip_conversion (mask2
);
2852 if (!operand_equal_p (mask1
, mask2
, 0))
2861 /* Function vect_analyze_data_ref_accesses.
2863 Analyze the access pattern of all the data references in the loop.
2865 FORNOW: the only access pattern that is considered vectorizable is a
2866 simple step 1 (consecutive) access.
2868 FORNOW: handle only arrays and pointer accesses. */
2871 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2874 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
2875 struct data_reference
*dr
;
2877 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2879 if (datarefs
.is_empty ())
2880 return opt_result::success ();
2882 /* Sort the array of datarefs to make building the interleaving chains
2883 linear. Don't modify the original vector's order, it is needed for
2884 determining what dependencies are reversed. */
2885 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2886 datarefs_copy
.qsort (dr_group_sort_cmp
);
2887 hash_set
<stmt_vec_info
> to_fixup
;
2889 /* Build the interleaving chains. */
2890 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2892 data_reference_p dra
= datarefs_copy
[i
];
2893 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
2894 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
2895 stmt_vec_info lastinfo
= NULL
;
2896 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
2897 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
2902 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2904 data_reference_p drb
= datarefs_copy
[i
];
2905 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
2906 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
2907 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
2908 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
2911 /* ??? Imperfect sorting (non-compatible types, non-modulo
2912 accesses, same accesses) can lead to a group to be artificially
2913 split here as we don't just skip over those. If it really
2914 matters we can push those to a worklist and re-iterate
2915 over them. The we can just skip ahead to the next DR here. */
2917 /* DRs in a different loop should not be put into the same
2918 interleaving group. */
2919 if (gimple_bb (DR_STMT (dra
))->loop_father
2920 != gimple_bb (DR_STMT (drb
))->loop_father
)
2923 /* Check that the data-refs have same first location (except init)
2924 and they are both either store or load (not load and store,
2925 not masked loads or stores). */
2926 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2927 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2928 DR_BASE_ADDRESS (drb
)) != 0
2929 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
2930 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
))
2933 /* Check that the data-refs have the same constant size. */
2934 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2935 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2936 if (!tree_fits_uhwi_p (sza
)
2937 || !tree_fits_uhwi_p (szb
)
2938 || !tree_int_cst_equal (sza
, szb
))
2941 /* Check that the data-refs have the same step. */
2942 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
2945 /* Check the types are compatible.
2946 ??? We don't distinguish this during sorting. */
2947 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2948 TREE_TYPE (DR_REF (drb
))))
2951 /* Check that the DR_INITs are compile-time constants. */
2952 if (TREE_CODE (DR_INIT (dra
)) != INTEGER_CST
2953 || TREE_CODE (DR_INIT (drb
)) != INTEGER_CST
)
2956 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2957 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2958 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2959 HOST_WIDE_INT init_prev
2960 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]));
2961 gcc_assert (init_a
<= init_b
2962 && init_a
<= init_prev
2963 && init_prev
<= init_b
);
2965 /* Do not place the same access in the interleaving chain twice. */
2966 if (init_b
== init_prev
)
2968 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]))
2969 < gimple_uid (DR_STMT (drb
)));
2970 /* Simply link in duplicates and fix up the chain below. */
2974 /* If init_b == init_a + the size of the type * k, we have an
2975 interleaving, and DRA is accessed before DRB. */
2976 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2977 if (type_size_a
== 0
2978 || (init_b
- init_a
) % type_size_a
!= 0)
2981 /* If we have a store, the accesses are adjacent. This splits
2982 groups into chunks we support (we don't support vectorization
2983 of stores with gaps). */
2984 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
2987 /* If the step (if not zero or non-constant) is greater than the
2988 difference between data-refs' inits this splits groups into
2990 if (tree_fits_shwi_p (DR_STEP (dra
)))
2992 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2993 if (step
!= 0 && step
<= (init_b
- init_a
))
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_NOTE
, vect_location
,
3001 ? "Detected interleaving load %T and %T\n"
3002 : "Detected interleaving store %T and %T\n",
3003 DR_REF (dra
), DR_REF (drb
));
3005 /* Link the found element into the group list. */
3006 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3008 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3009 lastinfo
= stmtinfo_a
;
3011 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3012 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3013 lastinfo
= stmtinfo_b
;
3015 if (init_b
== init_prev
3016 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3017 && dump_enabled_p ())
3018 dump_printf_loc (MSG_NOTE
, vect_location
,
3019 "Queuing group with duplicate access for fixup\n");
3023 /* Fixup groups with duplicate entries by splitting it. */
3026 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3027 if (!(it
!= to_fixup
.end ()))
3029 stmt_vec_info grp
= *it
;
3030 to_fixup
.remove (grp
);
3032 /* Find the earliest duplicate group member. */
3033 unsigned first_duplicate
= -1u;
3034 stmt_vec_info next
, g
= grp
;
3035 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3037 if ((DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
)
3038 == DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3039 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3040 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3043 if (first_duplicate
== -1U)
3046 /* Then move all stmts after the first duplicate to a new group.
3047 Note this is a heuristic but one with the property that *it
3048 is fixed up completely. */
3050 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3051 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3053 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3055 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3059 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3061 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3064 g
= DR_GROUP_NEXT_ELEMENT (g
);
3066 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3068 /* Fixup the new group which still may contain duplicates. */
3069 to_fixup
.add (newgroup
);
3072 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
3074 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
3075 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3076 && !vect_analyze_data_ref_access (dr_info
))
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3080 "not vectorized: complicated access pattern.\n");
3082 if (is_a
<bb_vec_info
> (vinfo
))
3084 /* Mark the statement as not vectorizable. */
3085 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3090 datarefs_copy
.release ();
3091 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3093 " complicated access pattern.\n");
3098 datarefs_copy
.release ();
3099 return opt_result::success ();
3102 /* Function vect_vfa_segment_size.
3105 DR_INFO: The data reference.
3106 LENGTH_FACTOR: segment length to consider.
3108 Return a value suitable for the dr_with_seg_len::seg_len field.
3109 This is the "distance travelled" by the pointer from the first
3110 iteration in the segment to the last. Note that it does not include
3111 the size of the access; in effect it only describes the first byte. */
3114 vect_vfa_segment_size (dr_vec_info
*dr_info
, tree length_factor
)
3116 length_factor
= size_binop (MINUS_EXPR
,
3117 fold_convert (sizetype
, length_factor
),
3119 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr_info
->dr
)),
3123 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3124 gives the worst-case number of bytes covered by the segment. */
3126 static unsigned HOST_WIDE_INT
3127 vect_vfa_access_size (dr_vec_info
*dr_info
)
3129 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3130 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3131 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3132 unsigned HOST_WIDE_INT access_size
= ref_size
;
3133 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3135 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3136 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3138 if (STMT_VINFO_VEC_STMT (stmt_vinfo
)
3139 && (vect_supportable_dr_alignment (dr_info
, false)
3140 == dr_explicit_realign_optimized
))
3142 /* We might access a full vector's worth. */
3143 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3144 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3149 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3153 vect_vfa_align (dr_vec_info
*dr_info
)
3155 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info
->dr
)));
3158 /* Function vect_no_alias_p.
3160 Given data references A and B with equal base and offset, see whether
3161 the alias relation can be decided at compilation time. Return 1 if
3162 it can and the references alias, 0 if it can and the references do
3163 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3164 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3165 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3168 vect_compile_time_alias (dr_vec_info
*a
, dr_vec_info
*b
,
3169 tree segment_length_a
, tree segment_length_b
,
3170 unsigned HOST_WIDE_INT access_size_a
,
3171 unsigned HOST_WIDE_INT access_size_b
)
3173 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
->dr
));
3174 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
->dr
));
3175 poly_uint64 const_length_a
;
3176 poly_uint64 const_length_b
;
3178 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3179 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3181 if (tree_int_cst_compare (DR_STEP (a
->dr
), size_zero_node
) < 0)
3183 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3184 offset_a
= (offset_a
+ access_size_a
) - const_length_a
;
3187 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3188 if (tree_int_cst_compare (DR_STEP (b
->dr
), size_zero_node
) < 0)
3190 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3191 offset_b
= (offset_b
+ access_size_b
) - const_length_b
;
3194 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3196 const_length_a
+= access_size_a
;
3197 const_length_b
+= access_size_b
;
3199 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3200 offset_b
, const_length_b
))
3203 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3204 offset_b
, const_length_b
))
3210 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3214 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3215 unsigned int loop_depth
, poly_uint64 vf
)
3217 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3218 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3221 /* If the dependence is exact, we should have limited the VF instead. */
3222 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3225 lambda_vector dist_v
;
3226 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3228 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3230 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3231 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3235 if (dump_enabled_p ())
3236 dump_printf_loc (MSG_NOTE
, vect_location
,
3237 "dependence distance between %T and %T is >= VF\n",
3238 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3243 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3246 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3248 dump_printf (dump_kind
, "%s (%T) >= ",
3249 lower_bound
.unsigned_p
? "unsigned" : "abs",
3251 dump_dec (dump_kind
, lower_bound
.min_value
);
3254 /* Record that the vectorized loop requires the vec_lower_bound described
3255 by EXPR, UNSIGNED_P and MIN_VALUE. */
3258 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3259 poly_uint64 min_value
)
3261 vec
<vec_lower_bound
> lower_bounds
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3262 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3263 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3265 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3266 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3267 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3268 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3270 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3271 lower_bounds
[i
].min_value
= min_value
;
3272 if (dump_enabled_p ())
3274 dump_printf_loc (MSG_NOTE
, vect_location
,
3275 "updating run-time check to ");
3276 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3277 dump_printf (MSG_NOTE
, "\n");
3283 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3284 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3287 dump_lower_bound (MSG_NOTE
, lower_bound
);
3288 dump_printf (MSG_NOTE
, "\n");
3290 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3293 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3294 will span fewer than GAP bytes. */
3297 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3300 stmt_vec_info stmt_info
= dr_info
->stmt
;
3302 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3303 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3304 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3305 return (estimated_poly_value (gap
)
3306 <= count
* vect_get_scalar_dr_size (dr_info
));
3309 /* Return true if we know that there is no alias between DR_INFO_A and
3310 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3311 When returning true, set *LOWER_BOUND_OUT to this N. */
3314 vectorizable_with_step_bound_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
,
3315 poly_uint64
*lower_bound_out
)
3317 /* Check that there is a constant gap of known sign between DR_A
3319 data_reference
*dr_a
= dr_info_a
->dr
;
3320 data_reference
*dr_b
= dr_info_b
->dr
;
3321 poly_int64 init_a
, init_b
;
3322 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3323 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3324 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3325 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3326 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3327 || !ordered_p (init_a
, init_b
))
3330 /* Sort DR_A and DR_B by the address they access. */
3331 if (maybe_lt (init_b
, init_a
))
3333 std::swap (init_a
, init_b
);
3334 std::swap (dr_info_a
, dr_info_b
);
3335 std::swap (dr_a
, dr_b
);
3338 /* If the two accesses could be dependent within a scalar iteration,
3339 make sure that we'd retain their order. */
3340 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_info_a
), init_b
)
3341 && !vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
3344 /* There is no alias if abs (DR_STEP) is greater than or equal to
3345 the bytes spanned by the combination of the two accesses. */
3346 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_info_b
) - init_a
;
3350 /* Function vect_prune_runtime_alias_test_list.
3352 Prune a list of ddrs to be tested at run-time by versioning for alias.
3353 Merge several alias checks into one if possible.
3354 Return FALSE if resulting list of ddrs is longer then allowed by
3355 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3358 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3360 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3361 hash_set
<tree_pair_hash
> compared_objects
;
3363 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3364 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3365 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3366 vec
<vec_object_pair
> &check_unequal_addrs
3367 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3368 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3369 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3375 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3377 /* Step values are irrelevant for aliasing if the number of vector
3378 iterations is equal to the number of scalar iterations (which can
3379 happen for fully-SLP loops). */
3380 bool ignore_step_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3384 /* Convert the checks for nonzero steps into bound tests. */
3386 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3387 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3390 if (may_alias_ddrs
.is_empty ())
3391 return opt_result::success ();
3393 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3395 unsigned int loop_depth
3396 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3397 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3399 /* First, we collect all data ref pairs for aliasing checks. */
3400 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3403 poly_uint64 lower_bound
;
3404 tree segment_length_a
, segment_length_b
;
3405 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3406 unsigned int align_a
, align_b
;
3408 /* Ignore the alias if the VF we chose ended up being no greater
3409 than the dependence distance. */
3410 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3413 if (DDR_OBJECT_A (ddr
))
3415 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3416 if (!compared_objects
.add (new_pair
))
3418 if (dump_enabled_p ())
3419 dump_printf_loc (MSG_NOTE
, vect_location
,
3420 "checking that %T and %T"
3421 " have different addresses\n",
3422 new_pair
.first
, new_pair
.second
);
3423 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3428 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3429 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3431 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3432 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3434 /* Skip the pair if inter-iteration dependencies are irrelevant
3435 and intra-iteration dependencies are guaranteed to be honored. */
3437 && (vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
)
3438 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3441 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_NOTE
, vect_location
,
3443 "no need for alias check between "
3444 "%T and %T when VF is 1\n",
3445 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3449 /* See whether we can handle the alias using a bounds check on
3450 the step, and whether that's likely to be the best approach.
3451 (It might not be, for example, if the minimum step is much larger
3452 than the number of bytes handled by one vector iteration.) */
3454 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3455 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3457 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3458 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3460 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3461 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3464 "%T and %T when the step %T is outside ",
3465 DR_REF (dr_info_a
->dr
),
3466 DR_REF (dr_info_b
->dr
),
3467 DR_STEP (dr_info_a
->dr
));
3469 dump_printf (MSG_NOTE
, "[0");
3472 dump_printf (MSG_NOTE
, "(");
3473 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3475 dump_printf (MSG_NOTE
, ", ");
3476 dump_dec (MSG_NOTE
, lower_bound
);
3477 dump_printf (MSG_NOTE
, ")\n");
3479 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
3480 unsigned_p
, lower_bound
);
3484 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
3485 if (dr_group_first_a
)
3487 stmt_info_a
= dr_group_first_a
;
3488 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
3491 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
3492 if (dr_group_first_b
)
3494 stmt_info_b
= dr_group_first_b
;
3495 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
3500 segment_length_a
= size_zero_node
;
3501 segment_length_b
= size_zero_node
;
3505 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
3506 DR_STEP (dr_info_b
->dr
), 0))
3507 length_factor
= scalar_loop_iters
;
3509 length_factor
= size_int (vect_factor
);
3510 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
3511 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
3513 access_size_a
= vect_vfa_access_size (dr_info_a
);
3514 access_size_b
= vect_vfa_access_size (dr_info_b
);
3515 align_a
= vect_vfa_align (dr_info_a
);
3516 align_b
= vect_vfa_align (dr_info_b
);
3518 comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a
->dr
),
3519 DR_BASE_ADDRESS (dr_info_b
->dr
));
3521 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_info_a
->dr
),
3522 DR_OFFSET (dr_info_b
->dr
));
3524 /* See whether the alias is known at compilation time. */
3526 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
3527 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
3528 && poly_int_tree_p (segment_length_a
)
3529 && poly_int_tree_p (segment_length_b
))
3531 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
3536 if (res
>= 0 && dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE
, vect_location
,
3539 "can tell at compile time that %T and %T",
3540 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3542 dump_printf (MSG_NOTE
, " do not alias\n");
3544 dump_printf (MSG_NOTE
, " alias\n");
3551 return opt_result::failure_at (stmt_info_b
->stmt
,
3553 " compilation time alias: %G%G",
3558 dr_with_seg_len_pair_t dr_with_seg_len_pair
3559 (dr_with_seg_len (dr_info_a
->dr
, segment_length_a
,
3560 access_size_a
, align_a
),
3561 dr_with_seg_len (dr_info_b
->dr
, segment_length_b
,
3562 access_size_b
, align_b
));
3564 /* Canonicalize pairs by sorting the two DR members. */
3566 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3568 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3571 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
3573 unsigned int count
= (comp_alias_ddrs
.length ()
3574 + check_unequal_addrs
.length ());
3576 if (dump_enabled_p ())
3577 dump_printf_loc (MSG_NOTE
, vect_location
,
3578 "improved number of alias checks from %d to %d\n",
3579 may_alias_ddrs
.length (), count
);
3580 if ((int) count
> PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3581 return opt_result::failure_at
3583 "number of versioning for alias "
3584 "run-time tests exceeds %d "
3585 "(--param vect-max-version-for-alias-checks)\n",
3586 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3588 return opt_result::success ();
3591 /* Check whether we can use an internal function for a gather load
3592 or scatter store. READ_P is true for loads and false for stores.
3593 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3594 the type of the memory elements being loaded or stored. OFFSET_BITS
3595 is the number of bits in each scalar offset and OFFSET_SIGN is the
3596 sign of the offset. SCALE is the amount by which the offset should
3597 be multiplied *after* it has been converted to address width.
3599 Return true if the function is supported, storing the function
3600 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3603 vect_gather_scatter_fn_p (bool read_p
, bool masked_p
, tree vectype
,
3604 tree memory_type
, unsigned int offset_bits
,
3605 signop offset_sign
, int scale
,
3606 internal_fn
*ifn_out
, tree
*element_type_out
)
3608 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
3609 unsigned int element_bits
= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype
)));
3610 if (offset_bits
> element_bits
)
3611 /* Internal functions require the offset to be the same width as
3612 the vector elements. We can extend narrower offsets, but it isn't
3613 safe to truncate wider offsets. */
3616 if (element_bits
!= memory_bits
)
3617 /* For now the vector elements must be the same width as the
3621 /* Work out which function we need. */
3624 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
3626 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
3628 /* Test whether the target supports this combination. */
3629 if (!internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
3630 offset_sign
, scale
))
3634 *element_type_out
= TREE_TYPE (vectype
);
3638 /* STMT_INFO is a call to an internal gather load or scatter store function.
3639 Describe the operation in INFO. */
3642 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
3643 gather_scatter_info
*info
)
3645 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
3646 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3647 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3649 info
->ifn
= gimple_call_internal_fn (call
);
3650 info
->decl
= NULL_TREE
;
3651 info
->base
= gimple_call_arg (call
, 0);
3652 info
->offset
= gimple_call_arg (call
, 1);
3653 info
->offset_dt
= vect_unknown_def_type
;
3654 info
->offset_vectype
= NULL_TREE
;
3655 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
3656 info
->element_type
= TREE_TYPE (vectype
);
3657 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
3660 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3661 gather load or scatter store. Describe the operation in *INFO if so. */
3664 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
3665 gather_scatter_info
*info
)
3667 HOST_WIDE_INT scale
= 1;
3668 poly_int64 pbitpos
, pbitsize
;
3669 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3670 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3671 tree offtype
= NULL_TREE
;
3672 tree decl
= NULL_TREE
, base
, off
;
3673 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3674 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3676 int punsignedp
, reversep
, pvolatilep
= 0;
3679 bool masked_p
= false;
3681 /* See whether this is already a call to a gather/scatter internal function.
3682 If not, see whether it's a masked load or store. */
3683 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
3684 if (call
&& gimple_call_internal_p (call
))
3686 ifn
= gimple_call_internal_fn (call
);
3687 if (internal_gather_scatter_fn_p (ifn
))
3689 vect_describe_gather_scatter_call (stmt_info
, info
);
3692 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
3695 /* True if we should aim to use internal functions rather than
3696 built-in functions. */
3697 bool use_ifn_p
= (DR_IS_READ (dr
)
3698 ? supports_vec_gather_load_p ()
3699 : supports_vec_scatter_store_p ());
3702 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3703 see if we can use the def stmt of the address. */
3705 && TREE_CODE (base
) == MEM_REF
3706 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3707 && integer_zerop (TREE_OPERAND (base
, 1))
3708 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3710 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3711 if (is_gimple_assign (def_stmt
)
3712 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3713 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3716 /* The gather and scatter builtins need address of the form
3717 loop_invariant + vector * {1, 2, 4, 8}
3719 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3720 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3721 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3722 multiplications and additions in it. To get a vector, we need
3723 a single SSA_NAME that will be defined in the loop and will
3724 contain everything that is not loop invariant and that can be
3725 vectorized. The following code attempts to find such a preexistng
3726 SSA_NAME OFF and put the loop invariants into a tree BASE
3727 that can be gimplified before the loop. */
3728 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3729 &punsignedp
, &reversep
, &pvolatilep
);
3733 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3735 if (TREE_CODE (base
) == MEM_REF
)
3737 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3739 if (off
== NULL_TREE
)
3740 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3742 off
= size_binop (PLUS_EXPR
, off
,
3743 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3745 base
= TREE_OPERAND (base
, 0);
3748 base
= build_fold_addr_expr (base
);
3750 if (off
== NULL_TREE
)
3751 off
= size_zero_node
;
3753 /* If base is not loop invariant, either off is 0, then we start with just
3754 the constant offset in the loop invariant BASE and continue with base
3755 as OFF, otherwise give up.
3756 We could handle that case by gimplifying the addition of base + off
3757 into some SSA_NAME and use that as off, but for now punt. */
3758 if (!expr_invariant_in_loop_p (loop
, base
))
3760 if (!integer_zerop (off
))
3763 base
= size_int (pbytepos
);
3765 /* Otherwise put base + constant offset into the loop invariant BASE
3766 and continue with OFF. */
3769 base
= fold_convert (sizetype
, base
);
3770 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3773 /* OFF at this point may be either a SSA_NAME or some tree expression
3774 from get_inner_reference. Try to peel off loop invariants from it
3775 into BASE as long as possible. */
3777 while (offtype
== NULL_TREE
)
3779 enum tree_code code
;
3780 tree op0
, op1
, add
= NULL_TREE
;
3782 if (TREE_CODE (off
) == SSA_NAME
)
3784 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3786 if (expr_invariant_in_loop_p (loop
, off
))
3789 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3792 op0
= gimple_assign_rhs1 (def_stmt
);
3793 code
= gimple_assign_rhs_code (def_stmt
);
3794 op1
= gimple_assign_rhs2 (def_stmt
);
3798 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3800 code
= TREE_CODE (off
);
3801 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3805 case POINTER_PLUS_EXPR
:
3807 if (expr_invariant_in_loop_p (loop
, op0
))
3812 add
= fold_convert (sizetype
, add
);
3814 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3815 base
= size_binop (PLUS_EXPR
, base
, add
);
3818 if (expr_invariant_in_loop_p (loop
, op1
))
3826 if (expr_invariant_in_loop_p (loop
, op1
))
3828 add
= fold_convert (sizetype
, op1
);
3829 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3835 if (scale
== 1 && tree_fits_shwi_p (op1
))
3837 int new_scale
= tree_to_shwi (op1
);
3838 /* Only treat this as a scaling operation if the target
3841 && !vect_gather_scatter_fn_p (DR_IS_READ (dr
), masked_p
,
3842 vectype
, memory_type
, 1,
3843 TYPE_SIGN (TREE_TYPE (op0
)),
3856 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3857 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3859 if (TYPE_PRECISION (TREE_TYPE (op0
))
3860 == TYPE_PRECISION (TREE_TYPE (off
)))
3866 /* The internal functions need the offset to be the same width
3867 as the elements of VECTYPE. Don't include operations that
3868 cast the offset from that width to a different width. */
3870 && (int_size_in_bytes (TREE_TYPE (vectype
))
3871 == int_size_in_bytes (TREE_TYPE (off
))))
3874 if (TYPE_PRECISION (TREE_TYPE (op0
))
3875 < TYPE_PRECISION (TREE_TYPE (off
)))
3878 offtype
= TREE_TYPE (off
);
3889 /* If at the end OFF still isn't a SSA_NAME or isn't
3890 defined in the loop, punt. */
3891 if (TREE_CODE (off
) != SSA_NAME
3892 || expr_invariant_in_loop_p (loop
, off
))
3895 if (offtype
== NULL_TREE
)
3896 offtype
= TREE_TYPE (off
);
3900 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr
), masked_p
, vectype
,
3901 memory_type
, TYPE_PRECISION (offtype
),
3902 TYPE_SIGN (offtype
), scale
, &ifn
,
3908 if (DR_IS_READ (dr
))
3910 if (targetm
.vectorize
.builtin_gather
)
3911 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
3915 if (targetm
.vectorize
.builtin_scatter
)
3916 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
3923 element_type
= TREE_TYPE (vectype
);
3930 info
->offset_dt
= vect_unknown_def_type
;
3931 info
->offset_vectype
= NULL_TREE
;
3932 info
->scale
= scale
;
3933 info
->element_type
= element_type
;
3934 info
->memory_type
= memory_type
;
3938 /* Find the data references in STMT, analyze them with respect to LOOP and
3939 append them to DATAREFS. Return false if datarefs in this stmt cannot
3943 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
3944 vec
<data_reference_p
> *datarefs
)
3946 /* We can ignore clobbers for dataref analysis - they are removed during
3947 loop vectorization and BB vectorization checks dependences with a
3949 if (gimple_clobber_p (stmt
))
3950 return opt_result::success ();
3952 if (gimple_has_volatile_ops (stmt
))
3953 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
3956 if (stmt_can_throw_internal (cfun
, stmt
))
3957 return opt_result::failure_at (stmt
,
3959 " statement can throw an exception: %G",
3962 auto_vec
<data_reference_p
, 2> refs
;
3963 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
3967 if (refs
.is_empty ())
3968 return opt_result::success ();
3970 if (refs
.length () > 1)
3971 return opt_result::failure_at (stmt
,
3973 " more than one data ref in stmt: %G", stmt
);
3975 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
3976 if (!gimple_call_internal_p (call
)
3977 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
3978 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
3979 return opt_result::failure_at (stmt
,
3980 "not vectorized: dr in a call %G", stmt
);
3982 data_reference_p dr
= refs
.pop ();
3983 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3984 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3985 return opt_result::failure_at (stmt
,
3987 " statement is bitfield access %G", stmt
);
3989 if (DR_BASE_ADDRESS (dr
)
3990 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3991 return opt_result::failure_at (stmt
,
3993 " base addr of dr is a constant\n");
3995 /* Check whether this may be a SIMD lane access and adjust the
3996 DR to make it easier for us to handle it. */
3999 && (!DR_BASE_ADDRESS (dr
)
4004 struct data_reference
*newdr
4005 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4006 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4007 if (DR_BASE_ADDRESS (newdr
)
4008 && DR_OFFSET (newdr
)
4011 && integer_zerop (DR_STEP (newdr
)))
4013 tree off
= DR_OFFSET (newdr
);
4015 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4016 && TREE_CODE (off
) == MULT_EXPR
4017 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4019 tree step
= TREE_OPERAND (off
, 1);
4020 off
= TREE_OPERAND (off
, 0);
4022 if (CONVERT_EXPR_P (off
)
4023 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4024 < TYPE_PRECISION (TREE_TYPE (off
))))
4025 off
= TREE_OPERAND (off
, 0);
4026 if (TREE_CODE (off
) == SSA_NAME
)
4028 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4029 tree reft
= TREE_TYPE (DR_REF (newdr
));
4030 if (is_gimple_call (def
)
4031 && gimple_call_internal_p (def
)
4032 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4034 tree arg
= gimple_call_arg (def
, 0);
4035 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4036 arg
= SSA_NAME_VAR (arg
);
4037 if (arg
== loop
->simduid
4039 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4041 DR_OFFSET (newdr
) = ssize_int (0);
4042 DR_STEP (newdr
) = step
;
4043 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4044 DR_STEP_ALIGNMENT (newdr
)
4045 = highest_pow2_factor (step
);
4046 /* Mark as simd-lane access. */
4047 newdr
->aux
= (void *)-1;
4049 datarefs
->safe_push (newdr
);
4050 return opt_result::success ();
4056 free_data_ref (newdr
);
4059 datarefs
->safe_push (dr
);
4060 return opt_result::success ();
4063 /* Function vect_analyze_data_refs.
4065 Find all the data references in the loop or basic block.
4067 The general structure of the analysis of data refs in the vectorizer is as
4069 1- vect_analyze_data_refs(loop/bb): call
4070 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4071 in the loop/bb and their dependences.
4072 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4073 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4074 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4079 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
)
4081 struct loop
*loop
= NULL
;
4083 struct data_reference
*dr
;
4086 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4088 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4089 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4091 /* Go through the data-refs, check that the analysis succeeded. Update
4092 pointer from stmt_vec_info struct to DR and vectype. */
4094 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4095 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4097 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4100 gcc_assert (DR_REF (dr
));
4101 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4102 gcc_assert (!stmt_info
->dr_aux
.dr
);
4103 stmt_info
->dr_aux
.dr
= dr
;
4104 stmt_info
->dr_aux
.stmt
= stmt_info
;
4106 /* Check that analysis of the data-ref succeeded. */
4107 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4112 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4113 && (targetm
.vectorize
.builtin_gather
!= NULL
4114 || supports_vec_gather_load_p ());
4117 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4118 && (targetm
.vectorize
.builtin_scatter
!= NULL
4119 || supports_vec_scatter_store_p ());
4121 /* If target supports vector gather loads or scatter stores,
4122 see if they can't be used. */
4123 if (is_a
<loop_vec_info
> (vinfo
)
4124 && !nested_in_vect_loop_p (loop
, stmt_info
))
4126 if (maybe_gather
|| maybe_scatter
)
4129 gatherscatter
= GATHER
;
4131 gatherscatter
= SCATTER
;
4135 if (gatherscatter
== SG_NONE
)
4137 if (dump_enabled_p ())
4138 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4139 "not vectorized: data ref analysis "
4140 "failed %G", stmt_info
->stmt
);
4141 if (is_a
<bb_vec_info
> (vinfo
))
4143 /* In BB vectorization the ref can still participate
4144 in dependence analysis, we just can't vectorize it. */
4145 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4148 return opt_result::failure_at (stmt_info
->stmt
,
4150 " data ref analysis failed: %G",
4155 /* See if this was detected as SIMD lane access. */
4156 if (dr
->aux
== (void *)-1)
4158 if (nested_in_vect_loop_p (loop
, stmt_info
))
4159 return opt_result::failure_at (stmt_info
->stmt
,
4161 " data ref analysis failed: %G",
4163 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
4166 tree base
= get_base_address (DR_REF (dr
));
4167 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4169 if (dump_enabled_p ())
4170 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4171 "not vectorized: base object not addressable "
4172 "for stmt: %G", stmt_info
->stmt
);
4173 if (is_a
<bb_vec_info
> (vinfo
))
4175 /* In BB vectorization the ref can still participate
4176 in dependence analysis, we just can't vectorize it. */
4177 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4180 return opt_result::failure_at (stmt_info
->stmt
,
4181 "not vectorized: base object not"
4182 " addressable for stmt: %G",
4186 if (is_a
<loop_vec_info
> (vinfo
)
4188 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4190 if (nested_in_vect_loop_p (loop
, stmt_info
))
4191 return opt_result::failure_at (stmt_info
->stmt
,
4193 "not suitable for strided load %G",
4195 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4198 /* Update DR field in stmt_vec_info struct. */
4200 /* If the dataref is in an inner-loop of the loop that is considered for
4201 vectorization, we also want to analyze the access relative to
4202 the outer-loop (DR contains information only relative to the
4203 inner-most enclosing loop). We do that by building a reference to the
4204 first location accessed by the inner-loop, and analyze it relative to
4206 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4208 /* Build a reference to the first location accessed by the
4209 inner loop: *(BASE + INIT + OFFSET). By construction,
4210 this address must be invariant in the inner loop, so we
4211 can consider it as being used in the outer loop. */
4212 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4213 tree offset
= unshare_expr (DR_OFFSET (dr
));
4214 tree init
= unshare_expr (DR_INIT (dr
));
4215 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4217 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4218 tree init_ref
= build_fold_indirect_ref (init_addr
);
4220 if (dump_enabled_p ())
4221 dump_printf_loc (MSG_NOTE
, vect_location
,
4222 "analyze in outer loop: %T\n", init_ref
);
4225 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4226 init_ref
, loop
, stmt_info
->stmt
);
4228 /* dr_analyze_innermost already explained the failure. */
4231 if (dump_enabled_p ())
4232 dump_printf_loc (MSG_NOTE
, vect_location
,
4233 "\touter base_address: %T\n"
4234 "\touter offset from base address: %T\n"
4235 "\touter constant offset from base address: %T\n"
4236 "\touter step: %T\n"
4237 "\touter base alignment: %d\n\n"
4238 "\touter base misalignment: %d\n"
4239 "\touter offset alignment: %d\n"
4240 "\touter step alignment: %d\n",
4241 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4242 STMT_VINFO_DR_OFFSET (stmt_info
),
4243 STMT_VINFO_DR_INIT (stmt_info
),
4244 STMT_VINFO_DR_STEP (stmt_info
),
4245 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4246 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4247 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4248 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4251 /* Set vectype for STMT. */
4252 scalar_type
= TREE_TYPE (DR_REF (dr
));
4253 STMT_VINFO_VECTYPE (stmt_info
)
4254 = get_vectype_for_scalar_type (scalar_type
);
4255 if (!STMT_VINFO_VECTYPE (stmt_info
))
4257 if (dump_enabled_p ())
4259 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4260 "not vectorized: no vectype for stmt: %G",
4262 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4263 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4265 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4268 if (is_a
<bb_vec_info
> (vinfo
))
4270 /* No vector type is fine, the ref can still participate
4271 in dependence analysis, we just can't vectorize it. */
4272 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4275 return opt_result::failure_at (stmt_info
->stmt
,
4277 " no vectype for stmt: %G"
4278 " scalar_type: %T\n",
4279 stmt_info
->stmt
, scalar_type
);
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_NOTE
, vect_location
,
4285 "got vectype for stmt: %G%T\n",
4286 stmt_info
->stmt
, STMT_VINFO_VECTYPE (stmt_info
));
4289 /* Adjust the minimal vectorization factor according to the
4291 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
4292 *min_vf
= upper_bound (*min_vf
, vf
);
4294 if (gatherscatter
!= SG_NONE
)
4296 gather_scatter_info gs_info
;
4297 if (!vect_check_gather_scatter (stmt_info
,
4298 as_a
<loop_vec_info
> (vinfo
),
4300 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
4301 return opt_result::failure_at
4303 (gatherscatter
== GATHER
) ?
4304 "not vectorized: not suitable for gather load %G" :
4305 "not vectorized: not suitable for scatter store %G",
4307 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4311 /* We used to stop processing and prune the list here. Verify we no
4313 gcc_assert (i
== datarefs
.length ());
4315 return opt_result::success ();
4319 /* Function vect_get_new_vect_var.
4321 Returns a name for a new variable. The current naming scheme appends the
4322 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4323 the name of vectorizer generated variables, and appends that to NAME if
4327 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4334 case vect_simple_var
:
4337 case vect_scalar_var
:
4343 case vect_pointer_var
:
4352 char* tmp
= concat (prefix
, "_", name
, NULL
);
4353 new_vect_var
= create_tmp_reg (type
, tmp
);
4357 new_vect_var
= create_tmp_reg (type
, prefix
);
4359 return new_vect_var
;
4362 /* Like vect_get_new_vect_var but return an SSA name. */
4365 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4372 case vect_simple_var
:
4375 case vect_scalar_var
:
4378 case vect_pointer_var
:
4387 char* tmp
= concat (prefix
, "_", name
, NULL
);
4388 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4392 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4394 return new_vect_var
;
4397 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. */
4400 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
4402 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
4403 int misalign
= DR_MISALIGNMENT (dr_info
);
4404 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
4405 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4407 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
),
4408 known_alignment (DR_TARGET_ALIGNMENT (dr_info
)),
4412 /* Function vect_create_addr_base_for_vector_ref.
4414 Create an expression that computes the address of the first memory location
4415 that will be accessed for a data reference.
4418 STMT_INFO: The statement containing the data reference.
4419 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4420 OFFSET: Optional. If supplied, it is be added to the initial address.
4421 LOOP: Specify relative to which loop-nest should the address be computed.
4422 For example, when the dataref is in an inner-loop nested in an
4423 outer-loop that is now being vectorized, LOOP can be either the
4424 outer-loop, or the inner-loop. The first memory location accessed
4425 by the following dataref ('in' points to short):
4432 if LOOP=i_loop: &in (relative to i_loop)
4433 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4434 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4435 initial address. Unlike OFFSET, which is number of elements to
4436 be added, BYTE_OFFSET is measured in bytes.
4439 1. Return an SSA_NAME whose value is the address of the memory location of
4440 the first vector of the data reference.
4441 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4442 these statement(s) which define the returned SSA_NAME.
4444 FORNOW: We are only handling array accesses with step 1. */
4447 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info
,
4448 gimple_seq
*new_stmt_list
,
4452 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4453 struct data_reference
*dr
= dr_info
->dr
;
4454 const char *base_name
;
4457 gimple_seq seq
= NULL
;
4459 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4460 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4461 innermost_loop_behavior
*drb
= vect_dr_behavior (dr_info
);
4463 tree data_ref_base
= unshare_expr (drb
->base_address
);
4464 tree base_offset
= unshare_expr (drb
->offset
);
4465 tree init
= unshare_expr (drb
->init
);
4468 base_name
= get_name (data_ref_base
);
4471 base_offset
= ssize_int (0);
4472 init
= ssize_int (0);
4473 base_name
= get_name (DR_REF (dr
));
4476 /* Create base_offset */
4477 base_offset
= size_binop (PLUS_EXPR
,
4478 fold_convert (sizetype
, base_offset
),
4479 fold_convert (sizetype
, init
));
4483 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4484 fold_convert (sizetype
, offset
), step
);
4485 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4486 base_offset
, offset
);
4490 byte_offset
= fold_convert (sizetype
, byte_offset
);
4491 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4492 base_offset
, byte_offset
);
4495 /* base + base_offset */
4497 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4500 addr_base
= build1 (ADDR_EXPR
,
4501 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4502 unshare_expr (DR_REF (dr
)));
4505 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4506 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4507 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4508 gimple_seq_add_seq (new_stmt_list
, seq
);
4510 if (DR_PTR_INFO (dr
)
4511 && TREE_CODE (addr_base
) == SSA_NAME
4512 && !SSA_NAME_PTR_INFO (addr_base
))
4514 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
4515 if (offset
|| byte_offset
)
4516 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4519 if (dump_enabled_p ())
4520 dump_printf_loc (MSG_NOTE
, vect_location
, "created %T\n", addr_base
);
4526 /* Function vect_create_data_ref_ptr.
4528 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4529 location accessed in the loop by STMT_INFO, along with the def-use update
4530 chain to appropriately advance the pointer through the loop iterations.
4531 Also set aliasing information for the pointer. This pointer is used by
4532 the callers to this function to create a memory reference expression for
4533 vector load/store access.
4536 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4537 GIMPLE_ASSIGN <name, data-ref> or
4538 GIMPLE_ASSIGN <data-ref, name>.
4539 2. AGGR_TYPE: the type of the reference, which should be either a vector
4541 3. AT_LOOP: the loop where the vector memref is to be created.
4542 4. OFFSET (optional): an offset to be added to the initial address accessed
4543 by the data-ref in STMT_INFO.
4544 5. BSI: location where the new stmts are to be placed if there is no loop
4545 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4546 pointing to the initial address.
4547 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4548 to the initial address accessed by the data-ref in STMT_INFO. This is
4549 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4551 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4552 to the IV during each iteration of the loop. NULL says to move
4553 by one copy of AGGR_TYPE up or down, depending on the step of the
4557 1. Declare a new ptr to vector_type, and have it point to the base of the
4558 data reference (initial addressed accessed by the data reference).
4559 For example, for vector of type V8HI, the following code is generated:
4562 ap = (v8hi *)initial_address;
4564 if OFFSET is not supplied:
4565 initial_address = &a[init];
4566 if OFFSET is supplied:
4567 initial_address = &a[init + OFFSET];
4568 if BYTE_OFFSET is supplied:
4569 initial_address = &a[init] + BYTE_OFFSET;
4571 Return the initial_address in INITIAL_ADDRESS.
4573 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4574 update the pointer in each iteration of the loop.
4576 Return the increment stmt that updates the pointer in PTR_INCR.
4578 3. Return the pointer. */
4581 vect_create_data_ref_ptr (stmt_vec_info stmt_info
, tree aggr_type
,
4582 struct loop
*at_loop
, tree offset
,
4583 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
4584 gimple
**ptr_incr
, bool only_init
,
4585 tree byte_offset
, tree iv_step
)
4587 const char *base_name
;
4588 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4589 struct loop
*loop
= NULL
;
4590 bool nested_in_vect_loop
= false;
4591 struct loop
*containing_loop
= NULL
;
4595 gimple_seq new_stmt_list
= NULL
;
4599 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4600 struct data_reference
*dr
= dr_info
->dr
;
4602 gimple_stmt_iterator incr_gsi
;
4604 tree indx_before_incr
, indx_after_incr
;
4606 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4608 gcc_assert (iv_step
!= NULL_TREE
4609 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4610 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4614 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4615 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
4616 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
4617 pe
= loop_preheader_edge (loop
);
4621 gcc_assert (bb_vinfo
);
4626 /* Create an expression for the first address accessed by this load
4628 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4630 if (dump_enabled_p ())
4632 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4633 dump_printf_loc (MSG_NOTE
, vect_location
,
4634 "create %s-pointer variable to type: %T",
4635 get_tree_code_name (TREE_CODE (aggr_type
)),
4637 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4638 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4639 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4640 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4641 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4642 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4644 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4645 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
4648 /* (1) Create the new aggregate-pointer variable.
4649 Vector and array types inherit the alias set of their component
4650 type by default so we need to use a ref-all pointer if the data
4651 reference does not conflict with the created aggregated data
4652 reference because it is not addressable. */
4653 bool need_ref_all
= false;
4654 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4655 get_alias_set (DR_REF (dr
))))
4656 need_ref_all
= true;
4657 /* Likewise for any of the data references in the stmt group. */
4658 else if (DR_GROUP_SIZE (stmt_info
) > 1)
4660 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4663 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4664 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4665 get_alias_set (DR_REF (sdr
))))
4667 need_ref_all
= true;
4670 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
4674 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4676 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4679 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4680 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4681 def-use update cycles for the pointer: one relative to the outer-loop
4682 (LOOP), which is what steps (3) and (4) below do. The other is relative
4683 to the inner-loop (which is the inner-most loop containing the dataref),
4684 and this is done be step (5) below.
4686 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4687 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4688 redundant. Steps (3),(4) create the following:
4691 LOOP: vp1 = phi(vp0,vp2)
4697 If there is an inner-loop nested in loop, then step (5) will also be
4698 applied, and an additional update in the inner-loop will be created:
4701 LOOP: vp1 = phi(vp0,vp2)
4703 inner: vp3 = phi(vp1,vp4)
4704 vp4 = vp3 + inner_step
4710 /* (2) Calculate the initial address of the aggregate-pointer, and set
4711 the aggregate-pointer to point to it before the loop. */
4713 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4715 new_temp
= vect_create_addr_base_for_vector_ref (stmt_info
, &new_stmt_list
,
4716 offset
, byte_offset
);
4721 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4722 gcc_assert (!new_bb
);
4725 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4728 *initial_address
= new_temp
;
4729 aggr_ptr_init
= new_temp
;
4731 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4732 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4733 inner-loop nested in LOOP (during outer-loop vectorization). */
4735 /* No update in loop is required. */
4736 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4737 aptr
= aggr_ptr_init
;
4740 /* Accesses to invariant addresses should be handled specially
4742 tree step
= vect_dr_behavior (dr_info
)->step
;
4743 gcc_assert (!integer_zerop (step
));
4745 if (iv_step
== NULL_TREE
)
4747 /* The step of the aggregate pointer is the type size,
4748 negated for downward accesses. */
4749 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4750 if (tree_int_cst_sgn (step
) == -1)
4751 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4754 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4756 create_iv (aggr_ptr_init
,
4757 fold_convert (aggr_ptr_type
, iv_step
),
4758 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4759 &indx_before_incr
, &indx_after_incr
);
4760 incr
= gsi_stmt (incr_gsi
);
4761 loop_vinfo
->add_stmt (incr
);
4763 /* Copy the points-to information if it exists. */
4764 if (DR_PTR_INFO (dr
))
4766 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4767 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
4772 aptr
= indx_before_incr
;
4775 if (!nested_in_vect_loop
|| only_init
)
4779 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4780 nested in LOOP, if exists. */
4782 gcc_assert (nested_in_vect_loop
);
4785 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4787 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4788 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4790 incr
= gsi_stmt (incr_gsi
);
4791 loop_vinfo
->add_stmt (incr
);
4793 /* Copy the points-to information if it exists. */
4794 if (DR_PTR_INFO (dr
))
4796 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4797 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
4802 return indx_before_incr
;
4809 /* Function bump_vector_ptr
4811 Increment a pointer (to a vector type) by vector-size. If requested,
4812 i.e. if PTR-INCR is given, then also connect the new increment stmt
4813 to the existing def-use update-chain of the pointer, by modifying
4814 the PTR_INCR as illustrated below:
4816 The pointer def-use update-chain before this function:
4817 DATAREF_PTR = phi (p_0, p_2)
4819 PTR_INCR: p_2 = DATAREF_PTR + step
4821 The pointer def-use update-chain after this function:
4822 DATAREF_PTR = phi (p_0, p_2)
4824 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4826 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4829 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4831 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4832 the loop. The increment amount across iterations is expected
4834 BSI - location where the new update stmt is to be placed.
4835 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4836 BUMP - optional. The offset by which to bump the pointer. If not given,
4837 the offset is assumed to be vector_size.
4839 Output: Return NEW_DATAREF_PTR as illustrated above.
4844 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4845 stmt_vec_info stmt_info
, tree bump
)
4847 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4848 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4849 tree update
= TYPE_SIZE_UNIT (vectype
);
4852 use_operand_p use_p
;
4853 tree new_dataref_ptr
;
4858 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4859 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4861 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4862 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4863 dataref_ptr
, update
);
4864 vect_finish_stmt_generation (stmt_info
, incr_stmt
, gsi
);
4866 /* Copy the points-to information if it exists. */
4867 if (DR_PTR_INFO (dr
))
4869 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4870 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4874 return new_dataref_ptr
;
4876 /* Update the vector-pointer's cross-iteration increment. */
4877 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4879 tree use
= USE_FROM_PTR (use_p
);
4881 if (use
== dataref_ptr
)
4882 SET_USE (use_p
, new_dataref_ptr
);
4884 gcc_assert (operand_equal_p (use
, update
, 0));
4887 return new_dataref_ptr
;
4891 /* Copy memory reference info such as base/clique from the SRC reference
4892 to the DEST MEM_REF. */
4895 vect_copy_ref_info (tree dest
, tree src
)
4897 if (TREE_CODE (dest
) != MEM_REF
)
4900 tree src_base
= src
;
4901 while (handled_component_p (src_base
))
4902 src_base
= TREE_OPERAND (src_base
, 0);
4903 if (TREE_CODE (src_base
) != MEM_REF
4904 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
4907 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
4908 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
4912 /* Function vect_create_destination_var.
4914 Create a new temporary of type VECTYPE. */
4917 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4923 enum vect_var_kind kind
;
4926 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4930 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4932 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4934 name
= get_name (scalar_dest
);
4936 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4938 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4939 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4945 /* Function vect_grouped_store_supported.
4947 Returns TRUE if interleave high and interleave low permutations
4948 are supported, and FALSE otherwise. */
4951 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4953 machine_mode mode
= TYPE_MODE (vectype
);
4955 /* vect_permute_store_chain requires the group size to be equal to 3 or
4956 be a power of two. */
4957 if (count
!= 3 && exact_log2 (count
) == -1)
4959 if (dump_enabled_p ())
4960 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4961 "the size of the group of accesses"
4962 " is not a power of 2 or not eqaul to 3\n");
4966 /* Check that the permutation is supported. */
4967 if (VECTOR_MODE_P (mode
))
4972 unsigned int j0
= 0, j1
= 0, j2
= 0;
4976 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
4978 if (dump_enabled_p ())
4979 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4980 "cannot handle groups of 3 stores for"
4981 " variable-length vectors\n");
4985 vec_perm_builder
sel (nelt
, nelt
, 1);
4986 sel
.quick_grow (nelt
);
4987 vec_perm_indices indices
;
4988 for (j
= 0; j
< 3; j
++)
4990 int nelt0
= ((3 - j
) * nelt
) % 3;
4991 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4992 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4993 for (i
= 0; i
< nelt
; i
++)
4995 if (3 * i
+ nelt0
< nelt
)
4996 sel
[3 * i
+ nelt0
] = j0
++;
4997 if (3 * i
+ nelt1
< nelt
)
4998 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4999 if (3 * i
+ nelt2
< nelt
)
5000 sel
[3 * i
+ nelt2
] = 0;
5002 indices
.new_vector (sel
, 2, nelt
);
5003 if (!can_vec_perm_const_p (mode
, indices
))
5005 if (dump_enabled_p ())
5006 dump_printf (MSG_MISSED_OPTIMIZATION
,
5007 "permutation op not supported by target.\n");
5011 for (i
= 0; i
< nelt
; i
++)
5013 if (3 * i
+ nelt0
< nelt
)
5014 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5015 if (3 * i
+ nelt1
< nelt
)
5016 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5017 if (3 * i
+ nelt2
< nelt
)
5018 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5020 indices
.new_vector (sel
, 2, nelt
);
5021 if (!can_vec_perm_const_p (mode
, indices
))
5023 if (dump_enabled_p ())
5024 dump_printf (MSG_MISSED_OPTIMIZATION
,
5025 "permutation op not supported by target.\n");
5033 /* If length is not equal to 3 then only power of 2 is supported. */
5034 gcc_assert (pow2p_hwi (count
));
5035 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5037 /* The encoding has 2 interleaved stepped patterns. */
5038 vec_perm_builder
sel (nelt
, 2, 3);
5040 for (i
= 0; i
< 3; i
++)
5043 sel
[i
* 2 + 1] = i
+ nelt
;
5045 vec_perm_indices
indices (sel
, 2, nelt
);
5046 if (can_vec_perm_const_p (mode
, indices
))
5048 for (i
= 0; i
< 6; i
++)
5049 sel
[i
] += exact_div (nelt
, 2);
5050 indices
.new_vector (sel
, 2, nelt
);
5051 if (can_vec_perm_const_p (mode
, indices
))
5057 if (dump_enabled_p ())
5058 dump_printf (MSG_MISSED_OPTIMIZATION
,
5059 "permutation op not supported by target.\n");
5064 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5065 type VECTYPE. MASKED_P says whether the masked form is needed. */
5068 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5072 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5073 vec_mask_store_lanes_optab
,
5076 return vect_lanes_optab_supported_p ("vec_store_lanes",
5077 vec_store_lanes_optab
,
5082 /* Function vect_permute_store_chain.
5084 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5085 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5086 the data correctly for the stores. Return the final references for stores
5089 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5090 The input is 4 vectors each containing 8 elements. We assign a number to
5091 each element, the input sequence is:
5093 1st vec: 0 1 2 3 4 5 6 7
5094 2nd vec: 8 9 10 11 12 13 14 15
5095 3rd vec: 16 17 18 19 20 21 22 23
5096 4th vec: 24 25 26 27 28 29 30 31
5098 The output sequence should be:
5100 1st vec: 0 8 16 24 1 9 17 25
5101 2nd vec: 2 10 18 26 3 11 19 27
5102 3rd vec: 4 12 20 28 5 13 21 30
5103 4th vec: 6 14 22 30 7 15 23 31
5105 i.e., we interleave the contents of the four vectors in their order.
5107 We use interleave_high/low instructions to create such output. The input of
5108 each interleave_high/low operation is two vectors:
5111 the even elements of the result vector are obtained left-to-right from the
5112 high/low elements of the first vector. The odd elements of the result are
5113 obtained left-to-right from the high/low elements of the second vector.
5114 The output of interleave_high will be: 0 4 1 5
5115 and of interleave_low: 2 6 3 7
5118 The permutation is done in log LENGTH stages. In each stage interleave_high
5119 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5120 where the first argument is taken from the first half of DR_CHAIN and the
5121 second argument from it's second half.
5124 I1: interleave_high (1st vec, 3rd vec)
5125 I2: interleave_low (1st vec, 3rd vec)
5126 I3: interleave_high (2nd vec, 4th vec)
5127 I4: interleave_low (2nd vec, 4th vec)
5129 The output for the first stage is:
5131 I1: 0 16 1 17 2 18 3 19
5132 I2: 4 20 5 21 6 22 7 23
5133 I3: 8 24 9 25 10 26 11 27
5134 I4: 12 28 13 29 14 30 15 31
5136 The output of the second stage, i.e. the final result is:
5138 I1: 0 8 16 24 1 9 17 25
5139 I2: 2 10 18 26 3 11 19 27
5140 I3: 4 12 20 28 5 13 21 30
5141 I4: 6 14 22 30 7 15 23 31. */
5144 vect_permute_store_chain (vec
<tree
> dr_chain
,
5145 unsigned int length
,
5146 stmt_vec_info stmt_info
,
5147 gimple_stmt_iterator
*gsi
,
5148 vec
<tree
> *result_chain
)
5150 tree vect1
, vect2
, high
, low
;
5152 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5153 tree perm_mask_low
, perm_mask_high
;
5155 tree perm3_mask_low
, perm3_mask_high
;
5156 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5158 result_chain
->quick_grow (length
);
5159 memcpy (result_chain
->address (), dr_chain
.address (),
5160 length
* sizeof (tree
));
5164 /* vect_grouped_store_supported ensures that this is constant. */
5165 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5166 unsigned int j0
= 0, j1
= 0, j2
= 0;
5168 vec_perm_builder
sel (nelt
, nelt
, 1);
5169 sel
.quick_grow (nelt
);
5170 vec_perm_indices indices
;
5171 for (j
= 0; j
< 3; j
++)
5173 int nelt0
= ((3 - j
) * nelt
) % 3;
5174 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5175 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5177 for (i
= 0; i
< nelt
; i
++)
5179 if (3 * i
+ nelt0
< nelt
)
5180 sel
[3 * i
+ nelt0
] = j0
++;
5181 if (3 * i
+ nelt1
< nelt
)
5182 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5183 if (3 * i
+ nelt2
< nelt
)
5184 sel
[3 * i
+ nelt2
] = 0;
5186 indices
.new_vector (sel
, 2, nelt
);
5187 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5189 for (i
= 0; i
< nelt
; i
++)
5191 if (3 * i
+ nelt0
< nelt
)
5192 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5193 if (3 * i
+ nelt1
< nelt
)
5194 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5195 if (3 * i
+ nelt2
< nelt
)
5196 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5198 indices
.new_vector (sel
, 2, nelt
);
5199 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5201 vect1
= dr_chain
[0];
5202 vect2
= dr_chain
[1];
5204 /* Create interleaving stmt:
5205 low = VEC_PERM_EXPR <vect1, vect2,
5206 {j, nelt, *, j + 1, nelt + j + 1, *,
5207 j + 2, nelt + j + 2, *, ...}> */
5208 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5209 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5210 vect2
, perm3_mask_low
);
5211 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5214 vect2
= dr_chain
[2];
5215 /* Create interleaving stmt:
5216 low = VEC_PERM_EXPR <vect1, vect2,
5217 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5218 6, 7, nelt + j + 2, ...}> */
5219 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5220 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5221 vect2
, perm3_mask_high
);
5222 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5223 (*result_chain
)[j
] = data_ref
;
5228 /* If length is not equal to 3 then only power of 2 is supported. */
5229 gcc_assert (pow2p_hwi (length
));
5231 /* The encoding has 2 interleaved stepped patterns. */
5232 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5233 vec_perm_builder
sel (nelt
, 2, 3);
5235 for (i
= 0; i
< 3; i
++)
5238 sel
[i
* 2 + 1] = i
+ nelt
;
5240 vec_perm_indices
indices (sel
, 2, nelt
);
5241 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5243 for (i
= 0; i
< 6; i
++)
5244 sel
[i
] += exact_div (nelt
, 2);
5245 indices
.new_vector (sel
, 2, nelt
);
5246 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5248 for (i
= 0, n
= log_length
; i
< n
; i
++)
5250 for (j
= 0; j
< length
/2; j
++)
5252 vect1
= dr_chain
[j
];
5253 vect2
= dr_chain
[j
+length
/2];
5255 /* Create interleaving stmt:
5256 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5258 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5259 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5260 vect2
, perm_mask_high
);
5261 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5262 (*result_chain
)[2*j
] = high
;
5264 /* Create interleaving stmt:
5265 low = VEC_PERM_EXPR <vect1, vect2,
5266 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5268 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5269 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5270 vect2
, perm_mask_low
);
5271 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5272 (*result_chain
)[2*j
+1] = low
;
5274 memcpy (dr_chain
.address (), result_chain
->address (),
5275 length
* sizeof (tree
));
5280 /* Function vect_setup_realignment
5282 This function is called when vectorizing an unaligned load using
5283 the dr_explicit_realign[_optimized] scheme.
5284 This function generates the following code at the loop prolog:
5287 x msq_init = *(floor(p)); # prolog load
5288 realignment_token = call target_builtin;
5290 x msq = phi (msq_init, ---)
5292 The stmts marked with x are generated only for the case of
5293 dr_explicit_realign_optimized.
5295 The code above sets up a new (vector) pointer, pointing to the first
5296 location accessed by STMT_INFO, and a "floor-aligned" load using that
5297 pointer. It also generates code to compute the "realignment-token"
5298 (if the relevant target hook was defined), and creates a phi-node at the
5299 loop-header bb whose arguments are the result of the prolog-load (created
5300 by this function) and the result of a load that takes place in the loop
5301 (to be created by the caller to this function).
5303 For the case of dr_explicit_realign_optimized:
5304 The caller to this function uses the phi-result (msq) to create the
5305 realignment code inside the loop, and sets up the missing phi argument,
5308 msq = phi (msq_init, lsq)
5309 lsq = *(floor(p')); # load in loop
5310 result = realign_load (msq, lsq, realignment_token);
5312 For the case of dr_explicit_realign:
5314 msq = *(floor(p)); # load in loop
5316 lsq = *(floor(p')); # load in loop
5317 result = realign_load (msq, lsq, realignment_token);
5320 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5321 a memory location that may be unaligned.
5322 BSI - place where new code is to be inserted.
5323 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5327 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5328 target hook, if defined.
5329 Return value - the result of the loop-header phi node. */
5332 vect_setup_realignment (stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
5333 tree
*realignment_token
,
5334 enum dr_alignment_support alignment_support_scheme
,
5336 struct loop
**at_loop
)
5338 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5339 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5340 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5341 struct data_reference
*dr
= dr_info
->dr
;
5342 struct loop
*loop
= NULL
;
5344 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
5350 tree msq_init
= NULL_TREE
;
5353 tree msq
= NULL_TREE
;
5354 gimple_seq stmts
= NULL
;
5355 bool compute_in_loop
= false;
5356 bool nested_in_vect_loop
= false;
5357 struct loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5358 struct loop
*loop_for_initial_load
= NULL
;
5362 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5363 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5366 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5367 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5369 /* We need to generate three things:
5370 1. the misalignment computation
5371 2. the extra vector load (for the optimized realignment scheme).
5372 3. the phi node for the two vectors from which the realignment is
5373 done (for the optimized realignment scheme). */
5375 /* 1. Determine where to generate the misalignment computation.
5377 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5378 calculation will be generated by this function, outside the loop (in the
5379 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5380 caller, inside the loop.
5382 Background: If the misalignment remains fixed throughout the iterations of
5383 the loop, then both realignment schemes are applicable, and also the
5384 misalignment computation can be done outside LOOP. This is because we are
5385 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5386 are a multiple of VS (the Vector Size), and therefore the misalignment in
5387 different vectorized LOOP iterations is always the same.
5388 The problem arises only if the memory access is in an inner-loop nested
5389 inside LOOP, which is now being vectorized using outer-loop vectorization.
5390 This is the only case when the misalignment of the memory access may not
5391 remain fixed throughout the iterations of the inner-loop (as explained in
5392 detail in vect_supportable_dr_alignment). In this case, not only is the
5393 optimized realignment scheme not applicable, but also the misalignment
5394 computation (and generation of the realignment token that is passed to
5395 REALIGN_LOAD) have to be done inside the loop.
5397 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5398 or not, which in turn determines if the misalignment is computed inside
5399 the inner-loop, or outside LOOP. */
5401 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5403 compute_in_loop
= true;
5404 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5408 /* 2. Determine where to generate the extra vector load.
5410 For the optimized realignment scheme, instead of generating two vector
5411 loads in each iteration, we generate a single extra vector load in the
5412 preheader of the loop, and in each iteration reuse the result of the
5413 vector load from the previous iteration. In case the memory access is in
5414 an inner-loop nested inside LOOP, which is now being vectorized using
5415 outer-loop vectorization, we need to determine whether this initial vector
5416 load should be generated at the preheader of the inner-loop, or can be
5417 generated at the preheader of LOOP. If the memory access has no evolution
5418 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5419 to be generated inside LOOP (in the preheader of the inner-loop). */
5421 if (nested_in_vect_loop
)
5423 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5424 bool invariant_in_outerloop
=
5425 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5426 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5429 loop_for_initial_load
= loop
;
5431 *at_loop
= loop_for_initial_load
;
5433 if (loop_for_initial_load
)
5434 pe
= loop_preheader_edge (loop_for_initial_load
);
5436 /* 3. For the case of the optimized realignment, create the first vector
5437 load at the loop preheader. */
5439 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5441 /* Create msq_init = *(floor(p1)) in the loop preheader */
5444 gcc_assert (!compute_in_loop
);
5445 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5446 ptr
= vect_create_data_ref_ptr (stmt_info
, vectype
,
5447 loop_for_initial_load
, NULL_TREE
,
5448 &init_addr
, NULL
, &inc
, true);
5449 if (TREE_CODE (ptr
) == SSA_NAME
)
5450 new_temp
= copy_ssa_name (ptr
);
5452 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5453 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
5454 tree type
= TREE_TYPE (ptr
);
5455 new_stmt
= gimple_build_assign
5456 (new_temp
, BIT_AND_EXPR
, ptr
,
5457 fold_build2 (MINUS_EXPR
, type
,
5458 build_int_cst (type
, 0),
5459 build_int_cst (type
, align
)));
5460 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5461 gcc_assert (!new_bb
);
5463 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5464 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5465 vect_copy_ref_info (data_ref
, DR_REF (dr
));
5466 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5467 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5468 gimple_assign_set_lhs (new_stmt
, new_temp
);
5471 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5472 gcc_assert (!new_bb
);
5475 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5477 msq_init
= gimple_assign_lhs (new_stmt
);
5480 /* 4. Create realignment token using a target builtin, if available.
5481 It is done either inside the containing loop, or before LOOP (as
5482 determined above). */
5484 if (targetm
.vectorize
.builtin_mask_for_load
)
5489 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5492 /* Generate the INIT_ADDR computation outside LOOP. */
5493 init_addr
= vect_create_addr_base_for_vector_ref (stmt_info
, &stmts
,
5497 pe
= loop_preheader_edge (loop
);
5498 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5499 gcc_assert (!new_bb
);
5502 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5505 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5506 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5508 vect_create_destination_var (scalar_dest
,
5509 gimple_call_return_type (new_stmt
));
5510 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5511 gimple_call_set_lhs (new_stmt
, new_temp
);
5513 if (compute_in_loop
)
5514 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5517 /* Generate the misalignment computation outside LOOP. */
5518 pe
= loop_preheader_edge (loop
);
5519 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5520 gcc_assert (!new_bb
);
5523 *realignment_token
= gimple_call_lhs (new_stmt
);
5525 /* The result of the CALL_EXPR to this builtin is determined from
5526 the value of the parameter and no global variables are touched
5527 which makes the builtin a "const" function. Requiring the
5528 builtin to have the "const" attribute makes it unnecessary
5529 to call mark_call_clobbered. */
5530 gcc_assert (TREE_READONLY (builtin_decl
));
5533 if (alignment_support_scheme
== dr_explicit_realign
)
5536 gcc_assert (!compute_in_loop
);
5537 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5540 /* 5. Create msq = phi <msq_init, lsq> in loop */
5542 pe
= loop_preheader_edge (containing_loop
);
5543 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5544 msq
= make_ssa_name (vec_dest
);
5545 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5546 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5552 /* Function vect_grouped_load_supported.
5554 COUNT is the size of the load group (the number of statements plus the
5555 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5556 only one statement, with a gap of COUNT - 1.
5558 Returns true if a suitable permute exists. */
5561 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5562 unsigned HOST_WIDE_INT count
)
5564 machine_mode mode
= TYPE_MODE (vectype
);
5566 /* If this is single-element interleaving with an element distance
5567 that leaves unused vector loads around punt - we at least create
5568 very sub-optimal code in that case (and blow up memory,
5570 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
5572 if (dump_enabled_p ())
5573 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5574 "single-element interleaving not supported "
5575 "for not adjacent vector loads\n");
5579 /* vect_permute_load_chain requires the group size to be equal to 3 or
5580 be a power of two. */
5581 if (count
!= 3 && exact_log2 (count
) == -1)
5583 if (dump_enabled_p ())
5584 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5585 "the size of the group of accesses"
5586 " is not a power of 2 or not equal to 3\n");
5590 /* Check that the permutation is supported. */
5591 if (VECTOR_MODE_P (mode
))
5597 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5599 if (dump_enabled_p ())
5600 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5601 "cannot handle groups of 3 loads for"
5602 " variable-length vectors\n");
5606 vec_perm_builder
sel (nelt
, nelt
, 1);
5607 sel
.quick_grow (nelt
);
5608 vec_perm_indices indices
;
5610 for (k
= 0; k
< 3; k
++)
5612 for (i
= 0; i
< nelt
; i
++)
5613 if (3 * i
+ k
< 2 * nelt
)
5617 indices
.new_vector (sel
, 2, nelt
);
5618 if (!can_vec_perm_const_p (mode
, indices
))
5620 if (dump_enabled_p ())
5621 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5622 "shuffle of 3 loads is not supported by"
5626 for (i
= 0, j
= 0; i
< nelt
; i
++)
5627 if (3 * i
+ k
< 2 * nelt
)
5630 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5631 indices
.new_vector (sel
, 2, nelt
);
5632 if (!can_vec_perm_const_p (mode
, indices
))
5634 if (dump_enabled_p ())
5635 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5636 "shuffle of 3 loads is not supported by"
5645 /* If length is not equal to 3 then only power of 2 is supported. */
5646 gcc_assert (pow2p_hwi (count
));
5647 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5649 /* The encoding has a single stepped pattern. */
5650 vec_perm_builder
sel (nelt
, 1, 3);
5652 for (i
= 0; i
< 3; i
++)
5654 vec_perm_indices
indices (sel
, 2, nelt
);
5655 if (can_vec_perm_const_p (mode
, indices
))
5657 for (i
= 0; i
< 3; i
++)
5659 indices
.new_vector (sel
, 2, nelt
);
5660 if (can_vec_perm_const_p (mode
, indices
))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5668 "extract even/odd not supported by target\n");
5672 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5673 type VECTYPE. MASKED_P says whether the masked form is needed. */
5676 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5680 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5681 vec_mask_load_lanes_optab
,
5684 return vect_lanes_optab_supported_p ("vec_load_lanes",
5685 vec_load_lanes_optab
,
5689 /* Function vect_permute_load_chain.
5691 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5692 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5693 the input data correctly. Return the final references for loads in
5696 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5697 The input is 4 vectors each containing 8 elements. We assign a number to each
5698 element, the input sequence is:
5700 1st vec: 0 1 2 3 4 5 6 7
5701 2nd vec: 8 9 10 11 12 13 14 15
5702 3rd vec: 16 17 18 19 20 21 22 23
5703 4th vec: 24 25 26 27 28 29 30 31
5705 The output sequence should be:
5707 1st vec: 0 4 8 12 16 20 24 28
5708 2nd vec: 1 5 9 13 17 21 25 29
5709 3rd vec: 2 6 10 14 18 22 26 30
5710 4th vec: 3 7 11 15 19 23 27 31
5712 i.e., the first output vector should contain the first elements of each
5713 interleaving group, etc.
5715 We use extract_even/odd instructions to create such output. The input of
5716 each extract_even/odd operation is two vectors
5720 and the output is the vector of extracted even/odd elements. The output of
5721 extract_even will be: 0 2 4 6
5722 and of extract_odd: 1 3 5 7
5725 The permutation is done in log LENGTH stages. In each stage extract_even
5726 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5727 their order. In our example,
5729 E1: extract_even (1st vec, 2nd vec)
5730 E2: extract_odd (1st vec, 2nd vec)
5731 E3: extract_even (3rd vec, 4th vec)
5732 E4: extract_odd (3rd vec, 4th vec)
5734 The output for the first stage will be:
5736 E1: 0 2 4 6 8 10 12 14
5737 E2: 1 3 5 7 9 11 13 15
5738 E3: 16 18 20 22 24 26 28 30
5739 E4: 17 19 21 23 25 27 29 31
5741 In order to proceed and create the correct sequence for the next stage (or
5742 for the correct output, if the second stage is the last one, as in our
5743 example), we first put the output of extract_even operation and then the
5744 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5745 The input for the second stage is:
5747 1st vec (E1): 0 2 4 6 8 10 12 14
5748 2nd vec (E3): 16 18 20 22 24 26 28 30
5749 3rd vec (E2): 1 3 5 7 9 11 13 15
5750 4th vec (E4): 17 19 21 23 25 27 29 31
5752 The output of the second stage:
5754 E1: 0 4 8 12 16 20 24 28
5755 E2: 2 6 10 14 18 22 26 30
5756 E3: 1 5 9 13 17 21 25 29
5757 E4: 3 7 11 15 19 23 27 31
5759 And RESULT_CHAIN after reordering:
5761 1st vec (E1): 0 4 8 12 16 20 24 28
5762 2nd vec (E3): 1 5 9 13 17 21 25 29
5763 3rd vec (E2): 2 6 10 14 18 22 26 30
5764 4th vec (E4): 3 7 11 15 19 23 27 31. */
5767 vect_permute_load_chain (vec
<tree
> dr_chain
,
5768 unsigned int length
,
5769 stmt_vec_info stmt_info
,
5770 gimple_stmt_iterator
*gsi
,
5771 vec
<tree
> *result_chain
)
5773 tree data_ref
, first_vect
, second_vect
;
5774 tree perm_mask_even
, perm_mask_odd
;
5775 tree perm3_mask_low
, perm3_mask_high
;
5777 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5778 unsigned int i
, j
, log_length
= exact_log2 (length
);
5780 result_chain
->quick_grow (length
);
5781 memcpy (result_chain
->address (), dr_chain
.address (),
5782 length
* sizeof (tree
));
5786 /* vect_grouped_load_supported ensures that this is constant. */
5787 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5790 vec_perm_builder
sel (nelt
, nelt
, 1);
5791 sel
.quick_grow (nelt
);
5792 vec_perm_indices indices
;
5793 for (k
= 0; k
< 3; k
++)
5795 for (i
= 0; i
< nelt
; i
++)
5796 if (3 * i
+ k
< 2 * nelt
)
5800 indices
.new_vector (sel
, 2, nelt
);
5801 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5803 for (i
= 0, j
= 0; i
< nelt
; i
++)
5804 if (3 * i
+ k
< 2 * nelt
)
5807 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5808 indices
.new_vector (sel
, 2, nelt
);
5809 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5811 first_vect
= dr_chain
[0];
5812 second_vect
= dr_chain
[1];
5814 /* Create interleaving stmt (low part of):
5815 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5817 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5818 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5819 second_vect
, perm3_mask_low
);
5820 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5822 /* Create interleaving stmt (high part of):
5823 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5825 first_vect
= data_ref
;
5826 second_vect
= dr_chain
[2];
5827 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5828 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5829 second_vect
, perm3_mask_high
);
5830 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5831 (*result_chain
)[k
] = data_ref
;
5836 /* If length is not equal to 3 then only power of 2 is supported. */
5837 gcc_assert (pow2p_hwi (length
));
5839 /* The encoding has a single stepped pattern. */
5840 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5841 vec_perm_builder
sel (nelt
, 1, 3);
5843 for (i
= 0; i
< 3; ++i
)
5845 vec_perm_indices
indices (sel
, 2, nelt
);
5846 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
5848 for (i
= 0; i
< 3; ++i
)
5850 indices
.new_vector (sel
, 2, nelt
);
5851 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
5853 for (i
= 0; i
< log_length
; i
++)
5855 for (j
= 0; j
< length
; j
+= 2)
5857 first_vect
= dr_chain
[j
];
5858 second_vect
= dr_chain
[j
+1];
5860 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5861 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5862 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5863 first_vect
, second_vect
,
5865 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5866 (*result_chain
)[j
/2] = data_ref
;
5868 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5869 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5870 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5871 first_vect
, second_vect
,
5873 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
5874 (*result_chain
)[j
/2+length
/2] = data_ref
;
5876 memcpy (dr_chain
.address (), result_chain
->address (),
5877 length
* sizeof (tree
));
5882 /* Function vect_shift_permute_load_chain.
5884 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5885 sequence of stmts to reorder the input data accordingly.
5886 Return the final references for loads in RESULT_CHAIN.
5887 Return true if successed, false otherwise.
5889 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5890 The input is 3 vectors each containing 8 elements. We assign a
5891 number to each element, the input sequence is:
5893 1st vec: 0 1 2 3 4 5 6 7
5894 2nd vec: 8 9 10 11 12 13 14 15
5895 3rd vec: 16 17 18 19 20 21 22 23
5897 The output sequence should be:
5899 1st vec: 0 3 6 9 12 15 18 21
5900 2nd vec: 1 4 7 10 13 16 19 22
5901 3rd vec: 2 5 8 11 14 17 20 23
5903 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5905 First we shuffle all 3 vectors to get correct elements order:
5907 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5908 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5909 3rd vec: (16 19 22) (17 20 23) (18 21)
5911 Next we unite and shift vector 3 times:
5914 shift right by 6 the concatenation of:
5915 "1st vec" and "2nd vec"
5916 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5917 "2nd vec" and "3rd vec"
5918 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5919 "3rd vec" and "1st vec"
5920 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5923 So that now new vectors are:
5925 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5926 2nd vec: (10 13) (16 19 22) (17 20 23)
5927 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5930 shift right by 5 the concatenation of:
5931 "1st vec" and "3rd vec"
5932 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5933 "2nd vec" and "1st vec"
5934 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5935 "3rd vec" and "2nd vec"
5936 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5939 So that now new vectors are:
5941 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5942 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5943 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5946 shift right by 5 the concatenation of:
5947 "1st vec" and "1st vec"
5948 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5949 shift right by 3 the concatenation of:
5950 "2nd vec" and "2nd vec"
5951 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5954 So that now all vectors are READY:
5955 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5956 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5957 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5959 This algorithm is faster than one in vect_permute_load_chain if:
5960 1. "shift of a concatination" is faster than general permutation.
5962 2. The TARGET machine can't execute vector instructions in parallel.
5963 This is because each step of the algorithm depends on previous.
5964 The algorithm in vect_permute_load_chain is much more parallel.
5966 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5970 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5971 unsigned int length
,
5972 stmt_vec_info stmt_info
,
5973 gimple_stmt_iterator
*gsi
,
5974 vec
<tree
> *result_chain
)
5976 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5977 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5978 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5981 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5983 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5985 unsigned HOST_WIDE_INT nelt
, vf
;
5986 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
5987 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
5988 /* Not supported for variable-length vectors. */
5991 vec_perm_builder
sel (nelt
, nelt
, 1);
5992 sel
.quick_grow (nelt
);
5994 result_chain
->quick_grow (length
);
5995 memcpy (result_chain
->address (), dr_chain
.address (),
5996 length
* sizeof (tree
));
5998 if (pow2p_hwi (length
) && vf
> 4)
6000 unsigned int j
, log_length
= exact_log2 (length
);
6001 for (i
= 0; i
< nelt
/ 2; ++i
)
6003 for (i
= 0; i
< nelt
/ 2; ++i
)
6004 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6005 vec_perm_indices
indices (sel
, 2, nelt
);
6006 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6008 if (dump_enabled_p ())
6009 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6010 "shuffle of 2 fields structure is not \
6011 supported by target\n");
6014 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6016 for (i
= 0; i
< nelt
/ 2; ++i
)
6018 for (i
= 0; i
< nelt
/ 2; ++i
)
6019 sel
[nelt
/ 2 + i
] = i
* 2;
6020 indices
.new_vector (sel
, 2, nelt
);
6021 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6023 if (dump_enabled_p ())
6024 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6025 "shuffle of 2 fields structure is not \
6026 supported by target\n");
6029 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6031 /* Generating permutation constant to shift all elements.
6032 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6033 for (i
= 0; i
< nelt
; i
++)
6034 sel
[i
] = nelt
/ 2 + i
;
6035 indices
.new_vector (sel
, 2, nelt
);
6036 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6038 if (dump_enabled_p ())
6039 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6040 "shift permutation is not supported by target\n");
6043 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6045 /* Generating permutation constant to select vector from 2.
6046 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6047 for (i
= 0; i
< nelt
/ 2; i
++)
6049 for (i
= nelt
/ 2; i
< nelt
; i
++)
6051 indices
.new_vector (sel
, 2, nelt
);
6052 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6054 if (dump_enabled_p ())
6055 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6056 "select is not supported by target\n");
6059 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6061 for (i
= 0; i
< log_length
; i
++)
6063 for (j
= 0; j
< length
; j
+= 2)
6065 first_vect
= dr_chain
[j
];
6066 second_vect
= dr_chain
[j
+ 1];
6068 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6069 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6070 first_vect
, first_vect
,
6072 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6075 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6076 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6077 second_vect
, second_vect
,
6079 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6082 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6083 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6084 vect
[0], vect
[1], shift1_mask
);
6085 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6086 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6088 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6089 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6090 vect
[0], vect
[1], select_mask
);
6091 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6092 (*result_chain
)[j
/2] = data_ref
;
6094 memcpy (dr_chain
.address (), result_chain
->address (),
6095 length
* sizeof (tree
));
6099 if (length
== 3 && vf
> 2)
6101 unsigned int k
= 0, l
= 0;
6103 /* Generating permutation constant to get all elements in rigth order.
6104 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6105 for (i
= 0; i
< nelt
; i
++)
6107 if (3 * k
+ (l
% 3) >= nelt
)
6110 l
+= (3 - (nelt
% 3));
6112 sel
[i
] = 3 * k
+ (l
% 3);
6115 vec_perm_indices
indices (sel
, 2, nelt
);
6116 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6120 "shuffle of 3 fields structure is not \
6121 supported by target\n");
6124 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6126 /* Generating permutation constant to shift all elements.
6127 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6128 for (i
= 0; i
< nelt
; i
++)
6129 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6130 indices
.new_vector (sel
, 2, nelt
);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6135 "shift permutation is not supported by target\n");
6138 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6140 /* Generating permutation constant to shift all elements.
6141 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6142 for (i
= 0; i
< nelt
; i
++)
6143 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6144 indices
.new_vector (sel
, 2, nelt
);
6145 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6147 if (dump_enabled_p ())
6148 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6149 "shift permutation is not supported by target\n");
6152 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6154 /* Generating permutation constant to shift all elements.
6155 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6156 for (i
= 0; i
< nelt
; i
++)
6157 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6158 indices
.new_vector (sel
, 2, nelt
);
6159 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6161 if (dump_enabled_p ())
6162 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6163 "shift permutation is not supported by target\n");
6166 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6168 /* Generating permutation constant to shift all elements.
6169 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6170 for (i
= 0; i
< nelt
; i
++)
6171 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6172 indices
.new_vector (sel
, 2, nelt
);
6173 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6175 if (dump_enabled_p ())
6176 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6177 "shift permutation is not supported by target\n");
6180 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6182 for (k
= 0; k
< 3; k
++)
6184 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6185 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6186 dr_chain
[k
], dr_chain
[k
],
6188 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6192 for (k
= 0; k
< 3; k
++)
6194 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6195 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6196 vect
[k
% 3], vect
[(k
+ 1) % 3],
6198 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6199 vect_shift
[k
] = data_ref
;
6202 for (k
= 0; k
< 3; k
++)
6204 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6205 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6206 vect_shift
[(4 - k
) % 3],
6207 vect_shift
[(3 - k
) % 3],
6209 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6213 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6215 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6216 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6217 vect
[0], shift3_mask
);
6218 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6219 (*result_chain
)[nelt
% 3] = data_ref
;
6221 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6222 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6223 vect
[1], shift4_mask
);
6224 vect_finish_stmt_generation (stmt_info
, perm_stmt
, gsi
);
6225 (*result_chain
)[0] = data_ref
;
6231 /* Function vect_transform_grouped_load.
6233 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6234 to perform their permutation and ascribe the result vectorized statements to
6235 the scalar statements.
6239 vect_transform_grouped_load (stmt_vec_info stmt_info
, vec
<tree
> dr_chain
,
6240 int size
, gimple_stmt_iterator
*gsi
)
6243 vec
<tree
> result_chain
= vNULL
;
6245 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6246 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6247 vectors, that are ready for vector computation. */
6248 result_chain
.create (size
);
6250 /* If reassociation width for vector type is 2 or greater target machine can
6251 execute 2 or more vector instructions in parallel. Otherwise try to
6252 get chain for loads group using vect_shift_permute_load_chain. */
6253 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
6254 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6256 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt_info
,
6257 gsi
, &result_chain
))
6258 vect_permute_load_chain (dr_chain
, size
, stmt_info
, gsi
, &result_chain
);
6259 vect_record_grouped_load_vectors (stmt_info
, result_chain
);
6260 result_chain
.release ();
6263 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6264 generated as part of the vectorization of STMT_INFO. Assign the statement
6265 for each vector to the associated scalar statement. */
6268 vect_record_grouped_load_vectors (stmt_vec_info stmt_info
,
6269 vec
<tree
> result_chain
)
6271 vec_info
*vinfo
= stmt_info
->vinfo
;
6272 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6273 unsigned int i
, gap_count
;
6276 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6277 Since we scan the chain starting from it's first node, their order
6278 corresponds the order of data-refs in RESULT_CHAIN. */
6279 stmt_vec_info next_stmt_info
= first_stmt_info
;
6281 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
6283 if (!next_stmt_info
)
6286 /* Skip the gaps. Loads created for the gaps will be removed by dead
6287 code elimination pass later. No need to check for the first stmt in
6288 the group, since it always exists.
6289 DR_GROUP_GAP is the number of steps in elements from the previous
6290 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6291 correspond to the gaps. */
6292 if (next_stmt_info
!= first_stmt_info
6293 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
6299 while (next_stmt_info
)
6301 stmt_vec_info new_stmt_info
= vinfo
->lookup_def (tmp_data_ref
);
6302 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6303 copies, and we put the new vector statement in the first available
6305 if (!STMT_VINFO_VEC_STMT (next_stmt_info
))
6306 STMT_VINFO_VEC_STMT (next_stmt_info
) = new_stmt_info
;
6309 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info
))
6311 stmt_vec_info prev_stmt_info
6312 = STMT_VINFO_VEC_STMT (next_stmt_info
);
6313 stmt_vec_info rel_stmt_info
6314 = STMT_VINFO_RELATED_STMT (prev_stmt_info
);
6315 while (rel_stmt_info
)
6317 prev_stmt_info
= rel_stmt_info
;
6318 rel_stmt_info
= STMT_VINFO_RELATED_STMT (rel_stmt_info
);
6321 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt_info
;
6325 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
6327 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6328 put the same TMP_DATA_REF as its vectorized statement; otherwise
6329 get the next data-ref from RESULT_CHAIN. */
6330 if (!next_stmt_info
|| !DR_GROUP_SAME_DR_STMT (next_stmt_info
))
6336 /* Function vect_force_dr_alignment_p.
6338 Returns whether the alignment of a DECL can be forced to be aligned
6339 on ALIGNMENT bit boundary. */
6342 vect_can_force_dr_alignment_p (const_tree decl
, poly_uint64 alignment
)
6347 if (decl_in_symtab_p (decl
)
6348 && !symtab_node::get (decl
)->can_increase_alignment_p ())
6351 if (TREE_STATIC (decl
))
6352 return (known_le (alignment
,
6353 (unsigned HOST_WIDE_INT
) MAX_OFILE_ALIGNMENT
));
6355 return (known_le (alignment
, (unsigned HOST_WIDE_INT
) MAX_STACK_ALIGNMENT
));
6359 /* Return whether the data reference DR_INFO is supported with respect to its
6361 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6362 it is aligned, i.e., check if it is possible to vectorize it with different
6365 enum dr_alignment_support
6366 vect_supportable_dr_alignment (dr_vec_info
*dr_info
,
6367 bool check_aligned_accesses
)
6369 data_reference
*dr
= dr_info
->dr
;
6370 stmt_vec_info stmt_info
= dr_info
->stmt
;
6371 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6372 machine_mode mode
= TYPE_MODE (vectype
);
6373 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6374 struct loop
*vect_loop
= NULL
;
6375 bool nested_in_vect_loop
= false;
6377 if (aligned_access_p (dr_info
) && !check_aligned_accesses
)
6380 /* For now assume all conditional loads/stores support unaligned
6381 access without any special code. */
6382 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
6383 if (gimple_call_internal_p (stmt
)
6384 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6385 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6386 return dr_unaligned_supported
;
6390 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6391 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
6394 /* Possibly unaligned access. */
6396 /* We can choose between using the implicit realignment scheme (generating
6397 a misaligned_move stmt) and the explicit realignment scheme (generating
6398 aligned loads with a REALIGN_LOAD). There are two variants to the
6399 explicit realignment scheme: optimized, and unoptimized.
6400 We can optimize the realignment only if the step between consecutive
6401 vector loads is equal to the vector size. Since the vector memory
6402 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6403 is guaranteed that the misalignment amount remains the same throughout the
6404 execution of the vectorized loop. Therefore, we can create the
6405 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6406 at the loop preheader.
6408 However, in the case of outer-loop vectorization, when vectorizing a
6409 memory access in the inner-loop nested within the LOOP that is now being
6410 vectorized, while it is guaranteed that the misalignment of the
6411 vectorized memory access will remain the same in different outer-loop
6412 iterations, it is *not* guaranteed that is will remain the same throughout
6413 the execution of the inner-loop. This is because the inner-loop advances
6414 with the original scalar step (and not in steps of VS). If the inner-loop
6415 step happens to be a multiple of VS, then the misalignment remains fixed
6416 and we can use the optimized realignment scheme. For example:
6422 When vectorizing the i-loop in the above example, the step between
6423 consecutive vector loads is 1, and so the misalignment does not remain
6424 fixed across the execution of the inner-loop, and the realignment cannot
6425 be optimized (as illustrated in the following pseudo vectorized loop):
6427 for (i=0; i<N; i+=4)
6428 for (j=0; j<M; j++){
6429 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6430 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6431 // (assuming that we start from an aligned address).
6434 We therefore have to use the unoptimized realignment scheme:
6436 for (i=0; i<N; i+=4)
6437 for (j=k; j<M; j+=4)
6438 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6439 // that the misalignment of the initial address is
6442 The loop can then be vectorized as follows:
6444 for (k=0; k<4; k++){
6445 rt = get_realignment_token (&vp[k]);
6446 for (i=0; i<N; i+=4){
6448 for (j=k; j<M; j+=4){
6450 va = REALIGN_LOAD <v1,v2,rt>;
6457 if (DR_IS_READ (dr
))
6459 bool is_packed
= false;
6460 tree type
= (TREE_TYPE (DR_REF (dr
)));
6462 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6463 && (!targetm
.vectorize
.builtin_mask_for_load
6464 || targetm
.vectorize
.builtin_mask_for_load ()))
6466 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6468 /* If we are doing SLP then the accesses need not have the
6469 same alignment, instead it depends on the SLP group size. */
6471 && STMT_SLP_TYPE (stmt_info
)
6472 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6474 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
6475 TYPE_VECTOR_SUBPARTS (vectype
)))
6477 else if (!loop_vinfo
6478 || (nested_in_vect_loop
6479 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
6480 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6481 return dr_explicit_realign
;
6483 return dr_explicit_realign_optimized
;
6485 if (!known_alignment_for_access_p (dr_info
))
6486 is_packed
= not_size_aligned (DR_REF (dr
));
6488 if (targetm
.vectorize
.support_vector_misalignment
6489 (mode
, type
, DR_MISALIGNMENT (dr_info
), is_packed
))
6490 /* Can't software pipeline the loads, but can at least do them. */
6491 return dr_unaligned_supported
;
6495 bool is_packed
= false;
6496 tree type
= (TREE_TYPE (DR_REF (dr
)));
6498 if (!known_alignment_for_access_p (dr_info
))
6499 is_packed
= not_size_aligned (DR_REF (dr
));
6501 if (targetm
.vectorize
.support_vector_misalignment
6502 (mode
, type
, DR_MISALIGNMENT (dr_info
), is_packed
))
6503 return dr_unaligned_supported
;
6507 return dr_unaligned_unsupported
;