gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
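/* For illustration: callers of this function query the real optabs
   vec_load_lanes_optab and vec_store_lanes_optab, so a hypothetical call

     vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
                                   vectype, 3)

   with a 4 x int VECTYPE asks whether the target has an array mode spanning
   three such vectors and a vec_load_lanes expander for it; on AArch64, for
   example, a group of three interleaved loads can then be done with a single
   LD3 instead of separate loads plus permutes.  */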
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
152 unsigned int i = 0;
153 if (gimple_call_internal_p (call))
155 internal_fn ifn = gimple_call_internal_fn (call);
156 if (internal_load_fn_p (ifn))
157 /* For loads the LHS type does the trick. */
158 i = ~0U;
159 else if (internal_store_fn_p (ifn))
161 /* For stores use the type of the stored value. */
162 i = internal_fn_stored_value_index (ifn);
163 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 i = ~0U;
166 else if (internal_fn_mask_index (ifn) == 0)
167 i = 1;
169 if (i < gimple_call_num_args (call))
171 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
174 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 if (rhs < lhs)
176 scalar_type = rhs_type;
181 return scalar_type;
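/* For illustration, consider the hypothetical loop

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   For the widening conversion the lhs type is 'int' but this function
   returns 'short', and it is the short elements that determine the
   vectorization factor (e.g. 8 elements per 128-bit vector rather than 4),
   which is exactly the promotion case described in the comment above.  */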
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186 tested at run-time. Return TRUE if DDR was successfully inserted.
187 Return false if versioning is not supported. */
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
194 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195 return opt_result::failure_at (vect_location,
196 "will not create alias checks, as"
197 " --param vect-max-version-for-alias-checks"
198 " == 0\n");
200 opt_result res
201 = runtime_alias_check_p (ddr, loop,
202 optimize_loop_nest_for_speed_p (loop));
203 if (!res)
204 return res;
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207 return opt_result::success ();
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
215 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216 for (unsigned int i = 0; i < checks.length (); ++i)
217 if (checks[i] == value)
218 return;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location,
222 "need run-time check that %T is nonzero\n",
223 value);
224 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229 DR_INFO_B. At least one of the accesses is a write. */
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
234 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
237 /* Single statements are always kept in their original order. */
238 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240 return true;
242 /* STMT_A and STMT_B belong to overlapping groups. All loads are
243 emitted at the position of the first scalar load.
244 Stores in a group are emitted at the position of the last scalar store.
245 Compute that position and check whether the resulting order matches
246 the current one. */
247 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248 if (il_a)
250 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 s = DR_GROUP_NEXT_ELEMENT (s))
253 il_a = get_later_stmt (il_a, s);
254 else /* DR_IS_READ */
255 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 s = DR_GROUP_NEXT_ELEMENT (s))
257 if (get_later_stmt (il_a, s) == il_a)
258 il_a = s;
260 else
261 il_a = stmtinfo_a;
262 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263 if (il_b)
265 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 s = DR_GROUP_NEXT_ELEMENT (s))
268 il_b = get_later_stmt (il_b, s);
269 else /* DR_IS_READ */
270 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 s = DR_GROUP_NEXT_ELEMENT (s))
272 if (get_later_stmt (il_b, s) == il_b)
273 il_b = s;
275 else
276 il_b = stmtinfo_b;
277 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
281 /* A subroutine of vect_analyze_data_ref_dependence. Handle
282 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283 distances. These distances are conservatively correct but they don't
284 reflect a guaranteed dependence.
286 Return true if this function does all the work necessary to avoid
287 an alias or false if the caller should use the dependence distances
288 to limit the vectorization factor in the usual way. LOOP_DEPTH is
289 the depth of the loop described by LOOP_VINFO and the other arguments
290 are as for vect_analyze_data_ref_dependence. */
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 int loop_depth, unsigned int *max_vf)
297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
300 int dist = dist_v[loop_depth];
301 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
303 /* If the user asserted safelen >= DIST consecutive iterations
304 can be executed concurrently, assume independence.
306 ??? An alternative would be to add the alias check even
307 in this case, and vectorize the fallback loop with the
308 maximum VF set to safelen. However, if the user has
309 explicitly given a length, it's less likely that that
310 would be a win. */
311 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
313 if ((unsigned int) loop->safelen < *max_vf)
314 *max_vf = loop->safelen;
315 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 continue;
319 /* For dependence distances of 2 or more, we have the option
320 of limiting VF or checking for an alias at runtime.
321 Prefer to check at runtime if we can, to avoid limiting
322 the VF unnecessarily when the bases are in fact independent.
324 Note that the alias checks will be removed if the VF ends up
325 being small enough. */
326 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
333 return true;
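/* For illustration, a hypothetical case handled here:

     struct s { int x[100]; };
     void f (struct s *a, struct s *b, int n)
     {
       for (int i = 0; i < n; i++)
         a->x[i + 4] += b->x[i];
     }

   If A and B point to the same object the dependence distance is 4, which
   would cap the vectorization factor at 4; if they are distinct there is no
   dependence at all.  Because the bases only might be the same, the code
   above prefers a runtime alias check over limiting *MAX_VF, unless safelen
   already covers the distance or a gather/scatter is involved.  */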
337 /* Function vect_analyze_data_ref_dependence.
339 FIXME: I needed to change the sense of the returned flag.
341 Return FALSE if there (might) exist a dependence between a memory-reference
342 DRA and a memory-reference DRB. When versioning for alias may check a
343 dependence at run-time, return TRUE. Adjust *MAX_VF according to
344 the data dependence. */
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 loop_vec_info loop_vinfo,
349 unsigned int *max_vf)
351 unsigned int i;
352 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353 struct data_reference *dra = DDR_A (ddr);
354 struct data_reference *drb = DDR_B (ddr);
355 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359 lambda_vector dist_v;
360 unsigned int loop_depth;
362 /* In loop analysis all data references should be vectorizable. */
363 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
364 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
365 gcc_unreachable ();
367 /* Independent data accesses. */
368 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
369 return opt_result::success ();
371 if (dra == drb
372 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
373 return opt_result::success ();
375 /* We do not have to consider dependences between accesses that belong
376 to the same group, unless the stride could be smaller than the
377 group size. */
378 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
379 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
380 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
381 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
382 return opt_result::success ();
384 /* Even if we have an anti-dependence then, as the vectorized loop covers at
385 least two scalar iterations, there is always also a true dependence.
386 As the vectorizer does not re-order loads and stores we can ignore
387 the anti-dependence if TBAA can disambiguate both DRs similar to the
388 case with known negative distance anti-dependences (positive
389 distance anti-dependences would violate TBAA constraints). */
390 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
391 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
392 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
393 get_alias_set (DR_REF (drb))))
394 return opt_result::success ();
396 /* Unknown data dependence. */
397 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
399 /* If user asserted safelen consecutive iterations can be
400 executed concurrently, assume independence. */
401 if (loop->safelen >= 2)
403 if ((unsigned int) loop->safelen < *max_vf)
404 *max_vf = loop->safelen;
405 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
406 return opt_result::success ();
409 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
410 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 return opt_result::failure_at
412 (stmtinfo_a->stmt,
413 "versioning for alias not supported for: "
414 "can't determine dependence between %T and %T\n",
415 DR_REF (dra), DR_REF (drb));
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
419 "versioning for alias required: "
420 "can't determine dependence between %T and %T\n",
421 DR_REF (dra), DR_REF (drb));
423 /* Add to list of ddrs that need to be tested at run-time. */
424 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
427 /* Known data dependence. */
428 if (DDR_NUM_DIST_VECTS (ddr) == 0)
430 /* If user asserted safelen consecutive iterations can be
431 executed concurrently, assume independence. */
432 if (loop->safelen >= 2)
434 if ((unsigned int) loop->safelen < *max_vf)
435 *max_vf = loop->safelen;
436 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
437 return opt_result::success ();
440 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
441 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
442 return opt_result::failure_at
443 (stmtinfo_a->stmt,
444 "versioning for alias not supported for: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
448 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
450 "versioning for alias required: "
451 "bad dist vector for %T and %T\n",
452 DR_REF (dra), DR_REF (drb));
453 /* Add to list of ddrs that need to be tested at run-time. */
454 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
457 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
459 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
460 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
461 loop_depth, max_vf))
462 return opt_result::success ();
464 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
466 int dist = dist_v[loop_depth];
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance = %d.\n", dist);
472 if (dist == 0)
474 if (dump_enabled_p ())
475 dump_printf_loc (MSG_NOTE, vect_location,
476 "dependence distance == 0 between %T and %T\n",
477 DR_REF (dra), DR_REF (drb));
479 /* When we perform grouped accesses and perform implicit CSE
480 by detecting equal accesses and doing disambiguation with
481 runtime alias tests like for
482 .. = a[i];
483 .. = a[i+1];
484 a[i] = ..;
485 a[i+1] = ..;
486 *p = ..;
487 .. = a[i];
488 .. = a[i+1];
489 where we will end up loading { a[i], a[i+1] } once, make
490 sure that inserting group loads before the first load and
491 stores after the last store will do the right thing.
492 Similar for groups like
493 a[i] = ...;
494 ... = a[i];
495 a[i+1] = ...;
496 where loads from the group interleave with the store. */
497 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
498 return opt_result::failure_at (stmtinfo_a->stmt,
499 "READ_WRITE dependence"
500 " in interleaving.\n");
502 if (loop->safelen < 2)
504 tree indicator = dr_zero_step_indicator (dra);
505 if (!indicator || integer_zerop (indicator))
506 return opt_result::failure_at (stmtinfo_a->stmt,
507 "access also has a zero step\n");
508 else if (TREE_CODE (indicator) != INTEGER_CST)
509 vect_check_nonzero_value (loop_vinfo, indicator);
511 continue;
514 if (dist > 0 && DDR_REVERSED_P (ddr))
516 /* If DDR_REVERSED_P the order of the data-refs in DDR was
517 reversed (to make distance vector positive), and the actual
518 distance is negative. */
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance negative.\n");
522 /* When doing outer loop vectorization, we need to check if there is
523 a backward dependence at the inner loop level if the dependence
524 at the outer loop is reversed. See PR81740. */
525 if (nested_in_vect_loop_p (loop, stmtinfo_a)
526 || nested_in_vect_loop_p (loop, stmtinfo_b))
528 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
529 DDR_LOOP_NEST (ddr));
530 if (dist_v[inner_depth] < 0)
531 return opt_result::failure_at (stmtinfo_a->stmt,
532 "not vectorized, dependence "
533 "between data-refs %T and %T\n",
534 DR_REF (dra), DR_REF (drb));
536 /* Record a negative dependence distance to later limit the
537 amount of stmt copying / unrolling we can perform.
538 Only need to handle read-after-write dependence. */
539 if (DR_IS_READ (drb)
540 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
541 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
542 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
543 continue;
546 unsigned int abs_dist = abs (dist);
547 if (abs_dist >= 2 && abs_dist < *max_vf)
549 /* The dependence distance requires reduction of the maximal
550 vectorization factor. */
551 *max_vf = abs_dist;
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "adjusting maximal vectorization factor to %i\n",
555 *max_vf);
558 if (abs_dist >= *max_vf)
560 /* Dependence distance does not create dependence, as far as
561 vectorization is concerned, in this case. */
562 if (dump_enabled_p ())
563 dump_printf_loc (MSG_NOTE, vect_location,
564 "dependence distance >= VF.\n");
565 continue;
568 return opt_result::failure_at (stmtinfo_a->stmt,
569 "not vectorized, possible dependence "
570 "between data-refs %T and %T\n",
571 DR_REF (dra), DR_REF (drb));
574 return opt_result::success ();
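/* For illustration: in a hypothetical loop

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] * c;

   there is a read-after-write dependence with distance 4.  A vector
   iteration of up to 4 elements only reads values stored in earlier vector
   iterations, so *MAX_VF is reduced to 4 instead of the loop being
   rejected; a distance >= *MAX_VF needs no adjustment at all, while a
   distance of 1 falls through to the failure at the end.  */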
577 /* Function vect_analyze_data_ref_dependences.
579 Examine all the data references in the loop, and make sure there do not
580 exist any data dependences between them. Set *MAX_VF according to
581 the maximum vectorization factor the data dependences allow. */
583 opt_result
584 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
585 unsigned int *max_vf)
587 unsigned int i;
588 struct data_dependence_relation *ddr;
590 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
592 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
594 LOOP_VINFO_DDRS (loop_vinfo)
595 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
596 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
597 /* We do not need read-read dependences. */
598 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
599 &LOOP_VINFO_DDRS (loop_vinfo),
600 LOOP_VINFO_LOOP_NEST (loop_vinfo),
601 false);
602 gcc_assert (res);
605 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
607 /* For epilogues we either have no aliases or alias versioning
608 was applied to the original loop. Therefore we may just get max_vf
609 using the VF of the original loop. */
610 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
611 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
612 else
613 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
615 opt_result res
616 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
617 if (!res)
618 return res;
621 return opt_result::success ();
625 /* Function vect_slp_analyze_data_ref_dependence.
627 Return TRUE if there (might) exist a dependence between a memory-reference
628 DRA and a memory-reference DRB for VINFO. When versioning for alias
629 may check a dependence at run-time, return FALSE. */
632 static bool
633 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
634 struct data_dependence_relation *ddr)
636 struct data_reference *dra = DDR_A (ddr);
637 struct data_reference *drb = DDR_B (ddr);
638 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
639 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
641 /* We need to check dependences of statements marked as unvectorizable
642 as well; they can still prohibit vectorization. */
644 /* Independent data accesses. */
645 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
646 return false;
648 if (dra == drb)
649 return false;
651 /* Read-read is OK. */
652 if (DR_IS_READ (dra) && DR_IS_READ (drb))
653 return false;
655 /* If dra and drb are part of the same interleaving chain consider
656 them independent. */
657 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
658 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
659 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
660 return false;
662 /* Unknown data dependence. */
663 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "can't determine dependence between %T and %T\n",
668 DR_REF (dra), DR_REF (drb));
670 else if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "determined dependence between %T and %T\n",
673 DR_REF (dra), DR_REF (drb));
675 return true;
679 /* Analyze dependences involved in the transform of SLP NODE. STORES
680 contain the vector of scalar stores of this instance if we are
681 disambiguating the loads. */
683 static bool
684 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
685 vec<stmt_vec_info> stores,
686 stmt_vec_info last_store_info)
688 /* This walks over all stmts involved in the SLP load/store done
689 in NODE verifying we can sink them up to the last stmt in the
690 group. */
691 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
693 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
694 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
696 stmt_vec_info access_info
697 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
698 if (access_info == last_access_info)
699 continue;
700 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
701 ao_ref ref;
702 bool ref_initialized_p = false;
703 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
704 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
706 gimple *stmt = gsi_stmt (gsi);
707 if (! gimple_vuse (stmt))
708 continue;
710 /* If we couldn't record a (single) data reference for this
711 stmt we have to resort to the alias oracle. */
712 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
713 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
714 if (!dr_b)
716 /* We are moving a store - this means
717 we cannot use TBAA for disambiguation. */
718 if (!ref_initialized_p)
719 ao_ref_init (&ref, DR_REF (dr_a));
720 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
721 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
722 return false;
723 continue;
726 bool dependent = false;
727 /* If we run into a store of this same instance (we've just
728 marked those) then delay dependence checking until we run
729 into the last store because this is where it will have
730 been sunk to (and we verify if we can do that as well). */
731 if (gimple_visited_p (stmt))
733 if (stmt_info != last_store_info)
734 continue;
736 for (stmt_vec_info &store_info : stores)
738 data_reference *store_dr
739 = STMT_VINFO_DATA_REF (store_info);
740 ddr_p ddr = initialize_data_dependence_relation
741 (dr_a, store_dr, vNULL);
742 dependent
743 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
744 free_dependence_relation (ddr);
745 if (dependent)
746 break;
749 else
751 ddr_p ddr = initialize_data_dependence_relation (dr_a,
752 dr_b, vNULL);
753 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
754 free_dependence_relation (ddr);
756 if (dependent)
757 return false;
761 else /* DR_IS_READ */
763 stmt_vec_info first_access_info
764 = vect_find_first_scalar_stmt_in_slp (node);
765 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
767 stmt_vec_info access_info
768 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
769 if (access_info == first_access_info)
770 continue;
771 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
772 ao_ref ref;
773 bool ref_initialized_p = false;
774 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
775 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
777 gimple *stmt = gsi_stmt (gsi);
778 if (! gimple_vdef (stmt))
779 continue;
781 /* If we couldn't record a (single) data reference for this
782 stmt we have to resort to the alias oracle. */
783 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
784 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
786 /* We are hoisting a load - this means we can use
787 TBAA for disambiguation. */
788 if (!ref_initialized_p)
789 ao_ref_init (&ref, DR_REF (dr_a));
790 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
792 if (!dr_b)
793 return false;
794 /* Resort to dependence checking below. */
796 else
797 /* No dependence. */
798 continue;
800 bool dependent = false;
801 /* If we run into a store of this same instance (we've just
802 marked those) then delay dependence checking until we run
803 into the last store because this is where it will have
804 been sunk to (and we verify if we can do that as well). */
805 if (gimple_visited_p (stmt))
807 if (stmt_info != last_store_info)
808 continue;
810 for (stmt_vec_info &store_info : stores)
812 data_reference *store_dr
813 = STMT_VINFO_DATA_REF (store_info);
814 ddr_p ddr = initialize_data_dependence_relation
815 (dr_a, store_dr, vNULL);
816 dependent
817 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
818 free_dependence_relation (ddr);
819 if (dependent)
820 break;
823 else
825 ddr_p ddr = initialize_data_dependence_relation (dr_a,
826 dr_b, vNULL);
827 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
828 free_dependence_relation (ddr);
830 if (dependent)
831 return false;
835 return true;
839 /* Function vect_slp_analyze_instance_dependence.
841 Examine all the data references in SLP instance INSTANCE, and make sure
842 there do not exist any data dependences between them. Return true
843 if that is the case and false otherwise. */
845 bool
846 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
848 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
850 /* The stores of this instance are at the root of the SLP tree. */
851 slp_tree store = NULL;
852 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
853 store = SLP_INSTANCE_TREE (instance);
855 /* Verify we can sink stores to the vectorized stmt insert location. */
856 stmt_vec_info last_store_info = NULL;
857 if (store)
859 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
860 return false;
862 /* Mark stores in this instance and remember the last one. */
863 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
864 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
865 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
868 bool res = true;
870 /* Verify we can sink loads to the vectorized stmt insert location,
871 special-casing stores of this instance. */
872 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
873 if (! vect_slp_analyze_node_dependences (vinfo, load,
874 store
875 ? SLP_TREE_SCALAR_STMTS (store)
876 : vNULL, last_store_info))
878 res = false;
879 break;
882 /* Unset the visited flag. */
883 if (store)
884 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
885 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
887 return res;
890 /* Return the misalignment of DR_INFO accessed in VECTYPE. */
893 dr_misalignment (dr_vec_info *dr_info, tree vectype)
895 HOST_WIDE_INT diff = 0;
896 /* Alignment is only analyzed for the first element of a DR group,
897 use that but adjust misalignment by the offset of the access. */
898 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
900 dr_vec_info *first_dr
901 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
902 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
903 INTEGER_CSTs and the first element in the group has the lowest
904 address. */
905 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
906 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
907 gcc_assert (diff >= 0);
908 dr_info = first_dr;
911 int misalign = dr_info->misalignment;
912 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
913 if (misalign == DR_MISALIGNMENT_UNKNOWN)
914 return misalign;
916 /* If the access is only aligned for a vector type with smaller alignment
917 requirement the access has unknown misalignment. */
918 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
919 targetm.vectorize.preferred_vector_alignment (vectype)))
920 return DR_MISALIGNMENT_UNKNOWN;
922 /* If this is a backward running DR then first access in the larger
923 vectype actually is N-1 elements before the address in the DR.
924 Adjust misalign accordingly. */
925 poly_int64 misalignment = misalign + diff;
926 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) < 0)
927 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
928 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
930 /* vect_compute_data_ref_alignment will have ensured that target_alignment
931 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
932 unsigned HOST_WIDE_INT target_alignment_c
933 = dr_info->target_alignment.to_constant ();
934 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
935 return DR_MISALIGNMENT_UNKNOWN;
936 return misalign;
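/* Illustrative arithmetic, assuming a 16-byte target alignment: for a group
   of two 4-byte loads a[2*i] and a[2*i+1] whose first element is misaligned
   by 8 bytes, the second element adds DIFF = 4, giving (8 + 4) % 16 = 12.
   For a backward-running access with 4-element vectors the access really
   starts 3 elements (12 bytes) before the DR's address, which is what the
   negative-step adjustment above subtracts before the final remainder is
   taken.  */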
939 /* Record the base alignment guarantee given by DRB, which occurs
940 in STMT_INFO. */
942 static void
943 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
944 innermost_loop_behavior *drb)
946 bool existed;
947 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
948 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
949 if (!existed || entry.second->base_alignment < drb->base_alignment)
951 entry = std::make_pair (stmt_info, drb);
952 if (dump_enabled_p ())
953 dump_printf_loc (MSG_NOTE, vect_location,
954 "recording new base alignment for %T\n"
955 " alignment: %d\n"
956 " misalignment: %d\n"
957 " based on: %G",
958 drb->base_address,
959 drb->base_alignment,
960 drb->base_misalignment,
961 stmt_info->stmt);
965 /* If the region we're going to vectorize is reached, all unconditional
966 data references occur at least once. We can therefore pool the base
967 alignment guarantees from each unconditional reference. Do this by
968 going through all the data references in VINFO and checking whether
969 the containing statement makes the reference unconditionally. If so,
970 record the alignment of the base address in VINFO so that it can be
971 used for all other references with the same base. */
973 void
974 vect_record_base_alignments (vec_info *vinfo)
976 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
977 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
978 for (data_reference *dr : vinfo->shared->datarefs)
980 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
981 stmt_vec_info stmt_info = dr_info->stmt;
982 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
983 && STMT_VINFO_VECTORIZABLE (stmt_info)
984 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
986 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
988 /* If DR is nested in the loop that is being vectorized, we can also
989 record the alignment of the base wrt the outer loop. */
990 if (loop && nested_in_vect_loop_p (loop, stmt_info))
991 vect_record_base_alignment
992 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
997 /* Function vect_compute_data_ref_alignment
999 Compute the misalignment of the data reference DR_INFO when vectorizing
1000 with VECTYPE.
1002 Output:
1003 1. initialized misalignment info for DR_INFO
1005 FOR NOW: No analysis is actually performed. Misalignment is calculated
1006 only for trivial cases. TODO. */
1008 static void
1009 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1010 tree vectype)
1012 stmt_vec_info stmt_info = dr_info->stmt;
1013 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1014 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1015 class loop *loop = NULL;
1016 tree ref = DR_REF (dr_info->dr);
1018 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_NOTE, vect_location,
1020 "vect_compute_data_ref_alignment:\n");
1022 if (loop_vinfo)
1023 loop = LOOP_VINFO_LOOP (loop_vinfo);
1025 /* Initialize misalignment to unknown. */
1026 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1028 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1029 return;
1031 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1032 bool step_preserves_misalignment_p;
1034 poly_uint64 vector_alignment
1035 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1036 BITS_PER_UNIT);
1037 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1039 /* If the main loop has peeled for alignment we have no way of knowing
1040 whether the data accesses in the epilogues are aligned. We can't at
1041 compile time answer the question whether we have entered the main loop or
1042 not. Fixes PR 92351. */
1043 if (loop_vinfo)
1045 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1046 if (orig_loop_vinfo
1047 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1048 return;
1051 unsigned HOST_WIDE_INT vect_align_c;
1052 if (!vector_alignment.is_constant (&vect_align_c))
1053 return;
1055 /* No step for BB vectorization. */
1056 if (!loop)
1058 gcc_assert (integer_zerop (drb->step));
1059 step_preserves_misalignment_p = true;
1062 /* In case the dataref is in an inner-loop of the loop that is being
1063 vectorized (LOOP), we use the base and misalignment information
1064 relative to the outer-loop (LOOP). This is ok only if the misalignment
1065 stays the same throughout the execution of the inner-loop, which is why
1066 we have to check that the stride of the dataref in the inner-loop evenly
1067 divides by the vector alignment. */
1068 else if (nested_in_vect_loop_p (loop, stmt_info))
1070 step_preserves_misalignment_p
1071 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1073 if (dump_enabled_p ())
1075 if (step_preserves_misalignment_p)
1076 dump_printf_loc (MSG_NOTE, vect_location,
1077 "inner step divides the vector alignment.\n");
1078 else
1079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1080 "inner step doesn't divide the vector"
1081 " alignment.\n");
1085 /* Similarly we can only use base and misalignment information relative to
1086 an innermost loop if the misalignment stays the same throughout the
1087 execution of the loop. As above, this is the case if the stride of
1088 the dataref evenly divides by the alignment. */
1089 else
1091 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1092 step_preserves_misalignment_p
1093 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1095 if (!step_preserves_misalignment_p && dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "step doesn't divide the vector alignment.\n");
1100 unsigned int base_alignment = drb->base_alignment;
1101 unsigned int base_misalignment = drb->base_misalignment;
1103 /* Calculate the maximum of the pooled base address alignment and the
1104 alignment that we can compute for DR itself. */
1105 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1106 = base_alignments->get (drb->base_address);
1107 if (entry
1108 && base_alignment < (*entry).second->base_alignment
1109 && (loop_vinfo
1110 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1111 gimple_bb (entry->first->stmt))
1112 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1113 || (entry->first->dr_aux.group <= dr_info->group)))))
1115 base_alignment = entry->second->base_alignment;
1116 base_misalignment = entry->second->base_misalignment;
1119 if (drb->offset_alignment < vect_align_c
1120 || !step_preserves_misalignment_p
1121 /* We need to know whether the step wrt the vectorized loop is
1122 negative when computing the starting misalignment below. */
1123 || TREE_CODE (drb->step) != INTEGER_CST)
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "Unknown alignment for access: %T\n", ref);
1128 return;
1131 if (base_alignment < vect_align_c)
1133 unsigned int max_alignment;
1134 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1135 if (max_alignment < vect_align_c
1136 || !vect_can_force_dr_alignment_p (base,
1137 vect_align_c * BITS_PER_UNIT))
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_NOTE, vect_location,
1141 "can't force alignment of ref: %T\n", ref);
1142 return;
1145 /* Force the alignment of the decl.
1146 NOTE: This is the only change to the code we make during
1147 the analysis phase, before deciding to vectorize the loop. */
1148 if (dump_enabled_p ())
1149 dump_printf_loc (MSG_NOTE, vect_location,
1150 "force alignment of %T\n", ref);
1152 dr_info->base_decl = base;
1153 dr_info->base_misaligned = true;
1154 base_misalignment = 0;
1156 poly_int64 misalignment
1157 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1159 unsigned int const_misalignment;
1160 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1162 if (dump_enabled_p ())
1163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1164 "Non-constant misalignment for access: %T\n", ref);
1165 return;
1168 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1172 "misalign = %d bytes of ref %T\n",
1173 const_misalignment, ref);
1175 return;
1178 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1179 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1180 is made aligned via peeling. */
1182 static bool
1183 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1184 dr_vec_info *dr_peel_info)
1186 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1187 DR_TARGET_ALIGNMENT (dr_info)))
1189 poly_offset_int diff
1190 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1191 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1192 if (known_eq (diff, 0)
1193 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1194 return true;
1196 return false;
1199 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1200 aligned via peeling. */
1202 static bool
1203 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1204 dr_vec_info *dr_peel_info)
1206 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1207 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1208 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1209 DR_OFFSET (dr_peel_info->dr), 0)
1210 || !operand_equal_p (DR_STEP (dr_info->dr),
1211 DR_STEP (dr_peel_info->dr), 0))
1212 return false;
1214 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1217 /* Compute the value for dr_info->misalign so that the access appears
1218 aligned. This is used by peeling to compensate for dr_misalignment
1219 applying the offset for negative step. */
1222 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1224 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1225 return 0;
1227 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1228 poly_int64 misalignment
1229 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1230 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1232 unsigned HOST_WIDE_INT target_alignment_c;
1233 int misalign;
1234 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1235 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1236 return DR_MISALIGNMENT_UNKNOWN;
1237 return misalign;
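/* For illustration, assuming 4-element vectors of 4-byte ints and a 16-byte
   target alignment: a backward-running DR whose recorded misalignment is
   (4 - 1) * 4 = 12 has its vector access start 12 bytes earlier, i.e. on
   the 16-byte boundary itself, so 12 (not 0) is the value that makes
   dr_misalignment report the access as aligned after peeling.  */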
1240 /* Function vect_update_misalignment_for_peel.
1241 Sets DR_INFO's misalignment
1242 - to 0 if it has the same alignment as DR_PEEL_INFO,
1243 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1244 - to -1 (unknown) otherwise.
1246 DR_INFO - the data reference whose misalignment is to be adjusted.
1247 DR_PEEL_INFO - the data reference whose misalignment is being made
1248 zero in the vector loop by the peel.
1249 NPEEL - the number of iterations in the peel loop if the misalignment
1250 of DR_PEEL_INFO is known at compile time. */
1252 static void
1253 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1254 dr_vec_info *dr_peel_info, int npeel)
1256 /* If dr_info is aligned when dr_peel_info is, then mark it so. */
1257 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1259 SET_DR_MISALIGNMENT (dr_info,
1260 vect_dr_misalign_for_aligned_access (dr_peel_info));
1261 return;
1264 unsigned HOST_WIDE_INT alignment;
1265 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1266 && known_alignment_for_access_p (dr_info,
1267 STMT_VINFO_VECTYPE (dr_info->stmt))
1268 && known_alignment_for_access_p (dr_peel_info,
1269 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1271 int misal = dr_info->misalignment;
1272 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1273 misal &= alignment - 1;
1274 set_dr_misalignment (dr_info, misal);
1275 return;
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1280 "to unknown (-1).\n");
1281 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1284 /* Return true if alignment is relevant for DR_INFO. */
1286 static bool
1287 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1289 stmt_vec_info stmt_info = dr_info->stmt;
1291 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1292 return false;
1294 /* For interleaving, only the alignment of the first access matters. */
1295 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1296 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1297 return false;
1299 /* Scatter-gather and invariant accesses continue to address individual
1300 scalars, so vector-level alignment is irrelevant. */
1301 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1302 || integer_zerop (DR_STEP (dr_info->dr)))
1303 return false;
1305 /* Strided accesses perform only component accesses, alignment is
1306 irrelevant for them. */
1307 if (STMT_VINFO_STRIDED_P (stmt_info)
1308 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1309 return false;
1311 return true;
1314 /* Given a memory reference EXP, return whether its alignment is less
1315 than its size. */
1317 static bool
1318 not_size_aligned (tree exp)
1320 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1321 return true;
1323 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1324 > get_object_alignment (exp));
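/* For illustration: an 8-byte 'long long' member of a struct declared with
   __attribute__((packed)) has TYPE_SIZE of 64 bits while
   get_object_alignment may only guarantee 8 bits, so the reference is "not
   size aligned" and is treated as a packed access by the caller below.  */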
1327 /* Function vector_alignment_reachable_p
1329 Return true if vector alignment for DR_INFO is reachable by peeling
1330 a few loop iterations. Return false otherwise. */
1332 static bool
1333 vector_alignment_reachable_p (dr_vec_info *dr_info)
1335 stmt_vec_info stmt_info = dr_info->stmt;
1336 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1338 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1340 /* For interleaved accesses we peel only if the number of iterations in
1341 the prolog loop ({VF - misalignment}) is a multiple of the
1342 number of interleaved accesses. */
1343 int elem_size, mis_in_elements;
1345 /* FORNOW: handle only known alignment. */
1346 if (!known_alignment_for_access_p (dr_info, vectype))
1347 return false;
1349 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1350 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1351 elem_size = vector_element_size (vector_size, nelements);
1352 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1354 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1355 return false;
1358 /* If misalignment is known at the compile time then allow peeling
1359 only if natural alignment is reachable through peeling. */
1360 if (known_alignment_for_access_p (dr_info, vectype)
1361 && !aligned_access_p (dr_info, vectype))
1363 HOST_WIDE_INT elmsize =
1364 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1365 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_NOTE, vect_location,
1368 "data size = %wd. misalignment = %d.\n", elmsize,
1369 dr_misalignment (dr_info, vectype));
1371 if (dr_misalignment (dr_info, vectype) % elmsize)
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 "data size does not divide the misalignment.\n");
1376 return false;
1380 if (!known_alignment_for_access_p (dr_info, vectype))
1382 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1383 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1384 if (dump_enabled_p ())
1385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1386 "Unknown misalignment, %snaturally aligned\n",
1387 is_packed ? "not " : "");
1388 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1391 return true;
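/* Illustrative example, assuming 4-byte elements and 16-byte vectors: a
   known misalignment of 8 bytes is fixed by peeling (16 - 8) / 4 = 2 scalar
   iterations, so alignment is reachable.  A misalignment of 6 bytes is not
   a multiple of the element size and can never be fixed by peeling whole
   iterations, which is the case rejected above.  */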
1395 /* Calculate the cost of the memory access represented by DR_INFO. */
1397 static void
1398 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1399 unsigned int *inside_cost,
1400 unsigned int *outside_cost,
1401 stmt_vector_for_cost *body_cost_vec,
1402 stmt_vector_for_cost *prologue_cost_vec)
1404 stmt_vec_info stmt_info = dr_info->stmt;
1405 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1406 int ncopies;
1408 if (PURE_SLP_STMT (stmt_info))
1409 ncopies = 1;
1410 else
1411 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1413 if (DR_IS_READ (dr_info->dr))
1414 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1415 outside_cost, prologue_cost_vec, body_cost_vec, false);
1416 else
1417 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_NOTE, vect_location,
1421 "vect_get_data_access_cost: inside_cost = %d, "
1422 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1426 typedef struct _vect_peel_info
1428 dr_vec_info *dr_info;
1429 int npeel;
1430 unsigned int count;
1431 } *vect_peel_info;
1433 typedef struct _vect_peel_extended_info
1435 vec_info *vinfo;
1436 struct _vect_peel_info peel_info;
1437 unsigned int inside_cost;
1438 unsigned int outside_cost;
1439 } *vect_peel_extended_info;
1442 /* Peeling hashtable helpers. */
1444 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1446 static inline hashval_t hash (const _vect_peel_info *);
1447 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1450 inline hashval_t
1451 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1453 return (hashval_t) peel_info->npeel;
1456 inline bool
1457 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1459 return (a->npeel == b->npeel);
1463 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1465 static void
1466 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1467 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1468 int npeel)
1470 struct _vect_peel_info elem, *slot;
1471 _vect_peel_info **new_slot;
1472 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1473 bool supportable_dr_alignment
1474 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, true);
1476 elem.npeel = npeel;
1477 slot = peeling_htab->find (&elem);
1478 if (slot)
1479 slot->count++;
1480 else
1482 slot = XNEW (struct _vect_peel_info);
1483 slot->npeel = npeel;
1484 slot->dr_info = dr_info;
1485 slot->count = 1;
1486 new_slot = peeling_htab->find_slot (slot, INSERT);
1487 *new_slot = slot;
1490 if (!supportable_dr_alignment
1491 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1492 slot->count += VECT_MAX_COST;
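/* For illustration: if peeling 2 iterations would align both a load and a
   store, both DRs hash to the NPEEL == 2 slot and its COUNT becomes 2,
   making that peeling amount more attractive when the table is traversed
   later.  A DR whose misaligned form the target cannot handle at all gets
   VECT_MAX_COST added to its slot under the unlimited cost model, so
   aligning that particular DR is strongly preferred.  */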
1496 /* Traverse peeling hash table to find peeling option that aligns maximum
1497 number of data accesses. */
1500 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1501 _vect_peel_extended_info *max)
1503 vect_peel_info elem = *slot;
1505 if (elem->count > max->peel_info.count
1506 || (elem->count == max->peel_info.count
1507 && max->peel_info.npeel > elem->npeel))
1509 max->peel_info.npeel = elem->npeel;
1510 max->peel_info.count = elem->count;
1511 max->peel_info.dr_info = elem->dr_info;
1514 return 1;
1517 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1518 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1519 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1520 after peeling. */
1522 static void
1523 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1524 dr_vec_info *dr0_info,
1525 unsigned int *inside_cost,
1526 unsigned int *outside_cost,
1527 stmt_vector_for_cost *body_cost_vec,
1528 stmt_vector_for_cost *prologue_cost_vec,
1529 unsigned int npeel,
1530 bool unknown_misalignment)
1532 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1534 for (data_reference *dr : datarefs)
1536 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1537 if (!vect_relevant_for_alignment_p (dr_info))
1538 continue;
1540 int save_misalignment;
1541 save_misalignment = dr_info->misalignment;
1542 if (npeel == 0)
1544 else if (unknown_misalignment && dr_info == dr0_info)
1545 SET_DR_MISALIGNMENT (dr_info,
1546 vect_dr_misalign_for_aligned_access (dr0_info));
1547 else
1548 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1549 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1550 body_cost_vec, prologue_cost_vec);
1551 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1555 /* Traverse peeling hash table and calculate cost for each peeling option.
1556 Find the one with the lowest cost. */
1559 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1560 _vect_peel_extended_info *min)
1562 vect_peel_info elem = *slot;
1563 int dummy;
1564 unsigned int inside_cost = 0, outside_cost = 0;
1565 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1566 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1567 epilogue_cost_vec;
1569 prologue_cost_vec.create (2);
1570 body_cost_vec.create (2);
1571 epilogue_cost_vec.create (2);
1573 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1574 &outside_cost, &body_cost_vec,
1575 &prologue_cost_vec, elem->npeel, false);
1577 body_cost_vec.release ();
1579 outside_cost += vect_get_known_peeling_cost
1580 (loop_vinfo, elem->npeel, &dummy,
1581 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1582 &prologue_cost_vec, &epilogue_cost_vec);
1584 /* Prologue and epilogue costs are added to the target model later.
1585 These costs depend only on the scalar iteration cost, the
1586 number of peeling iterations finally chosen, and the number of
1587 misaligned statements. So discard the information found here. */
1588 prologue_cost_vec.release ();
1589 epilogue_cost_vec.release ();
1591 if (inside_cost < min->inside_cost
1592 || (inside_cost == min->inside_cost
1593 && outside_cost < min->outside_cost))
1595 min->inside_cost = inside_cost;
1596 min->outside_cost = outside_cost;
1597 min->peel_info.dr_info = elem->dr_info;
1598 min->peel_info.npeel = elem->npeel;
1599 min->peel_info.count = elem->count;
1602 return 1;
1606 /* Choose best peeling option by traversing peeling hash table and either
1607 choosing an option with the lowest cost (if cost model is enabled) or the
1608 option that aligns as many accesses as possible. */
1610 static struct _vect_peel_extended_info
1611 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1612 loop_vec_info loop_vinfo)
1614 struct _vect_peel_extended_info res;
1616 res.peel_info.dr_info = NULL;
1617 res.vinfo = loop_vinfo;
1619 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1621 res.inside_cost = INT_MAX;
1622 res.outside_cost = INT_MAX;
1623 peeling_htab->traverse <_vect_peel_extended_info *,
1624 vect_peeling_hash_get_lowest_cost> (&res);
1626 else
1628 res.peel_info.count = 0;
1629 peeling_htab->traverse <_vect_peel_extended_info *,
1630 vect_peeling_hash_get_most_frequent> (&res);
1631 res.inside_cost = 0;
1632 res.outside_cost = 0;
1635 return res;
1638 /* Return true if the new peeling NPEEL is supported. */
1640 static bool
1641 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1642 unsigned npeel)
1644 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1645 enum dr_alignment_support supportable_dr_alignment;
1647 /* Ensure that all data refs can be vectorized after the peel. */
1648 for (data_reference *dr : datarefs)
1650 int save_misalignment;
1652 if (dr == dr0_info->dr)
1653 continue;
1655 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1656 if (!vect_relevant_for_alignment_p (dr_info))
1657 continue;
1659 save_misalignment = dr_info->misalignment;
1660 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1661 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1662 supportable_dr_alignment
1663 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, false);
1664 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1666 if (!supportable_dr_alignment)
1667 return false;
1670 return true;
1673 /* Compare two data-references DRA and DRB to group them into chunks
1674 with related alignment. */
1676 static int
1677 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1679 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1680 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1681 int cmp;
1683 /* Stabilize sort. */
1684 if (dra == drb)
1685 return 0;
1687 /* Ordering of DRs according to base. */
1688 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1689 DR_BASE_ADDRESS (drb));
1690 if (cmp != 0)
1691 return cmp;
1693 /* And according to DR_OFFSET. */
1694 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1695 if (cmp != 0)
1696 return cmp;
1698 /* And after step. */
1699 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1700 if (cmp != 0)
1701 return cmp;
1703 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1704 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1705 if (cmp == 0)
1706 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1707 return cmp;
1710 /* Function vect_enhance_data_refs_alignment
1712 This pass will use loop versioning and loop peeling in order to enhance
1713 the alignment of data references in the loop.
1715 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1716 original loop is to be vectorized. Any other loops that are created by
1717 the transformations performed in this pass - are not supposed to be
1718 vectorized. This restriction will be relaxed.
1720 This pass will require a cost model to guide it whether to apply peeling
1721 or versioning or a combination of the two. For example, the scheme that
1722 Intel uses when given a loop with several memory accesses is as follows:
1723 choose one memory access ('p') whose alignment you want to force by doing
1724 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1725 other accesses are not necessarily aligned, or (2) use loop versioning to
1726 generate one loop in which all accesses are aligned, and another loop in
1727 which only 'p' is necessarily aligned.
1729 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1730 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1731 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1733 Devising a cost model is the most critical aspect of this work. It will
1734 guide us on which access to peel for, whether to use loop versioning, how
1735 many versions to create, etc. The cost model will probably consist of
1736 generic considerations as well as target specific considerations (on
1737 powerpc for example, misaligned stores are more painful than misaligned
1738 loads).
1740 Here are the general steps involved in alignment enhancements:
1742 -- original loop, before alignment analysis:
1743 for (i=0; i<N; i++){
1744 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1745 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1748 -- After vect_compute_data_refs_alignment:
1749 for (i=0; i<N; i++){
1750 x = q[i]; # DR_MISALIGNMENT(q) = 3
1751 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1754 -- Possibility 1: we do loop versioning:
1755 if (p is aligned) {
1756 for (i=0; i<N; i++){ # loop 1A
1757 x = q[i]; # DR_MISALIGNMENT(q) = 3
1758 p[i] = y; # DR_MISALIGNMENT(p) = 0
1761 else {
1762 for (i=0; i<N; i++){ # loop 1B
1763 x = q[i]; # DR_MISALIGNMENT(q) = 3
1764 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1768 -- Possibility 2: we do loop peeling:
1769 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1770 x = q[i];
1771 p[i] = y;
1773 for (i = 3; i < N; i++){ # loop 2A
1774 x = q[i]; # DR_MISALIGNMENT(q) = 0
1775 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1778 -- Possibility 3: combination of loop peeling and versioning:
1779 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1780 x = q[i];
1781 p[i] = y;
1783 if (p is aligned) {
1784 for (i = 3; i<N; i++){ # loop 3A
1785 x = q[i]; # DR_MISALIGNMENT(q) = 0
1786 p[i] = y; # DR_MISALIGNMENT(p) = 0
1789 else {
1790 for (i = 3; i<N; i++){ # loop 3B
1791 x = q[i]; # DR_MISALIGNMENT(q) = 0
1792 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1796 These loops are later passed to loop_transform to be vectorized. The
1797 vectorizer will use the alignment information to guide the transformation
1798 (whether to generate regular loads/stores, or with special handling for
1799 misalignment). */
1801 opt_result
1802 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1804 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1805 enum dr_alignment_support supportable_dr_alignment;
1806 dr_vec_info *first_store = NULL;
1807 dr_vec_info *dr0_info = NULL;
1808 struct data_reference *dr;
1809 unsigned int i;
1810 bool do_peeling = false;
1811 bool do_versioning = false;
1812 unsigned int npeel = 0;
1813 bool one_misalignment_known = false;
1814 bool one_misalignment_unknown = false;
1815 bool one_dr_unsupportable = false;
1816 dr_vec_info *unsupportable_dr_info = NULL;
1817 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1818 hash_table<peel_info_hasher> peeling_htab (1);
1820 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1822 /* Reset data so we can safely be called multiple times. */
1823 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1824 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1826 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1827 return opt_result::success ();
1829 /* Sort the vector of datarefs so DRs that have the same or dependent
1830 alignment are next to each other. */
1831 auto_vec<data_reference_p> datarefs
1832 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1833 datarefs.qsort (dr_align_group_sort_cmp);
1835 /* For each dataref, compute the number of DRs that will become
1836 aligned as well when we peel to align that dataref. */
1837 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1838 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1839 unsigned i0;
1840 for (i0 = 0; i0 < datarefs.length (); ++i0)
1841 if (DR_BASE_ADDRESS (datarefs[i0]))
1842 break;
1843 for (i = i0 + 1; i <= datarefs.length (); ++i)
1845 if (i == datarefs.length ()
1846 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1847 DR_BASE_ADDRESS (datarefs[i]), 0)
1848 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1849 DR_OFFSET (datarefs[i]), 0)
1850 || !operand_equal_p (DR_STEP (datarefs[i0]),
1851 DR_STEP (datarefs[i]), 0))
1853 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1854 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1855 will get known misalignment if we align one of the refs
1856 with the largest DR_TARGET_ALIGNMENT. */
1857 for (unsigned j = i0; j < i; ++j)
1859 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1860 for (unsigned k = i0; k < i; ++k)
1862 if (k == j)
1863 continue;
1864 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1865 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1866 dr_infoj))
1867 n_same_align_refs[j]++;
1870 i0 = i;
1874 /* While cost model enhancements are expected in the future, the high level
1875 view of the code at this time is as follows:
1877 A) If there is a misaligned access then see if peeling to align
1878 this access can make all data references satisfy
1879 vect_supportable_dr_alignment. If so, update data structures
1880 as needed and return true.
1882 B) If peeling wasn't possible and there is a data reference with an
1883 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1884 then see if loop versioning checks can be used to make all data
1885 references satisfy vect_supportable_dr_alignment. If so, update
1886 data structures as needed and return true.
1888 C) If neither peeling nor versioning were successful then return false if
1889 any data reference does not satisfy vect_supportable_dr_alignment.
1891 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1893 Note, Possibility 3 above (which is peeling and versioning together) is not
1894 being done at this time. */
1896 /* (1) Peeling to force alignment. */
1898 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1899 Considerations:
1900 + How many accesses will become aligned due to the peeling
1901 - How many accesses will become unaligned due to the peeling,
1902 and the cost of misaligned accesses.
1903 - The cost of peeling (the extra runtime checks, the increase
1904 in code size). */
1906 FOR_EACH_VEC_ELT (datarefs, i, dr)
1908 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1909 if (!vect_relevant_for_alignment_p (dr_info))
1910 continue;
1912 stmt_vec_info stmt_info = dr_info->stmt;
1913 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1914 supportable_dr_alignment
1915 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, true);
1916 do_peeling = vector_alignment_reachable_p (dr_info);
1917 if (do_peeling)
1919 if (known_alignment_for_access_p (dr_info, vectype))
1921 unsigned int npeel_tmp = 0;
1922 bool negative = tree_int_cst_compare (DR_STEP (dr),
1923 size_zero_node) < 0;
1925 /* If known_alignment_for_access_p then we have set
1926 DR_MISALIGNMENT, which is only done if we know it at compile
1927 time, so it is safe to assume the target alignment is constant.  */
1929 unsigned int target_align =
1930 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1931 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1932 unsigned int mis = dr_misalignment (dr_info, vectype);
1933 mis = negative ? mis : -mis;
1934 if (mis != 0)
1935 npeel_tmp = (mis & (target_align - 1)) / dr_size;
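/* Worked example: assuming a 16-byte target alignment, 4-byte elements
   and a known misalignment of 4 bytes on a positive-step access, MIS
   becomes (unsigned) -4, so (mis & 15) = 12 and npeel_tmp = 12 / 4 = 3;
   peeling three scalar iterations advances the access by 12 bytes and
   makes it 16-byte aligned.  */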
1937 /* For multiple types, it is possible that the bigger type access
1938 will have more than one peeling option. E.g., a loop with two
1939 types: one of size (vector size / 4), and the other one of
1940 size (vector size / 8). Vectorization factor will be 8. If both
1941 accesses are misaligned by 3, the first one needs one scalar
1942 iteration to be aligned, and the second one needs 5. But the
1943 first one will be aligned also by peeling 5 scalar
1944 iterations, and in that case both accesses will be aligned.
1945 Hence, except for the immediate peeling amount, we also want
1946 to try to add full vector size, while we don't exceed
1947 vectorization factor.
1948 We do this automatically for cost model, since we calculate
1949 cost for every peeling option. */
1950 poly_uint64 nscalars = npeel_tmp;
1951 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1953 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1954 nscalars = (STMT_SLP_TYPE (stmt_info)
1955 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1958 /* Save info about DR in the hash table. Also include peeling
1959 amounts according to the explanation above. */
1960 while (known_le (npeel_tmp, nscalars))
1962 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1963 dr_info, npeel_tmp);
1964 npeel_tmp += MAX (1, target_align / dr_size);
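/* E.g. with a 16-byte target alignment and 4-byte elements, successive
   candidates differ by MAX (1, 16 / 4) = 4 scalar iterations, so an
   initial amount of 3 gives the candidates 3, 7, 11, ... up to
   NSCALARS.  */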
1967 one_misalignment_known = true;
1969 else
1971 /* If we don't know any misalignment values, we prefer
1972 peeling for the data-ref that has the maximum number of data-refs
1973 with the same alignment, unless the target prefers to align
1974 stores over loads. */
1975 unsigned same_align_drs = n_same_align_refs[i];
1976 if (!dr0_info
1977 || dr0_same_align_drs < same_align_drs)
1979 dr0_same_align_drs = same_align_drs;
1980 dr0_info = dr_info;
1982 /* For data-refs with the same number of related
1983 accesses prefer the one where the misalign
1984 computation will be invariant in the outermost loop. */
1985 else if (dr0_same_align_drs == same_align_drs)
1987 class loop *ivloop0, *ivloop;
1988 ivloop0 = outermost_invariant_loop_for_expr
1989 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1990 ivloop = outermost_invariant_loop_for_expr
1991 (loop, DR_BASE_ADDRESS (dr));
1992 if ((ivloop && !ivloop0)
1993 || (ivloop && ivloop0
1994 && flow_loop_nested_p (ivloop, ivloop0)))
1995 dr0_info = dr_info;
1998 one_misalignment_unknown = true;
2000 /* Check for data refs with unsupportable alignment that
2001 can be peeled. */
2002 if (!supportable_dr_alignment)
2004 one_dr_unsupportable = true;
2005 unsupportable_dr_info = dr_info;
2008 if (!first_store && DR_IS_WRITE (dr))
2010 first_store = dr_info;
2011 first_store_same_align_drs = same_align_drs;
2015 else
2017 if (!aligned_access_p (dr_info, vectype))
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021 "vector alignment may not be reachable\n");
2022 break;
2027 /* Check if we can possibly peel the loop. */
2028 if (!vect_can_advance_ivs_p (loop_vinfo)
2029 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2030 || loop->inner)
2031 do_peeling = false;
2033 struct _vect_peel_extended_info peel_for_known_alignment;
2034 struct _vect_peel_extended_info peel_for_unknown_alignment;
2035 struct _vect_peel_extended_info best_peel;
2037 peel_for_unknown_alignment.inside_cost = INT_MAX;
2038 peel_for_unknown_alignment.outside_cost = INT_MAX;
2039 peel_for_unknown_alignment.peel_info.count = 0;
2041 if (do_peeling
2042 && one_misalignment_unknown)
2044 /* Check if the target requires us to prefer stores over loads, i.e., if
2045 misaligned stores are more expensive than misaligned loads (taking
2046 drs with the same alignment into account). */
2047 unsigned int load_inside_cost = 0;
2048 unsigned int load_outside_cost = 0;
2049 unsigned int store_inside_cost = 0;
2050 unsigned int store_outside_cost = 0;
2051 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2053 stmt_vector_for_cost dummy;
2054 dummy.create (2);
2055 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2056 &load_inside_cost,
2057 &load_outside_cost,
2058 &dummy, &dummy, estimated_npeels, true);
2059 dummy.release ();
2061 if (first_store)
2063 dummy.create (2);
2064 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2065 &store_inside_cost,
2066 &store_outside_cost,
2067 &dummy, &dummy,
2068 estimated_npeels, true);
2069 dummy.release ();
2071 else
2073 store_inside_cost = INT_MAX;
2074 store_outside_cost = INT_MAX;
2077 if (load_inside_cost > store_inside_cost
2078 || (load_inside_cost == store_inside_cost
2079 && load_outside_cost > store_outside_cost))
2081 dr0_info = first_store;
2082 dr0_same_align_drs = first_store_same_align_drs;
2083 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2084 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2086 else
2088 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2089 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2092 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2093 prologue_cost_vec.create (2);
2094 epilogue_cost_vec.create (2);
2096 int dummy2;
2097 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2098 (loop_vinfo, estimated_npeels, &dummy2,
2099 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2100 &prologue_cost_vec, &epilogue_cost_vec);
2102 prologue_cost_vec.release ();
2103 epilogue_cost_vec.release ();
2105 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2108 peel_for_unknown_alignment.peel_info.npeel = 0;
2109 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2111 best_peel = peel_for_unknown_alignment;
2113 peel_for_known_alignment.inside_cost = INT_MAX;
2114 peel_for_known_alignment.outside_cost = INT_MAX;
2115 peel_for_known_alignment.peel_info.count = 0;
2116 peel_for_known_alignment.peel_info.dr_info = NULL;
2118 if (do_peeling && one_misalignment_known)
2120 /* Peeling is possible, but there is no data access that is not supported
2121 unless aligned. So we try to choose the best possible peeling from
2122 the hash table. */
2123 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2124 (&peeling_htab, loop_vinfo);
2127 /* Compare costs of peeling for known and unknown alignment. */
2128 if (peel_for_known_alignment.peel_info.dr_info != NULL
2129 && peel_for_unknown_alignment.inside_cost
2130 >= peel_for_known_alignment.inside_cost)
2132 best_peel = peel_for_known_alignment;
2134 /* If the best peeling for known alignment has NPEEL == 0, perform no
2135 peeling at all except if there is an unsupportable dr that we can
2136 align. */
2137 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2138 do_peeling = false;
2141 /* If there is an unsupportable data ref, prefer this over all choices so far
2142 since we'd have to discard a chosen peeling except when it accidentally
2143 aligned the unsupportable data ref. */
2144 if (one_dr_unsupportable)
2145 dr0_info = unsupportable_dr_info;
2146 else if (do_peeling)
2148 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2149 TODO: Use nopeel_outside_cost or get rid of it? */
2150 unsigned nopeel_inside_cost = 0;
2151 unsigned nopeel_outside_cost = 0;
2153 stmt_vector_for_cost dummy;
2154 dummy.create (2);
2155 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2156 &nopeel_outside_cost, &dummy, &dummy,
2157 0, false);
2158 dummy.release ();
2160 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2161 costs will be recorded. */
2162 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2163 prologue_cost_vec.create (2);
2164 epilogue_cost_vec.create (2);
2166 int dummy2;
2167 nopeel_outside_cost += vect_get_known_peeling_cost
2168 (loop_vinfo, 0, &dummy2,
2169 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2170 &prologue_cost_vec, &epilogue_cost_vec);
2172 prologue_cost_vec.release ();
2173 epilogue_cost_vec.release ();
2175 npeel = best_peel.peel_info.npeel;
2176 dr0_info = best_peel.peel_info.dr_info;
2178 /* If not peeling at all is no more expensive than the best peeling
2179 found so far, don't perform any peeling. */
2180 if (nopeel_inside_cost <= best_peel.inside_cost)
2181 do_peeling = false;
2184 if (do_peeling)
2186 stmt_vec_info stmt_info = dr0_info->stmt;
2187 if (known_alignment_for_access_p (dr0_info,
2188 STMT_VINFO_VECTYPE (stmt_info)))
2190 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2191 size_zero_node) < 0;
2192 if (!npeel)
2194 /* Since it's known at compile time, compute the number of
2195 iterations in the peeled loop (the peeling factor) for use in
2196 updating DR_MISALIGNMENT values. The peeling factor is the
2197 vectorization factor minus the misalignment as an element
2198 count. */
2199 unsigned int mis
2200 = dr_misalignment (dr0_info, STMT_VINFO_VECTYPE (stmt_info));
2201 mis = negative ? mis : -mis;
2202 /* If known_alignment_for_access_p then we have set
2203 DR_MISALIGNMENT, which is only done if we know it at compile
2204 time, so it is safe to assume the target alignment is constant.  */
2206 unsigned int target_align =
2207 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2208 npeel = ((mis & (target_align - 1))
2209 / vect_get_scalar_dr_size (dr0_info));
2212 /* For interleaved data access every iteration accesses all the
2213 members of the group, therefore we divide the number of iterations
2214 by the group size. */
2215 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2216 npeel /= DR_GROUP_SIZE (stmt_info);
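/* E.g. for a group of size 2 a misalignment of 4 elements is fixed by
   peeling 2 scalar iterations, since every scalar iteration of the
   group advances the address by 2 elements.  */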
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_NOTE, vect_location,
2220 "Try peeling by %d\n", npeel);
2223 /* Ensure that all datarefs can be vectorized after the peel. */
2224 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2225 do_peeling = false;
2227 /* Check if all datarefs are supportable and log. */
2228 if (do_peeling
2229 && npeel == 0
2230 && known_alignment_for_access_p (dr0_info,
2231 STMT_VINFO_VECTYPE (stmt_info)))
2232 return opt_result::success ();
2234 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2235 if (do_peeling)
2237 unsigned max_allowed_peel
2238 = param_vect_max_peeling_for_alignment;
2239 if (flag_vect_cost_model <= VECT_COST_MODEL_CHEAP)
2240 max_allowed_peel = 0;
2241 if (max_allowed_peel != (unsigned)-1)
2243 unsigned max_peel = npeel;
2244 if (max_peel == 0)
2246 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2247 unsigned HOST_WIDE_INT target_align_c;
2248 if (target_align.is_constant (&target_align_c))
2249 max_peel =
2250 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2251 else
2253 do_peeling = false;
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "Disable peeling, max peels set and vector"
2257 " alignment unknown\n");
2260 if (max_peel > max_allowed_peel)
2262 do_peeling = false;
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "Disable peeling, max peels reached: %d\n", max_peel);
2270 /* Cost model #2 - if peeling may result in a remaining loop not
2271 iterating enough to be vectorized then do not peel. Since this
2272 is a cost heuristic rather than a correctness decision, use the
2273 most likely runtime value for variable vectorization factors. */
2274 if (do_peeling
2275 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2277 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2278 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2279 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2280 < assumed_vf + max_peel)
2281 do_peeling = false;
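/* Illustration: with an assumed VF of 4 and an unknown peel amount
   (npeel == 0, hence max_peel = assumed_vf - 1 = 3) the loop needs at
   least 4 + 3 = 7 known iterations for peeling to stay enabled.  */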
2284 if (do_peeling)
2286 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2287 If the misalignment of DR_i is identical to that of dr0 then set
2288 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2289 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2290 by the peeling factor times the element size of DR_i (MOD the
2291 vectorization factor times the size). Otherwise, the
2292 misalignment of DR_i must be set to unknown. */
2293 FOR_EACH_VEC_ELT (datarefs, i, dr)
2294 if (dr != dr0_info->dr)
2296 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2297 if (!vect_relevant_for_alignment_p (dr_info))
2298 continue;
2300 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2303 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2304 if (npeel)
2305 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2306 else
2307 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2308 SET_DR_MISALIGNMENT (dr0_info,
2309 vect_dr_misalign_for_aligned_access (dr0_info));
2310 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "Alignment of access forced using peeling.\n");
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "Peeling for alignment will be applied.\n");
2318 /* The inside-loop cost will be accounted for in vectorizable_load
2319 and vectorizable_store correctly with adjusted alignments.
2320 Drop the body_cst_vec on the floor here. */
2321 return opt_result::success ();
2325 /* (2) Versioning to force alignment. */
2327 /* Try versioning if:
2328 1) optimize loop for speed and the cost-model is not cheap
2329 2) there is at least one unsupported misaligned data ref with an unknown
2330 misalignment, and
2331 3) all misaligned data refs with a known misalignment are supported, and
2332 4) the number of runtime alignment checks is within reason. */
2334 do_versioning
2335 = (optimize_loop_nest_for_speed_p (loop)
2336 && !loop->inner /* FORNOW */
2337 && flag_vect_cost_model > VECT_COST_MODEL_CHEAP);
2339 if (do_versioning)
2341 FOR_EACH_VEC_ELT (datarefs, i, dr)
2343 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2344 stmt_vec_info stmt_info = dr_info->stmt;
2345 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2346 if (aligned_access_p (dr_info, vectype)
2347 || !vect_relevant_for_alignment_p (dr_info))
2348 continue;
2350 if (STMT_VINFO_STRIDED_P (stmt_info))
2352 do_versioning = false;
2353 break;
2356 supportable_dr_alignment
2357 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2358 false);
2359 if (!supportable_dr_alignment)
2361 if (known_alignment_for_access_p (dr_info, vectype)
2362 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2363 >= (unsigned) param_vect_max_version_for_alignment_checks)
2365 do_versioning = false;
2366 break;
2369 /* At present we don't support versioning for alignment
2370 with variable VF, since there's no guarantee that the
2371 VF is a power of two. We could relax this if we added
2372 a way of enforcing a power-of-two size. */
2373 unsigned HOST_WIDE_INT size;
2374 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2376 do_versioning = false;
2377 break;
2380 /* Forcing alignment in the first iteration is no good if
2381 we don't keep it across iterations. For now, just disable
2382 versioning in this case.
2383 ??? We could actually unroll the loop to achieve the required
2384 overall step alignment, and forcing the alignment could be
2385 done by doing some iterations of the non-vectorized loop. */
2386 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2387 * DR_STEP_ALIGNMENT (dr),
2388 DR_TARGET_ALIGNMENT (dr_info)))
2390 do_versioning = false;
2391 break;
2394 /* The rightmost bits of an aligned address must be zeros.
2395 Construct the mask needed for this test. For example,
2396 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2397 mask must be 15 = 0xf. */
2398 int mask = size - 1;
2400 /* FORNOW: use the same mask to test all potentially unaligned
2401 references in the loop. */
2402 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2403 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2405 do_versioning = false;
2406 break;
2409 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2410 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2414 /* Versioning requires at least one misaligned data reference. */
2415 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2416 do_versioning = false;
2417 else if (!do_versioning)
2418 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2421 if (do_versioning)
2423 const vec<stmt_vec_info> &may_misalign_stmts
2424 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2425 stmt_vec_info stmt_info;
2427 /* It can now be assumed that the data references in the statements
2428 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2429 of the loop being vectorized. */
2430 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2432 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2433 SET_DR_MISALIGNMENT (dr_info,
2434 vect_dr_misalign_for_aligned_access (dr_info));
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE, vect_location,
2437 "Alignment of access forced using versioning.\n");
2440 if (dump_enabled_p ())
2441 dump_printf_loc (MSG_NOTE, vect_location,
2442 "Versioning for alignment will be applied.\n");
2444 /* Peeling and versioning can't be done together at this time. */
2445 gcc_assert (! (do_peeling && do_versioning));
2447 return opt_result::success ();
2450 /* This point is reached if neither peeling nor versioning is being done. */
2451 gcc_assert (! (do_peeling || do_versioning));
2453 return opt_result::success ();
2457 /* Function vect_analyze_data_refs_alignment
2459 Analyze the alignment of the data-references in the loop.
2460 Return FALSE if a data reference is found that cannot be vectorized. */
2462 opt_result
2463 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2465 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2467 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2468 struct data_reference *dr;
2469 unsigned int i;
2471 vect_record_base_alignments (loop_vinfo);
2472 FOR_EACH_VEC_ELT (datarefs, i, dr)
2474 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2475 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2477 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2478 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2479 continue;
2480 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2481 STMT_VINFO_VECTYPE (dr_info->stmt));
2485 return opt_result::success ();
2489 /* Analyze alignment of DRs of stmts in NODE. */
2491 static bool
2492 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2494 /* Alignment is maintained in the first element of the group. */
2495 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2496 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2497 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2498 tree vectype = SLP_TREE_VECTYPE (node);
2499 poly_uint64 vector_alignment
2500 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2501 BITS_PER_UNIT);
2502 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2503 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2504 /* Re-analyze alignment when we're facing a vectorization with a bigger
2505 alignment requirement. */
2506 else if (known_lt (dr_info->target_alignment, vector_alignment))
2508 poly_uint64 old_target_alignment = dr_info->target_alignment;
2509 int old_misalignment = dr_info->misalignment;
2510 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2511 /* But keep knowledge about a smaller alignment. */
2512 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2513 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2515 dr_info->target_alignment = old_target_alignment;
2516 dr_info->misalignment = old_misalignment;
2519 /* If we ever face unordered target alignments, the first one wins in terms
2520 of the analysis and the other will become unknown in dr_misalignment. */
2521 return true;
2524 /* Function vect_slp_analyze_instance_alignment
2526 Analyze the alignment of the data-references in the SLP instance.
2527 Return FALSE if a data reference is found that cannot be vectorized. */
2529 bool
2530 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2531 slp_instance instance)
2533 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2535 slp_tree node;
2536 unsigned i;
2537 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2538 if (! vect_slp_analyze_node_alignment (vinfo, node))
2539 return false;
2541 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2542 && ! vect_slp_analyze_node_alignment
2543 (vinfo, SLP_INSTANCE_TREE (instance)))
2544 return false;
2546 return true;
2550 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2551 accesses of legal size, step, etc. Detect gaps, single element
2552 interleaving, and other special cases. Set grouped access info.
2553 Collect groups of strided stores for further use in SLP analysis.
2554 Worker for vect_analyze_group_access. */
2556 static bool
2557 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2559 data_reference *dr = dr_info->dr;
2560 tree step = DR_STEP (dr);
2561 tree scalar_type = TREE_TYPE (DR_REF (dr));
2562 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2563 stmt_vec_info stmt_info = dr_info->stmt;
2564 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2565 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2566 HOST_WIDE_INT dr_step = -1;
2567 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2568 bool slp_impossible = false;
2570 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2571 size of the interleaving group (including gaps). */
2572 if (tree_fits_shwi_p (step))
2574 dr_step = tree_to_shwi (step);
2575 /* Check that STEP is a multiple of type size. Otherwise there is
2576 a non-element-sized gap at the end of the group which we
2577 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2578 ??? As we can handle non-constant step fine here we should
2579 simply remove uses of DR_GROUP_GAP between the last and first
2580 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2581 simply not include that gap. */
2582 if ((dr_step % type_size) != 0)
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "Step %T is not a multiple of the element size"
2587 " for %T\n",
2588 step, DR_REF (dr));
2589 return false;
2591 groupsize = absu_hwi (dr_step) / type_size;
2593 else
2594 groupsize = 0;
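/* At this point GROUPSIZE is the step counted in elements when the step
   is a compile-time constant, e.g. 4-byte elements accessed with a
   32-byte step give groupsize = 8; otherwise it is zero.  */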
2596 /* A non-consecutive access is possible only if it is a part of interleaving. */
2597 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2599 /* Check if this DR is a part of interleaving, and is a single
2600 element of the group that is accessed in the loop. */
2602 /* Gaps are supported only for loads. STEP must be a multiple of the type
2603 size. */
2604 if (DR_IS_READ (dr)
2605 && (dr_step % type_size) == 0
2606 && groupsize > 0
2607 /* This could be UINT_MAX but as we are generating code in a very
2608 inefficient way we have to cap earlier.
2609 See PR91403 for example. */
2610 && groupsize <= 4096)
2612 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2613 DR_GROUP_SIZE (stmt_info) = groupsize;
2614 DR_GROUP_GAP (stmt_info) = groupsize - 1;
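/* E.g. a load of a[4*i] from an array of 4-byte ints has a step of 16
   bytes, so groupsize = 4 and the three elements skipped in every
   iteration are recorded as DR_GROUP_GAP = 3.  */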
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_NOTE, vect_location,
2617 "Detected single element interleaving %T"
2618 " step %T\n",
2619 DR_REF (dr), step);
2621 return true;
2624 if (dump_enabled_p ())
2625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626 "not consecutive access %G", stmt_info->stmt);
2628 if (bb_vinfo)
2630 /* Mark the statement as unvectorizable. */
2631 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2632 return true;
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2637 STMT_VINFO_STRIDED_P (stmt_info) = true;
2638 return true;
2641 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2643 /* First stmt in the interleaving chain. Check the chain. */
2644 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2645 struct data_reference *data_ref = dr;
2646 unsigned int count = 1;
2647 tree prev_init = DR_INIT (data_ref);
2648 HOST_WIDE_INT diff, gaps = 0;
2650 /* By construction, all group members have INTEGER_CST DR_INITs. */
2651 while (next)
2653 /* We never have the same DR multiple times. */
2654 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2655 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2657 data_ref = STMT_VINFO_DATA_REF (next);
2659 /* All group members have the same STEP by construction. */
2660 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2662 /* Check that the distance between two accesses is equal to the type
2663 size. Otherwise, we have gaps. */
2664 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2665 - TREE_INT_CST_LOW (prev_init)) / type_size;
2666 if (diff != 1)
2668 /* FORNOW: SLP of accesses with gaps is not supported. */
2669 slp_impossible = true;
2670 if (DR_IS_WRITE (data_ref))
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2674 "interleaved store with gaps\n");
2675 return false;
2678 gaps += diff - 1;
2681 last_accessed_element += diff;
2683 /* Store the gap from the previous member of the group. If there is no
2684 gap in the access, DR_GROUP_GAP is always 1. */
2685 DR_GROUP_GAP (next) = diff;
2687 prev_init = DR_INIT (data_ref);
2688 next = DR_GROUP_NEXT_ELEMENT (next);
2689 /* Count the number of data-refs in the chain. */
2690 count++;
2693 if (groupsize == 0)
2694 groupsize = count + gaps;
2696 /* This could be UINT_MAX but as we are generating code in a very
2697 inefficient way we have to cap earlier. See PR78699 for example. */
2698 if (groupsize > 4096)
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "group is too large\n");
2703 return false;
2706 /* Check that the size of the interleaving is equal to count for stores,
2707 i.e., that there are no gaps. */
2708 if (groupsize != count
2709 && !DR_IS_READ (dr))
2711 groupsize = count;
2712 STMT_VINFO_STRIDED_P (stmt_info) = true;
2715 /* If there is a gap after the last load in the group it is the
2716 difference between the groupsize and the last accessed
2717 element.
2718 When there is no gap, this difference should be 0. */
2719 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
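/* E.g. a group loading elements 0 and 1 out of a stride of 8 elements
   has last_accessed_element = 2, so DR_GROUP_GAP of the first stmt
   becomes 8 - 2 = 6.  */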
2721 DR_GROUP_SIZE (stmt_info) = groupsize;
2722 if (dump_enabled_p ())
2724 dump_printf_loc (MSG_NOTE, vect_location,
2725 "Detected interleaving ");
2726 if (DR_IS_READ (dr))
2727 dump_printf (MSG_NOTE, "load ");
2728 else if (STMT_VINFO_STRIDED_P (stmt_info))
2729 dump_printf (MSG_NOTE, "strided store ");
2730 else
2731 dump_printf (MSG_NOTE, "store ");
2732 dump_printf (MSG_NOTE, "of size %u\n",
2733 (unsigned)groupsize);
2734 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2735 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2736 while (next)
2738 if (DR_GROUP_GAP (next) != 1)
2739 dump_printf_loc (MSG_NOTE, vect_location,
2740 "\t<gap of %d elements>\n",
2741 DR_GROUP_GAP (next) - 1);
2742 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2743 next = DR_GROUP_NEXT_ELEMENT (next);
2745 if (DR_GROUP_GAP (stmt_info) != 0)
2746 dump_printf_loc (MSG_NOTE, vect_location,
2747 "\t<gap of %d elements>\n",
2748 DR_GROUP_GAP (stmt_info));
2751 /* SLP: create an SLP data structure for every interleaving group of
2752 stores for further analysis in vect_analyze_slp. */
2753 if (DR_IS_WRITE (dr) && !slp_impossible)
2755 if (loop_vinfo)
2756 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2757 if (bb_vinfo)
2758 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2762 return true;
2765 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2766 accesses of legal size, step, etc. Detect gaps, single element
2767 interleaving, and other special cases. Set grouped access info.
2768 Collect groups of strided stores for further use in SLP analysis. */
2770 static bool
2771 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2773 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2775 /* Dissolve the group if present. */
2776 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2777 while (stmt_info)
2779 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2780 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2781 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2782 stmt_info = next;
2784 return false;
2786 return true;
2789 /* Analyze the access pattern of the data-reference DR_INFO.
2790 In case of non-consecutive accesses call vect_analyze_group_access() to
2791 analyze groups of accesses. */
2793 static bool
2794 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2796 data_reference *dr = dr_info->dr;
2797 tree step = DR_STEP (dr);
2798 tree scalar_type = TREE_TYPE (DR_REF (dr));
2799 stmt_vec_info stmt_info = dr_info->stmt;
2800 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2801 class loop *loop = NULL;
2803 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2804 return true;
2806 if (loop_vinfo)
2807 loop = LOOP_VINFO_LOOP (loop_vinfo);
2809 if (loop_vinfo && !step)
2811 if (dump_enabled_p ())
2812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2813 "bad data-ref access in loop\n");
2814 return false;
2817 /* Allow loads with zero step in inner-loop vectorization. */
2818 if (loop_vinfo && integer_zerop (step))
2820 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2821 if (!nested_in_vect_loop_p (loop, stmt_info))
2822 return DR_IS_READ (dr);
2823 /* Allow references with zero step for outer loops marked
2824 with pragma omp simd only - it guarantees absence of
2825 loop-carried dependencies between inner loop iterations. */
2826 if (loop->safelen < 2)
2828 if (dump_enabled_p ())
2829 dump_printf_loc (MSG_NOTE, vect_location,
2830 "zero step in inner loop of nest\n");
2831 return false;
2835 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2837 /* Interleaved accesses are not yet supported within outer-loop
2838 vectorization for references in the inner-loop. */
2839 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2841 /* For the rest of the analysis we use the outer-loop step. */
2842 step = STMT_VINFO_DR_STEP (stmt_info);
2843 if (integer_zerop (step))
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_NOTE, vect_location,
2847 "zero step in outer loop.\n");
2848 return DR_IS_READ (dr);
2852 /* Consecutive? */
2853 if (TREE_CODE (step) == INTEGER_CST)
2855 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2856 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2857 || (dr_step < 0
2858 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2860 /* Mark that it is not interleaving. */
2861 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2862 return true;
2866 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2868 if (dump_enabled_p ())
2869 dump_printf_loc (MSG_NOTE, vect_location,
2870 "grouped access in outer loop.\n");
2871 return false;
2875 /* Assume this is a DR handled by non-constant strided load case. */
2876 if (TREE_CODE (step) != INTEGER_CST)
2877 return (STMT_VINFO_STRIDED_P (stmt_info)
2878 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2879 || vect_analyze_group_access (vinfo, dr_info)));
2881 /* Not a consecutive access - check if it's a part of an interleaving group. */
2882 return vect_analyze_group_access (vinfo, dr_info);
2885 /* Compare two data-references DRA and DRB to group them into chunks
2886 suitable for forming interleaving groups. */
2888 static int
2889 dr_group_sort_cmp (const void *dra_, const void *drb_)
2891 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2892 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2893 data_reference_p dra = dra_info->dr;
2894 data_reference_p drb = drb_info->dr;
2895 int cmp;
2897 /* Stabilize sort. */
2898 if (dra == drb)
2899 return 0;
2901 /* DRs with different group IDs never belong to the same group. */
2902 if (dra_info->group != drb_info->group)
2903 return dra_info->group < drb_info->group ? -1 : 1;
2905 /* Ordering of DRs according to base. */
2906 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2907 DR_BASE_ADDRESS (drb));
2908 if (cmp != 0)
2909 return cmp;
2911 /* And according to DR_OFFSET. */
2912 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2913 if (cmp != 0)
2914 return cmp;
2916 /* Put reads before writes. */
2917 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2918 return DR_IS_READ (dra) ? -1 : 1;
2920 /* Then sort after access size. */
2921 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2922 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2923 if (cmp != 0)
2924 return cmp;
2926 /* And after step. */
2927 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2928 if (cmp != 0)
2929 return cmp;
2931 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2932 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2933 if (cmp == 0)
2934 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2935 return cmp;
2938 /* If OP is the result of a conversion, return the unconverted value,
2939 otherwise return null. */
2941 static tree
2942 strip_conversion (tree op)
2944 if (TREE_CODE (op) != SSA_NAME)
2945 return NULL_TREE;
2946 gimple *stmt = SSA_NAME_DEF_STMT (op);
2947 if (!is_gimple_assign (stmt)
2948 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2949 return NULL_TREE;
2950 return gimple_assign_rhs1 (stmt);
2953 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2954 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2955 be grouped in SLP mode. */
2957 static bool
2958 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2959 bool allow_slp_p)
2961 if (gimple_assign_single_p (stmt1_info->stmt))
2962 return gimple_assign_single_p (stmt2_info->stmt);
2964 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2965 if (call1 && gimple_call_internal_p (call1))
2967 /* Check for two masked loads or two masked stores. */
2968 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2969 if (!call2 || !gimple_call_internal_p (call2))
2970 return false;
2971 internal_fn ifn = gimple_call_internal_fn (call1);
2972 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2973 return false;
2974 if (ifn != gimple_call_internal_fn (call2))
2975 return false;
2977 /* Check that the masks are the same. Cope with casts of masks,
2978 like those created by build_mask_conversion. */
2979 tree mask1 = gimple_call_arg (call1, 2);
2980 tree mask2 = gimple_call_arg (call2, 2);
2981 if (!operand_equal_p (mask1, mask2, 0)
2982 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2984 mask1 = strip_conversion (mask1);
2985 if (!mask1)
2986 return false;
2987 mask2 = strip_conversion (mask2);
2988 if (!mask2)
2989 return false;
2990 if (!operand_equal_p (mask1, mask2, 0))
2991 return false;
2993 return true;
2996 return false;
2999 /* Function vect_analyze_data_ref_accesses.
3001 Analyze the access pattern of all the data references in the loop.
3003 FORNOW: the only access pattern that is considered vectorizable is a
3004 simple step 1 (consecutive) access.
3006 FORNOW: handle only arrays and pointer accesses. */
3008 opt_result
3009 vect_analyze_data_ref_accesses (vec_info *vinfo,
3010 vec<int> *dataref_groups)
3012 unsigned int i;
3013 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3015 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3017 if (datarefs.is_empty ())
3018 return opt_result::success ();
3020 /* Sort the array of datarefs to make building the interleaving chains
3021 linear. Don't modify the original vector's order, it is needed for
3022 determining what dependencies are reversed. */
3023 vec<dr_vec_info *> datarefs_copy;
3024 datarefs_copy.create (datarefs.length ());
3025 for (unsigned i = 0; i < datarefs.length (); i++)
3027 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3028 /* If the caller computed DR grouping use that, otherwise group by
3029 basic blocks. */
3030 if (dataref_groups)
3031 dr_info->group = (*dataref_groups)[i];
3032 else
3033 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3034 datarefs_copy.quick_push (dr_info);
3036 datarefs_copy.qsort (dr_group_sort_cmp);
3037 hash_set<stmt_vec_info> to_fixup;
3039 /* Build the interleaving chains. */
3040 for (i = 0; i < datarefs_copy.length () - 1;)
3042 dr_vec_info *dr_info_a = datarefs_copy[i];
3043 data_reference_p dra = dr_info_a->dr;
3044 int dra_group_id = dr_info_a->group;
3045 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3046 stmt_vec_info lastinfo = NULL;
3047 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3048 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3050 ++i;
3051 continue;
3053 for (i = i + 1; i < datarefs_copy.length (); ++i)
3055 dr_vec_info *dr_info_b = datarefs_copy[i];
3056 data_reference_p drb = dr_info_b->dr;
3057 int drb_group_id = dr_info_b->group;
3058 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3059 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3060 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3061 break;
3063 /* ??? Imperfect sorting (non-compatible types, non-modulo
3064 accesses, same accesses) can lead to a group being artificially
3065 split here as we don't just skip over those. If it really
3066 matters we can push those to a worklist and re-iterate
3067 over them. Then we can just skip ahead to the next DR here. */
3069 /* DRs in a different DR group should not be put into the same
3070 interleaving group. */
3071 if (dra_group_id != drb_group_id)
3072 break;
3074 /* Check that the data-refs have same first location (except init)
3075 and they are both either store or load (not load and store,
3076 not masked loads or stores). */
3077 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3078 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3079 DR_BASE_ADDRESS (drb)) != 0
3080 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3081 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3082 break;
3084 /* Check that the data-refs have the same constant size. */
3085 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3086 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3087 if (!tree_fits_uhwi_p (sza)
3088 || !tree_fits_uhwi_p (szb)
3089 || !tree_int_cst_equal (sza, szb))
3090 break;
3092 /* Check that the data-refs have the same step. */
3093 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3094 break;
3096 /* Check the types are compatible.
3097 ??? We don't distinguish this during sorting. */
3098 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3099 TREE_TYPE (DR_REF (drb))))
3100 break;
3102 /* Check that the DR_INITs are compile-time constants. */
3103 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3104 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3105 break;
3107 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3108 just hold extra information. */
3109 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3110 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3111 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3112 break;
3114 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3115 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3116 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3117 HOST_WIDE_INT init_prev
3118 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3119 gcc_assert (init_a <= init_b
3120 && init_a <= init_prev
3121 && init_prev <= init_b);
3123 /* Do not place the same access in the interleaving chain twice. */
3124 if (init_b == init_prev)
3126 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3127 < gimple_uid (DR_STMT (drb)));
3128 /* Simply link in duplicates and fix up the chain below. */
3130 else
3132 /* If init_b == init_a + the size of the type * k, we have an
3133 interleaving, and DRA is accessed before DRB. */
3134 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3135 if (type_size_a == 0
3136 || (init_b - init_a) % type_size_a != 0)
3137 break;
3139 /* If we have a store, the accesses are adjacent. This splits
3140 groups into chunks we support (we don't support vectorization
3141 of stores with gaps). */
3142 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3143 break;
3145 /* If the step (when constant and nonzero) is smaller than the
3146 difference between the data-refs' inits, this splits groups into
3147 suitable sizes. */
3148 if (tree_fits_shwi_p (DR_STEP (dra)))
3150 unsigned HOST_WIDE_INT step
3151 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3152 if (step != 0
3153 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3154 break;
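/* E.g. with a constant step of 4 bytes and inits 0 and 8 we have
   4 <= 8, so DRB is not appended to DRA's chain and instead starts
   a new group.  */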
3158 if (dump_enabled_p ())
3159 dump_printf_loc (MSG_NOTE, vect_location,
3160 DR_IS_READ (dra)
3161 ? "Detected interleaving load %T and %T\n"
3162 : "Detected interleaving store %T and %T\n",
3163 DR_REF (dra), DR_REF (drb));
3165 /* Link the found element into the group list. */
3166 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3168 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3169 lastinfo = stmtinfo_a;
3171 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3172 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3173 lastinfo = stmtinfo_b;
3175 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3176 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3178 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3179 dump_printf_loc (MSG_NOTE, vect_location,
3180 "Load suitable for SLP vectorization only.\n");
3182 if (init_b == init_prev
3183 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3184 && dump_enabled_p ())
3185 dump_printf_loc (MSG_NOTE, vect_location,
3186 "Queuing group with duplicate access for fixup\n");
3190 /* Fixup groups with duplicate entries by splitting them. */
3191 while (1)
3193 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3194 if (!(it != to_fixup.end ()))
3195 break;
3196 stmt_vec_info grp = *it;
3197 to_fixup.remove (grp);
3199 /* Find the earliest duplicate group member. */
3200 unsigned first_duplicate = -1u;
3201 stmt_vec_info next, g = grp;
3202 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3204 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3205 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3206 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3207 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3208 g = next;
3210 if (first_duplicate == -1U)
3211 continue;
3213 /* Then move all stmts after the first duplicate to a new group.
3214 Note this is a heuristic but one with the property that *it
3215 is fixed up completely. */
3216 g = grp;
3217 stmt_vec_info newgroup = NULL, ng = grp;
3218 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3220 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3222 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3223 if (!newgroup)
3224 newgroup = next;
3225 else
3226 DR_GROUP_NEXT_ELEMENT (ng) = next;
3227 ng = next;
3228 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3230 else
3231 g = DR_GROUP_NEXT_ELEMENT (g);
3233 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3235 /* Fixup the new group which still may contain duplicates. */
3236 to_fixup.add (newgroup);
3239 dr_vec_info *dr_info;
3240 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3242 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3243 && !vect_analyze_data_ref_access (vinfo, dr_info))
3245 if (dump_enabled_p ())
3246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3247 "not vectorized: complicated access pattern.\n");
3249 if (is_a <bb_vec_info> (vinfo))
3251 /* Mark the statement as not vectorizable. */
3252 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3253 continue;
3255 else
3257 datarefs_copy.release ();
3258 return opt_result::failure_at (dr_info->stmt->stmt,
3259 "not vectorized:"
3260 " complicated access pattern.\n");
3265 datarefs_copy.release ();
3266 return opt_result::success ();
3269 /* Function vect_vfa_segment_size.
3271 Input:
3272 DR_INFO: The data reference.
3273 LENGTH_FACTOR: segment length to consider.
3275 Return a value suitable for the dr_with_seg_len::seg_len field.
3276 This is the "distance travelled" by the pointer from the first
3277 iteration in the segment to the last. Note that it does not include
3278 the size of the access; in effect it only describes the first byte. */
3280 static tree
3281 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3283 length_factor = size_binop (MINUS_EXPR,
3284 fold_convert (sizetype, length_factor),
3285 size_one_node);
3286 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3287 length_factor);
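/* E.g. for a 4-byte step and LENGTH_FACTOR equal to the scalar
   iteration count NITERS this yields 4 * (NITERS - 1) bytes; the size
   of the access itself is accounted for separately by
   vect_vfa_access_size.  */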
3290 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3291 gives the worst-case number of bytes covered by the segment. */
3293 static unsigned HOST_WIDE_INT
3294 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3296 stmt_vec_info stmt_vinfo = dr_info->stmt;
3297 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3298 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3299 unsigned HOST_WIDE_INT access_size = ref_size;
3300 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3302 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3303 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
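/* E.g. a group of four 4-byte loads with no gap gives
   access_size = 4 * 4 = 16 bytes here.  */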
3305 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3306 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3307 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, false)
3308 == dr_explicit_realign_optimized))
3310 /* We might access a full vector's worth. */
3311 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3313 return access_size;
3316 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3317 describes. */
3319 static unsigned int
3320 vect_vfa_align (dr_vec_info *dr_info)
3322 return dr_alignment (dr_info->dr);
3325 /* Function vect_no_alias_p.
3327 Given data references A and B with equal base and offset, see whether
3328 the alias relation can be decided at compilation time. Return 1 if
3329 it can and the references alias, 0 if it can and the references do
3330 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3331 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3332 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3334 static int
3335 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3336 tree segment_length_a, tree segment_length_b,
3337 unsigned HOST_WIDE_INT access_size_a,
3338 unsigned HOST_WIDE_INT access_size_b)
3340 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3341 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3342 poly_uint64 const_length_a;
3343 poly_uint64 const_length_b;
3345 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3346 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3347 [a, a+12) */
3348 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3350 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3351 offset_a -= const_length_a;
3353 else
3354 const_length_a = tree_to_poly_uint64 (segment_length_a);
3355 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3357 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3358 offset_b -= const_length_b;
3360 else
3361 const_length_b = tree_to_poly_uint64 (segment_length_b);
3363 const_length_a += access_size_a;
3364 const_length_b += access_size_b;
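/* E.g. two 4-byte accesses with non-negative steps, inits 0 and 64 and
   16-byte segment lengths now cover [0, 20) and [64, 84), so the checks
   below return 0 (no alias).  */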
3366 if (ranges_known_overlap_p (offset_a, const_length_a,
3367 offset_b, const_length_b))
3368 return 1;
3370 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3371 offset_b, const_length_b))
3372 return 0;
3374 return -1;
3377 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3378 in DDR is >= VF. */
3380 static bool
3381 dependence_distance_ge_vf (data_dependence_relation *ddr,
3382 unsigned int loop_depth, poly_uint64 vf)
3384 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3385 || DDR_NUM_DIST_VECTS (ddr) == 0)
3386 return false;
3388 /* If the dependence is exact, we should have limited the VF instead. */
3389 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3391 unsigned int i;
3392 lambda_vector dist_v;
3393 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3395 HOST_WIDE_INT dist = dist_v[loop_depth];
3396 if (dist != 0
3397 && !(dist > 0 && DDR_REVERSED_P (ddr))
3398 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3399 return false;
3402 if (dump_enabled_p ())
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "dependence distance between %T and %T is >= VF\n",
3405 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3407 return true;
3410 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3412 static void
3413 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3415 dump_printf (dump_kind, "%s (%T) >= ",
3416 lower_bound.unsigned_p ? "unsigned" : "abs",
3417 lower_bound.expr);
3418 dump_dec (dump_kind, lower_bound.min_value);
3421 /* Record that the vectorized loop requires the vec_lower_bound described
3422 by EXPR, UNSIGNED_P and MIN_VALUE. */
3424 static void
3425 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3426 poly_uint64 min_value)
3428 vec<vec_lower_bound> &lower_bounds
3429 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3430 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3431 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3433 unsigned_p &= lower_bounds[i].unsigned_p;
3434 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3435 if (lower_bounds[i].unsigned_p != unsigned_p
3436 || maybe_lt (lower_bounds[i].min_value, min_value))
3438 lower_bounds[i].unsigned_p = unsigned_p;
3439 lower_bounds[i].min_value = min_value;
3440 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_NOTE, vect_location,
3443 "updating run-time check to ");
3444 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3445 dump_printf (MSG_NOTE, "\n");
3448 return;
3451 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3452 if (dump_enabled_p ())
3454 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3455 dump_lower_bound (MSG_NOTE, lower_bound);
3456 dump_printf (MSG_NOTE, "\n");
3458 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
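/* Illustrative sketch (not part of GCC): how two requests for the same EXPR
   are merged above, using plain integers instead of poly_uint64.  The struct
   and function names are hypothetical.  */
#if 0
struct lower_bound_example { int unsigned_p; unsigned long min_value; };

static void
merge_lower_bound_example (struct lower_bound_example *lb,
			   int unsigned_p, unsigned long min_value)
{
  /* The merged run-time check must imply both requests: treat EXPR as
     unsigned only if every request did, and require at least the larger
     of the two minimum values.  */
  lb->unsigned_p &= unsigned_p;
  if (lb->min_value < min_value)
    lb->min_value = min_value;
}
#endif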
3461 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3462 will span fewer than GAP bytes. */
3464 static bool
3465 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3466 poly_int64 gap)
3468 stmt_vec_info stmt_info = dr_info->stmt;
3469 HOST_WIDE_INT count
3470 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3471 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3472 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3473 return (estimated_poly_value (gap)
3474 <= count * vect_get_scalar_dr_size (dr_info));
3477 /* Return true if we know that there is no alias between DR_INFO_A and
3478 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3479 When returning true, set *LOWER_BOUND_OUT to this N. */
3481 static bool
3482 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3483 poly_uint64 *lower_bound_out)
3485 /* Check that there is a constant gap of known sign between DR_A
3486 and DR_B. */
3487 data_reference *dr_a = dr_info_a->dr;
3488 data_reference *dr_b = dr_info_b->dr;
3489 poly_int64 init_a, init_b;
3490 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3491 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3492 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3493 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3494 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3495 || !ordered_p (init_a, init_b))
3496 return false;
3498 /* Sort DR_A and DR_B by the address they access. */
3499 if (maybe_lt (init_b, init_a))
3501 std::swap (init_a, init_b);
3502 std::swap (dr_info_a, dr_info_b);
3503 std::swap (dr_a, dr_b);
3506 /* If the two accesses could be dependent within a scalar iteration,
3507 make sure that we'd retain their order. */
3508 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3509 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3510 return false;
3512 /* There is no alias if abs (DR_STEP) is greater than or equal to
3513 the bytes spanned by the combination of the two accesses. */
3514 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3515 return true;
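/* Illustrative sketch (not part of GCC): the bound computed above for two
   same-step accesses after sorting so that init_a <= init_b.  If |step| is
   at least the number of bytes spanned by one scalar iteration's accesses,
   the next iteration starts beyond the current one, so the accesses cannot
   alias.  Names are hypothetical and plain integers stand in for poly_ints.  */
#if 0
static unsigned long
step_lower_bound_example (long init_a, long init_b, unsigned long size_b)
{
  /* Bytes from the start of access A to the end of access B within one
     scalar iteration.  */
  return (unsigned long) (init_b - init_a) + size_b;
}
#endif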
3518 /* Function vect_prune_runtime_alias_test_list.
3520 Prune a list of ddrs to be tested at run-time by versioning for alias.
3521 Merge several alias checks into one if possible.
3522 Return FALSE if resulting list of ddrs is longer than allowed by
3523 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3525 opt_result
3526 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3528 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3529 hash_set <tree_pair_hash> compared_objects;
3531 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3532 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3533 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3534 const vec<vec_object_pair> &check_unequal_addrs
3535 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3536 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3537 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3539 ddr_p ddr;
3540 unsigned int i;
3541 tree length_factor;
3543 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3545 /* Step values are irrelevant for aliasing if the number of vector
3546 iterations is equal to the number of scalar iterations (which can
3547 happen for fully-SLP loops). */
3548 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3550 if (!vf_one_p)
3552 /* Convert the checks for nonzero steps into bound tests. */
3553 tree value;
3554 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3555 vect_check_lower_bound (loop_vinfo, value, true, 1);
3558 if (may_alias_ddrs.is_empty ())
3559 return opt_result::success ();
3561 comp_alias_ddrs.create (may_alias_ddrs.length ());
3563 unsigned int loop_depth
3564 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3565 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3567 /* First, we collect all data ref pairs for aliasing checks. */
3568 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3570 poly_uint64 lower_bound;
3571 tree segment_length_a, segment_length_b;
3572 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3573 unsigned int align_a, align_b;
3575 /* Ignore the alias if the VF we chose ended up being no greater
3576 than the dependence distance. */
3577 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3578 continue;
3580 if (DDR_OBJECT_A (ddr))
3582 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3583 if (!compared_objects.add (new_pair))
3585 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_NOTE, vect_location,
3587 "checking that %T and %T"
3588 " have different addresses\n",
3589 new_pair.first, new_pair.second);
3590 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3592 continue;
3595 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3596 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3598 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3599 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3601 bool preserves_scalar_order_p
3602 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3603 bool ignore_step_p
3604 = (vf_one_p
3605 && (preserves_scalar_order_p
3606 || operand_equal_p (DR_STEP (dr_info_a->dr),
3607 DR_STEP (dr_info_b->dr))));
3609 /* Skip the pair if inter-iteration dependencies are irrelevant
3610 and intra-iteration dependencies are guaranteed to be honored. */
3611 if (ignore_step_p
3612 && (preserves_scalar_order_p
3613 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3614 &lower_bound)))
3616 if (dump_enabled_p ())
3617 dump_printf_loc (MSG_NOTE, vect_location,
3618 "no need for alias check between "
3619 "%T and %T when VF is 1\n",
3620 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3621 continue;
3624 /* See whether we can handle the alias using a bounds check on
3625 the step, and whether that's likely to be the best approach.
3626 (It might not be, for example, if the minimum step is much larger
3627 than the number of bytes handled by one vector iteration.) */
3628 if (!ignore_step_p
3629 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3630 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3631 &lower_bound)
3632 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3633 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3635 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3636 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3639 "%T and %T when the step %T is outside ",
3640 DR_REF (dr_info_a->dr),
3641 DR_REF (dr_info_b->dr),
3642 DR_STEP (dr_info_a->dr));
3643 if (unsigned_p)
3644 dump_printf (MSG_NOTE, "[0");
3645 else
3647 dump_printf (MSG_NOTE, "(");
3648 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3650 dump_printf (MSG_NOTE, ", ");
3651 dump_dec (MSG_NOTE, lower_bound);
3652 dump_printf (MSG_NOTE, ")\n");
3654 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3655 unsigned_p, lower_bound);
3656 continue;
3659 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3660 if (dr_group_first_a)
3662 stmt_info_a = dr_group_first_a;
3663 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3666 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3667 if (dr_group_first_b)
3669 stmt_info_b = dr_group_first_b;
3670 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3673 if (ignore_step_p)
3675 segment_length_a = size_zero_node;
3676 segment_length_b = size_zero_node;
3678 else
3680 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3681 DR_STEP (dr_info_b->dr), 0))
3682 length_factor = scalar_loop_iters;
3683 else
3684 length_factor = size_int (vect_factor);
3685 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3686 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3688 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3689 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3690 align_a = vect_vfa_align (dr_info_a);
3691 align_b = vect_vfa_align (dr_info_b);
3693 /* See whether the alias is known at compilation time. */
3694 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3695 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3696 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3697 DR_OFFSET (dr_info_b->dr), 0)
3698 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3699 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3700 && poly_int_tree_p (segment_length_a)
3701 && poly_int_tree_p (segment_length_b))
3703 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3704 segment_length_a,
3705 segment_length_b,
3706 access_size_a,
3707 access_size_b);
3708 if (res >= 0 && dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "can tell at compile time that %T and %T",
3712 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3713 if (res == 0)
3714 dump_printf (MSG_NOTE, " do not alias\n");
3715 else
3716 dump_printf (MSG_NOTE, " alias\n");
3719 if (res == 0)
3720 continue;
3722 if (res == 1)
3723 return opt_result::failure_at (stmt_info_b->stmt,
3724 "not vectorized:"
3725 " compilation time alias: %G%G",
3726 stmt_info_a->stmt,
3727 stmt_info_b->stmt);
3730 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3731 access_size_a, align_a);
3732 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3733 access_size_b, align_b);
3734 /* Canonicalize the order to be the one that's needed for accurate
3735 RAW, WAR and WAW flags, in cases where the data references are
3736 well-ordered. The order doesn't really matter otherwise,
3737 but we might as well be consistent. */
3738 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3739 std::swap (dr_a, dr_b);
3741 dr_with_seg_len_pair_t dr_with_seg_len_pair
3742 (dr_a, dr_b, (preserves_scalar_order_p
3743 ? dr_with_seg_len_pair_t::WELL_ORDERED
3744 : dr_with_seg_len_pair_t::REORDERED));
3746 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3749 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3751 unsigned int count = (comp_alias_ddrs.length ()
3752 + check_unequal_addrs.length ());
3754 if (count && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
3755 return opt_result::failure_at
3756 (vect_location, "would need a runtime alias check\n");
3758 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_NOTE, vect_location,
3760 "improved number of alias checks from %d to %d\n",
3761 may_alias_ddrs.length (), count);
3762 unsigned limit = param_vect_max_version_for_alias_checks;
3763 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3764 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3765 if (count > limit)
3766 return opt_result::failure_at
3767 (vect_location,
3768 "number of versioning for alias run-time tests exceeds %d "
3769 "(--param vect-max-version-for-alias-checks)\n", limit);
3771 return opt_result::success ();
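/* Illustrative sketch (not part of GCC): roughly the condition that one
   retained pair contributes to the run-time versioning test, ignoring
   alignment and the access-size adjustments folded in elsewhere.  Names
   are hypothetical.  */
#if 0
static int
segments_do_not_alias_example (const char *addr_a, unsigned long seg_a,
			       const char *addr_b, unsigned long seg_b)
{
  /* The vectorized loop is safe for this pair if the two accessed
     segments are disjoint.  */
  return addr_a + seg_a <= addr_b || addr_b + seg_b <= addr_a;
}
#endif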
3774 /* Check whether we can use an internal function for a gather load
3775 or scatter store. READ_P is true for loads and false for stores.
3776 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3777 the type of the memory elements being loaded or stored. OFFSET_TYPE
3778 is the type of the offset that is being applied to the invariant
3779 base address. SCALE is the amount by which the offset should
3780 be multiplied *after* it has been converted to address width.
3782 Return true if the function is supported, storing the function id in
3783 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3785 bool
3786 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3787 tree vectype, tree memory_type, tree offset_type,
3788 int scale, internal_fn *ifn_out,
3789 tree *offset_vectype_out)
3791 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3792 unsigned int element_bits = vector_element_bits (vectype);
3793 if (element_bits != memory_bits)
3794 /* For now the vector elements must be the same width as the
3795 memory elements. */
3796 return false;
3798 /* Work out which function we need. */
3799 internal_fn ifn, alt_ifn;
3800 if (read_p)
3802 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3803 alt_ifn = IFN_MASK_GATHER_LOAD;
3805 else
3807 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3808 alt_ifn = IFN_MASK_SCATTER_STORE;
3811 for (;;)
3813 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3814 if (!offset_vectype)
3815 return false;
3817 /* Test whether the target supports this combination. */
3818 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3819 offset_vectype, scale))
3821 *ifn_out = ifn;
3822 *offset_vectype_out = offset_vectype;
3823 return true;
3825 else if (!masked_p
3826 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3827 memory_type,
3828 offset_vectype,
3829 scale))
3831 *ifn_out = alt_ifn;
3832 *offset_vectype_out = offset_vectype;
3833 return true;
3836 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3837 && TYPE_PRECISION (offset_type) >= element_bits)
3838 return false;
3840 offset_type = build_nonstandard_integer_type
3841 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
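/* Illustrative sketch (not part of GCC): the retry loop above expressed in
   terms of offset precision alone.  Starting from the given precision, it
   keeps doubling until either the target accepts the offset vector type or
   the offset is already at least pointer-sized and element-sized, at which
   point widening further cannot help.  Names are hypothetical.  */
#if 0
static unsigned int
pick_offset_precision_example (unsigned int prec, unsigned int element_bits,
			       unsigned int pointer_bits,
			       int (*supported_p) (unsigned int))
{
  for (;;)
    {
      if (supported_p (prec))
	return prec;
      if (prec >= pointer_bits && prec >= element_bits)
	return 0;	/* Give up: no supported offset type.  */
      prec *= 2;
    }
}
#endif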
3845 /* STMT_INFO is a call to an internal gather load or scatter store function.
3846 Describe the operation in INFO. */
3848 static void
3849 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3850 gather_scatter_info *info)
3852 gcall *call = as_a <gcall *> (stmt_info->stmt);
3853 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3854 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3856 info->ifn = gimple_call_internal_fn (call);
3857 info->decl = NULL_TREE;
3858 info->base = gimple_call_arg (call, 0);
3859 info->offset = gimple_call_arg (call, 1);
3860 info->offset_dt = vect_unknown_def_type;
3861 info->offset_vectype = NULL_TREE;
3862 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3863 info->element_type = TREE_TYPE (vectype);
3864 info->memory_type = TREE_TYPE (DR_REF (dr));
3867 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3868 gather load or scatter store. Describe the operation in *INFO if so. */
3870 bool
3871 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3872 gather_scatter_info *info)
3874 HOST_WIDE_INT scale = 1;
3875 poly_int64 pbitpos, pbitsize;
3876 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3877 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3878 tree offtype = NULL_TREE;
3879 tree decl = NULL_TREE, base, off;
3880 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3881 tree memory_type = TREE_TYPE (DR_REF (dr));
3882 machine_mode pmode;
3883 int punsignedp, reversep, pvolatilep = 0;
3884 internal_fn ifn;
3885 tree offset_vectype;
3886 bool masked_p = false;
3888 /* See whether this is already a call to a gather/scatter internal function.
3889 If not, see whether it's a masked load or store. */
3890 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3891 if (call && gimple_call_internal_p (call))
3893 ifn = gimple_call_internal_fn (call);
3894 if (internal_gather_scatter_fn_p (ifn))
3896 vect_describe_gather_scatter_call (stmt_info, info);
3897 return true;
3899 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3902 /* True if we should aim to use internal functions rather than
3903 built-in functions. */
3904 bool use_ifn_p = (DR_IS_READ (dr)
3905 ? supports_vec_gather_load_p ()
3906 : supports_vec_scatter_store_p ());
3908 base = DR_REF (dr);
3909 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3910 see if we can use the def stmt of the address. */
3911 if (masked_p
3912 && TREE_CODE (base) == MEM_REF
3913 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3914 && integer_zerop (TREE_OPERAND (base, 1))
3915 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3917 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3918 if (is_gimple_assign (def_stmt)
3919 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3920 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3923 /* The gather and scatter builtins need address of the form
3924 loop_invariant + vector * {1, 2, 4, 8}
3926 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3927 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3928 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3929 multiplications and additions in it. To get a vector, we need
3930 a single SSA_NAME that will be defined in the loop and will
3931 contain everything that is not loop invariant and that can be
3932 vectorized. The following code attempts to find such a preexisting
3933 SSA_NAME OFF and put the loop invariants into a tree BASE
3934 that can be gimplified before the loop. */
3935 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3936 &punsignedp, &reversep, &pvolatilep);
3937 if (reversep)
3938 return false;
3940 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3942 if (TREE_CODE (base) == MEM_REF)
3944 if (!integer_zerop (TREE_OPERAND (base, 1)))
3946 if (off == NULL_TREE)
3947 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3948 else
3949 off = size_binop (PLUS_EXPR, off,
3950 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3952 base = TREE_OPERAND (base, 0);
3954 else
3955 base = build_fold_addr_expr (base);
3957 if (off == NULL_TREE)
3958 off = size_zero_node;
3960 /* If BASE is not loop invariant, give up unless OFF is 0; in that case
3961 start with just the constant offset as the new loop-invariant BASE and
3962 continue with the old base as OFF.
3963 We could handle that case by gimplifying the addition of base + off
3964 into some SSA_NAME and use that as off, but for now punt. */
3965 if (!expr_invariant_in_loop_p (loop, base))
3967 if (!integer_zerop (off))
3968 return false;
3969 off = base;
3970 base = size_int (pbytepos);
3972 /* Otherwise put base + constant offset into the loop invariant BASE
3973 and continue with OFF. */
3974 else
3976 base = fold_convert (sizetype, base);
3977 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3980 /* OFF at this point may be either a SSA_NAME or some tree expression
3981 from get_inner_reference. Try to peel off loop invariants from it
3982 into BASE as long as possible. */
3983 STRIP_NOPS (off);
3984 while (offtype == NULL_TREE)
3986 enum tree_code code;
3987 tree op0, op1, add = NULL_TREE;
3989 if (TREE_CODE (off) == SSA_NAME)
3991 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3993 if (expr_invariant_in_loop_p (loop, off))
3994 return false;
3996 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3997 break;
3999 op0 = gimple_assign_rhs1 (def_stmt);
4000 code = gimple_assign_rhs_code (def_stmt);
4001 op1 = gimple_assign_rhs2 (def_stmt);
4003 else
4005 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4006 return false;
4007 code = TREE_CODE (off);
4008 extract_ops_from_tree (off, &code, &op0, &op1);
4010 switch (code)
4012 case POINTER_PLUS_EXPR:
4013 case PLUS_EXPR:
4014 if (expr_invariant_in_loop_p (loop, op0))
4016 add = op0;
4017 off = op1;
4018 do_add:
4019 add = fold_convert (sizetype, add);
4020 if (scale != 1)
4021 add = size_binop (MULT_EXPR, add, size_int (scale));
4022 base = size_binop (PLUS_EXPR, base, add);
4023 continue;
4025 if (expr_invariant_in_loop_p (loop, op1))
4027 add = op1;
4028 off = op0;
4029 goto do_add;
4031 break;
4032 case MINUS_EXPR:
4033 if (expr_invariant_in_loop_p (loop, op1))
4035 add = fold_convert (sizetype, op1);
4036 add = size_binop (MINUS_EXPR, size_zero_node, add);
4037 off = op0;
4038 goto do_add;
4040 break;
4041 case MULT_EXPR:
4042 if (scale == 1 && tree_fits_shwi_p (op1))
4044 int new_scale = tree_to_shwi (op1);
4045 /* Only treat this as a scaling operation if the target
4046 supports it for at least some offset type. */
4047 if (use_ifn_p
4048 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4049 masked_p, vectype, memory_type,
4050 signed_char_type_node,
4051 new_scale, &ifn,
4052 &offset_vectype)
4053 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4054 masked_p, vectype, memory_type,
4055 unsigned_char_type_node,
4056 new_scale, &ifn,
4057 &offset_vectype))
4058 break;
4059 scale = new_scale;
4060 off = op0;
4061 continue;
4063 break;
4064 case SSA_NAME:
4065 off = op0;
4066 continue;
4067 CASE_CONVERT:
4068 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4069 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4070 break;
4072 /* Don't include the conversion if the target is happy with
4073 the current offset type. */
4074 if (use_ifn_p
4075 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4076 masked_p, vectype, memory_type,
4077 TREE_TYPE (off), scale, &ifn,
4078 &offset_vectype))
4079 break;
4081 if (TYPE_PRECISION (TREE_TYPE (op0))
4082 == TYPE_PRECISION (TREE_TYPE (off)))
4084 off = op0;
4085 continue;
4088 /* Include the conversion if it is widening and either we're using
4089 the IFN path, the target can handle the offset in the type being
4090 converted from, or the current size is not already the same as the
4091 data vector element size. */
4092 if ((TYPE_PRECISION (TREE_TYPE (op0))
4093 < TYPE_PRECISION (TREE_TYPE (off)))
4094 && (use_ifn_p
4095 || (DR_IS_READ (dr)
4096 ? (targetm.vectorize.builtin_gather
4097 && targetm.vectorize.builtin_gather (vectype,
4098 TREE_TYPE (op0),
4099 scale))
4100 : (targetm.vectorize.builtin_scatter
4101 && targetm.vectorize.builtin_scatter (vectype,
4102 TREE_TYPE (op0),
4103 scale)))
4104 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4105 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4107 off = op0;
4108 offtype = TREE_TYPE (off);
4109 STRIP_NOPS (off);
4110 continue;
4112 break;
4113 default:
4114 break;
4116 break;
4119 /* If at the end OFF still isn't a SSA_NAME or isn't
4120 defined in the loop, punt. */
4121 if (TREE_CODE (off) != SSA_NAME
4122 || expr_invariant_in_loop_p (loop, off))
4123 return false;
4125 if (offtype == NULL_TREE)
4126 offtype = TREE_TYPE (off);
4128 if (use_ifn_p)
4130 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4131 vectype, memory_type, offtype, scale,
4132 &ifn, &offset_vectype))
4133 ifn = IFN_LAST;
4134 decl = NULL_TREE;
4136 else
4138 if (DR_IS_READ (dr))
4140 if (targetm.vectorize.builtin_gather)
4141 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4143 else
4145 if (targetm.vectorize.builtin_scatter)
4146 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4148 ifn = IFN_LAST;
4149 /* The offset vector type will be read from DECL when needed. */
4150 offset_vectype = NULL_TREE;
4153 info->ifn = ifn;
4154 info->decl = decl;
4155 info->base = base;
4156 info->offset = off;
4157 info->offset_dt = vect_unknown_def_type;
4158 info->offset_vectype = offset_vectype;
4159 info->scale = scale;
4160 info->element_type = TREE_TYPE (vectype);
4161 info->memory_type = memory_type;
4162 return true;
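/* Illustrative sketch (not part of GCC): the scalar address shape that the
   analysis above tries to expose.  For the loop below, BASE would be the
   loop-invariant part (&a plus any constant byte offset), OFF the SSA name
   holding idx[i], and SCALE sizeof (float).  The function name is
   hypothetical.  */
#if 0
void
gather_example (float *restrict dst, const float *restrict a,
		const int *restrict idx, int n)
{
  for (int i = 0; i < n; i++)
    /* Address of a[idx[i]] is (char *) a + (long) idx[i] * sizeof (float),
       i.e. loop_invariant + sign_extend (vector) * scale.  */
    dst[i] = a[idx[i]];
}
#endif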
4165 /* Find the data references in STMT, analyze them with respect to LOOP and
4166 append them to DATAREFS. Return false if datarefs in this stmt cannot
4167 be handled. */
4169 opt_result
4170 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4171 vec<data_reference_p> *datarefs,
4172 vec<int> *dataref_groups, int group_id)
4174 /* We can ignore clobbers for dataref analysis - they are removed during
4175 loop vectorization and BB vectorization checks dependences with a
4176 stmt walk. */
4177 if (gimple_clobber_p (stmt))
4178 return opt_result::success ();
4180 if (gimple_has_volatile_ops (stmt))
4181 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4182 stmt);
4184 if (stmt_can_throw_internal (cfun, stmt))
4185 return opt_result::failure_at (stmt,
4186 "not vectorized:"
4187 " statement can throw an exception: %G",
4188 stmt);
4190 auto_vec<data_reference_p, 2> refs;
4191 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4192 if (!res)
4193 return res;
4195 if (refs.is_empty ())
4196 return opt_result::success ();
4198 if (refs.length () > 1)
4200 while (!refs.is_empty ())
4201 free_data_ref (refs.pop ());
4202 return opt_result::failure_at (stmt,
4203 "not vectorized: more than one "
4204 "data ref in stmt: %G", stmt);
4207 data_reference_p dr = refs.pop ();
4208 if (gcall *call = dyn_cast <gcall *> (stmt))
4209 if (!gimple_call_internal_p (call)
4210 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4211 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4213 free_data_ref (dr);
4214 return opt_result::failure_at (stmt,
4215 "not vectorized: dr in a call %G", stmt);
4218 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4219 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4221 free_data_ref (dr);
4222 return opt_result::failure_at (stmt,
4223 "not vectorized:"
4224 " statement is bitfield access %G", stmt);
4227 if (DR_BASE_ADDRESS (dr)
4228 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4230 free_data_ref (dr);
4231 return opt_result::failure_at (stmt,
4232 "not vectorized:"
4233 " base addr of dr is a constant\n");
4236 /* Check whether this may be a SIMD lane access and adjust the
4237 DR to make it easier for us to handle it. */
4238 if (loop
4239 && loop->simduid
4240 && (!DR_BASE_ADDRESS (dr)
4241 || !DR_OFFSET (dr)
4242 || !DR_INIT (dr)
4243 || !DR_STEP (dr)))
4245 struct data_reference *newdr
4246 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4247 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4248 if (DR_BASE_ADDRESS (newdr)
4249 && DR_OFFSET (newdr)
4250 && DR_INIT (newdr)
4251 && DR_STEP (newdr)
4252 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4253 && integer_zerop (DR_STEP (newdr)))
4255 tree base_address = DR_BASE_ADDRESS (newdr);
4256 tree off = DR_OFFSET (newdr);
4257 tree step = ssize_int (1);
4258 if (integer_zerop (off)
4259 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4261 off = TREE_OPERAND (base_address, 1);
4262 base_address = TREE_OPERAND (base_address, 0);
4264 STRIP_NOPS (off);
4265 if (TREE_CODE (off) == MULT_EXPR
4266 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4268 step = TREE_OPERAND (off, 1);
4269 off = TREE_OPERAND (off, 0);
4270 STRIP_NOPS (off);
4272 if (CONVERT_EXPR_P (off)
4273 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4274 < TYPE_PRECISION (TREE_TYPE (off))))
4275 off = TREE_OPERAND (off, 0);
4276 if (TREE_CODE (off) == SSA_NAME)
4278 gimple *def = SSA_NAME_DEF_STMT (off);
4279 /* Look through widening conversion. */
4280 if (is_gimple_assign (def)
4281 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4283 tree rhs1 = gimple_assign_rhs1 (def);
4284 if (TREE_CODE (rhs1) == SSA_NAME
4285 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4286 && (TYPE_PRECISION (TREE_TYPE (off))
4287 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4288 def = SSA_NAME_DEF_STMT (rhs1);
4290 if (is_gimple_call (def)
4291 && gimple_call_internal_p (def)
4292 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4294 tree arg = gimple_call_arg (def, 0);
4295 tree reft = TREE_TYPE (DR_REF (newdr));
4296 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4297 arg = SSA_NAME_VAR (arg);
4298 if (arg == loop->simduid
4299 /* For now. */
4300 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4302 DR_BASE_ADDRESS (newdr) = base_address;
4303 DR_OFFSET (newdr) = ssize_int (0);
4304 DR_STEP (newdr) = step;
4305 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4306 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4307 /* Mark as simd-lane access. */
4308 tree arg2 = gimple_call_arg (def, 1);
4309 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4310 free_data_ref (dr);
4311 datarefs->safe_push (newdr);
4312 if (dataref_groups)
4313 dataref_groups->safe_push (group_id);
4314 return opt_result::success ();
4319 free_data_ref (newdr);
4322 datarefs->safe_push (dr);
4323 if (dataref_groups)
4324 dataref_groups->safe_push (group_id);
4325 return opt_result::success ();
4328 /* Function vect_analyze_data_refs.
4330 Find all the data references in the loop or basic block.
4332 The general structure of the analysis of data refs in the vectorizer is as
4333 follows:
4334 1- vect_analyze_data_refs(loop/bb): call
4335 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4336 in the loop/bb and their dependences.
4337 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4338 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4339 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4343 opt_result
4344 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4346 class loop *loop = NULL;
4347 unsigned int i;
4348 struct data_reference *dr;
4349 tree scalar_type;
4351 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4353 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4354 loop = LOOP_VINFO_LOOP (loop_vinfo);
4356 /* Go through the data-refs, check that the analysis succeeded. Update
4357 pointer from stmt_vec_info struct to DR and vectype. */
4359 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4360 FOR_EACH_VEC_ELT (datarefs, i, dr)
4362 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4363 poly_uint64 vf;
4365 gcc_assert (DR_REF (dr));
4366 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4367 gcc_assert (!stmt_info->dr_aux.dr);
4368 stmt_info->dr_aux.dr = dr;
4369 stmt_info->dr_aux.stmt = stmt_info;
4371 /* Check that analysis of the data-ref succeeded. */
4372 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4373 || !DR_STEP (dr))
4375 bool maybe_gather
4376 = DR_IS_READ (dr)
4377 && !TREE_THIS_VOLATILE (DR_REF (dr));
4378 bool maybe_scatter
4379 = DR_IS_WRITE (dr)
4380 && !TREE_THIS_VOLATILE (DR_REF (dr))
4381 && (targetm.vectorize.builtin_scatter != NULL
4382 || supports_vec_scatter_store_p ());
4384 /* If target supports vector gather loads or scatter stores,
4385 see whether they can be used. */
4386 if (is_a <loop_vec_info> (vinfo)
4387 && !nested_in_vect_loop_p (loop, stmt_info))
4389 if (maybe_gather || maybe_scatter)
4391 if (maybe_gather)
4392 gatherscatter = GATHER;
4393 else
4394 gatherscatter = SCATTER;
4398 if (gatherscatter == SG_NONE)
4400 if (dump_enabled_p ())
4401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4402 "not vectorized: data ref analysis "
4403 "failed %G", stmt_info->stmt);
4404 if (is_a <bb_vec_info> (vinfo))
4406 /* In BB vectorization the ref can still participate
4407 in dependence analysis, we just can't vectorize it. */
4408 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4409 continue;
4411 return opt_result::failure_at (stmt_info->stmt,
4412 "not vectorized:"
4413 " data ref analysis failed: %G",
4414 stmt_info->stmt);
4418 /* See if this was detected as SIMD lane access. */
4419 if (dr->aux == (void *)-1
4420 || dr->aux == (void *)-2
4421 || dr->aux == (void *)-3
4422 || dr->aux == (void *)-4)
4424 if (nested_in_vect_loop_p (loop, stmt_info))
4425 return opt_result::failure_at (stmt_info->stmt,
4426 "not vectorized:"
4427 " data ref analysis failed: %G",
4428 stmt_info->stmt);
4429 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4430 = -(uintptr_t) dr->aux;
4433 tree base = get_base_address (DR_REF (dr));
4434 if (base && VAR_P (base) && DECL_NONALIASED (base))
4436 if (dump_enabled_p ())
4437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4438 "not vectorized: base object not addressable "
4439 "for stmt: %G", stmt_info->stmt);
4440 if (is_a <bb_vec_info> (vinfo))
4442 /* In BB vectorization the ref can still participate
4443 in dependence analysis, we just can't vectorize it. */
4444 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4445 continue;
4447 return opt_result::failure_at (stmt_info->stmt,
4448 "not vectorized: base object not"
4449 " addressable for stmt: %G",
4450 stmt_info->stmt);
4453 if (is_a <loop_vec_info> (vinfo)
4454 && DR_STEP (dr)
4455 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4457 if (nested_in_vect_loop_p (loop, stmt_info))
4458 return opt_result::failure_at (stmt_info->stmt,
4459 "not vectorized: "
4460 "not suitable for strided load %G",
4461 stmt_info->stmt);
4462 STMT_VINFO_STRIDED_P (stmt_info) = true;
4465 /* Update DR field in stmt_vec_info struct. */
4467 /* If the dataref is in an inner-loop of the loop that is considered for
4468 vectorization, we also want to analyze the access relative to
4469 the outer-loop (DR contains information only relative to the
4470 inner-most enclosing loop). We do that by building a reference to the
4471 first location accessed by the inner-loop, and analyze it relative to
4472 the outer-loop. */
4473 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4475 /* Build a reference to the first location accessed by the
4476 inner loop: *(BASE + INIT + OFFSET). By construction,
4477 this address must be invariant in the inner loop, so we
4478 can consider it as being used in the outer loop. */
4479 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4480 tree offset = unshare_expr (DR_OFFSET (dr));
4481 tree init = unshare_expr (DR_INIT (dr));
4482 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4483 init, offset);
4484 tree init_addr = fold_build_pointer_plus (base, init_offset);
4485 tree init_ref = build_fold_indirect_ref (init_addr);
4487 if (dump_enabled_p ())
4488 dump_printf_loc (MSG_NOTE, vect_location,
4489 "analyze in outer loop: %T\n", init_ref);
4491 opt_result res
4492 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4493 init_ref, loop, stmt_info->stmt);
4494 if (!res)
4495 /* dr_analyze_innermost already explained the failure. */
4496 return res;
4498 if (dump_enabled_p ())
4499 dump_printf_loc (MSG_NOTE, vect_location,
4500 "\touter base_address: %T\n"
4501 "\touter offset from base address: %T\n"
4502 "\touter constant offset from base address: %T\n"
4503 "\touter step: %T\n"
4504 "\touter base alignment: %d\n\n"
4505 "\touter base misalignment: %d\n"
4506 "\touter offset alignment: %d\n"
4507 "\touter step alignment: %d\n",
4508 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4509 STMT_VINFO_DR_OFFSET (stmt_info),
4510 STMT_VINFO_DR_INIT (stmt_info),
4511 STMT_VINFO_DR_STEP (stmt_info),
4512 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4513 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4514 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4515 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4518 /* Set vectype for STMT. */
4519 scalar_type = TREE_TYPE (DR_REF (dr));
4520 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4521 if (!vectype)
4523 if (dump_enabled_p ())
4525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4526 "not vectorized: no vectype for stmt: %G",
4527 stmt_info->stmt);
4528 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4529 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4530 scalar_type);
4531 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4534 if (is_a <bb_vec_info> (vinfo))
4536 /* No vector type is fine, the ref can still participate
4537 in dependence analysis, we just can't vectorize it. */
4538 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4539 continue;
4541 if (fatal)
4542 *fatal = false;
4543 return opt_result::failure_at (stmt_info->stmt,
4544 "not vectorized:"
4545 " no vectype for stmt: %G"
4546 " scalar_type: %T\n",
4547 stmt_info->stmt, scalar_type);
4549 else
4551 if (dump_enabled_p ())
4552 dump_printf_loc (MSG_NOTE, vect_location,
4553 "got vectype for stmt: %G%T\n",
4554 stmt_info->stmt, vectype);
4557 /* Adjust the minimal vectorization factor according to the
4558 vector type. */
4559 vf = TYPE_VECTOR_SUBPARTS (vectype);
4560 *min_vf = upper_bound (*min_vf, vf);
4562 /* Leave the BB vectorizer to pick the vector type later, based on
4563 the final dataref group size and SLP node size. */
4564 if (is_a <loop_vec_info> (vinfo))
4565 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4567 if (gatherscatter != SG_NONE)
4569 gather_scatter_info gs_info;
4570 if (!vect_check_gather_scatter (stmt_info,
4571 as_a <loop_vec_info> (vinfo),
4572 &gs_info)
4573 || !get_vectype_for_scalar_type (vinfo,
4574 TREE_TYPE (gs_info.offset)))
4576 if (fatal)
4577 *fatal = false;
4578 return opt_result::failure_at
4579 (stmt_info->stmt,
4580 (gatherscatter == GATHER)
4581 ? "not vectorized: not suitable for gather load %G"
4582 : "not vectorized: not suitable for scatter store %G",
4583 stmt_info->stmt);
4585 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4589 /* We used to stop processing and prune the list here. Verify we no
4590 longer need to. */
4591 gcc_assert (i == datarefs.length ());
4593 return opt_result::success ();
4597 /* Function vect_get_new_vect_var.
4599 Returns a name for a new variable. The current naming scheme appends the
4600 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4601 the name of vectorizer generated variables, and appends that to NAME if
4602 provided. */
4604 tree
4605 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4607 const char *prefix;
4608 tree new_vect_var;
4610 switch (var_kind)
4612 case vect_simple_var:
4613 prefix = "vect";
4614 break;
4615 case vect_scalar_var:
4616 prefix = "stmp";
4617 break;
4618 case vect_mask_var:
4619 prefix = "mask";
4620 break;
4621 case vect_pointer_var:
4622 prefix = "vectp";
4623 break;
4624 default:
4625 gcc_unreachable ();
4628 if (name)
4630 char* tmp = concat (prefix, "_", name, NULL);
4631 new_vect_var = create_tmp_reg (type, tmp);
4632 free (tmp);
4634 else
4635 new_vect_var = create_tmp_reg (type, prefix);
4637 return new_vect_var;
4640 /* Like vect_get_new_vect_var but return an SSA name. */
4642 tree
4643 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4645 const char *prefix;
4646 tree new_vect_var;
4648 switch (var_kind)
4650 case vect_simple_var:
4651 prefix = "vect";
4652 break;
4653 case vect_scalar_var:
4654 prefix = "stmp";
4655 break;
4656 case vect_pointer_var:
4657 prefix = "vectp";
4658 break;
4659 default:
4660 gcc_unreachable ();
4663 if (name)
4665 char* tmp = concat (prefix, "_", name, NULL);
4666 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4667 free (tmp);
4669 else
4670 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4672 return new_vect_var;
4675 /* Duplicate points-to info on NAME from DR_INFO. */
4677 static void
4678 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4680 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4681 /* DR_PTR_INFO is for a base SSA name, not including constant or
4682 variable offsets in the ref so its alignment info does not apply. */
4683 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4686 /* Function vect_create_addr_base_for_vector_ref.
4688 Create an expression that computes the address of the first memory location
4689 that will be accessed for a data reference.
4691 Input:
4692 STMT_INFO: The statement containing the data reference.
4693 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4694 OFFSET: Optional. If supplied, it is added to the initial address.
4695 LOOP: Specify relative to which loop-nest should the address be computed.
4696 For example, when the dataref is in an inner-loop nested in an
4697 outer-loop that is now being vectorized, LOOP can be either the
4698 outer-loop, or the inner-loop. The first memory location accessed
4699 by the following dataref ('in' points to short):
4701 for (i=0; i<N; i++)
4702 for (j=0; j<M; j++)
4703 s += in[i+j]
4705 is as follows:
4706 if LOOP=i_loop: &in (relative to i_loop)
4707 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4708 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4709 initial address. Unlike OFFSET, which is number of elements to
4710 be added, BYTE_OFFSET is measured in bytes.
4712 Output:
4713 1. Return an SSA_NAME whose value is the address of the memory location of
4714 the first vector of the data reference.
4715 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4716 these statement(s) which define the returned SSA_NAME.
4718 FORNOW: We are only handling array accesses with step 1. */
4720 tree
4721 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4722 gimple_seq *new_stmt_list,
4723 tree offset,
4724 tree byte_offset)
4726 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4727 struct data_reference *dr = dr_info->dr;
4728 const char *base_name;
4729 tree addr_base;
4730 tree dest;
4731 gimple_seq seq = NULL;
4732 tree vect_ptr_type;
4733 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4734 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4735 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4737 tree data_ref_base = unshare_expr (drb->base_address);
4738 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4739 tree init = unshare_expr (drb->init);
4741 if (loop_vinfo)
4742 base_name = get_name (data_ref_base);
4743 else
4745 base_offset = ssize_int (0);
4746 init = ssize_int (0);
4747 base_name = get_name (DR_REF (dr));
4750 /* Create base_offset */
4751 base_offset = size_binop (PLUS_EXPR,
4752 fold_convert (sizetype, base_offset),
4753 fold_convert (sizetype, init));
4755 if (offset)
4757 offset = fold_build2 (MULT_EXPR, sizetype,
4758 fold_convert (sizetype, offset), step);
4759 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4760 base_offset, offset);
4762 if (byte_offset)
4764 byte_offset = fold_convert (sizetype, byte_offset);
4765 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4766 base_offset, byte_offset);
4769 /* base + base_offset */
4770 if (loop_vinfo)
4771 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4772 else
4774 addr_base = build1 (ADDR_EXPR,
4775 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4776 unshare_expr (DR_REF (dr)));
4779 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4780 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4781 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4782 gimple_seq_add_seq (new_stmt_list, seq);
4784 if (DR_PTR_INFO (dr)
4785 && TREE_CODE (addr_base) == SSA_NAME
4786 && !SSA_NAME_PTR_INFO (addr_base))
4787 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4789 if (dump_enabled_p ())
4790 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4792 return addr_base;
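/* Illustrative sketch (not part of GCC): the arithmetic performed above,
   with plain byte values instead of trees.  OFFSET is counted in elements
   of size STEP, BYTE_OFFSET in bytes; either may be zero.  Names are
   hypothetical.  */
#if 0
static char *
addr_base_example (char *base_address, unsigned long base_offset,
		   unsigned long init, unsigned long offset,
		   unsigned long step, unsigned long byte_offset)
{
  /* base + (base_offset + init + OFFSET * step + BYTE_OFFSET).  */
  unsigned long total = base_offset + init + offset * step + byte_offset;
  return base_address + total;
}
#endif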
4796 /* Function vect_create_data_ref_ptr.
4798 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4799 location accessed in the loop by STMT_INFO, along with the def-use update
4800 chain to appropriately advance the pointer through the loop iterations.
4801 Also set aliasing information for the pointer. This pointer is used by
4802 the callers to this function to create a memory reference expression for
4803 vector load/store access.
4805 Input:
4806 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4807 GIMPLE_ASSIGN <name, data-ref> or
4808 GIMPLE_ASSIGN <data-ref, name>.
4809 2. AGGR_TYPE: the type of the reference, which should be either a vector
4810 or an array.
4811 3. AT_LOOP: the loop where the vector memref is to be created.
4812 4. OFFSET (optional): an offset to be added to the initial address accessed
4813 by the data-ref in STMT_INFO.
4814 5. BSI: location where the new stmts are to be placed if there is no loop
4815 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4816 pointing to the initial address.
4817 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4818 to the initial address accessed by the data-ref in STMT_INFO. This is
4819 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4820 in bytes.
4821 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4822 to the IV during each iteration of the loop. NULL says to move
4823 by one copy of AGGR_TYPE up or down, depending on the step of the
4824 data reference.
4826 Output:
4827 1. Declare a new ptr to vector_type, and have it point to the base of the
4828 data reference (initial address accessed by the data reference).
4829 For example, for vector of type V8HI, the following code is generated:
4831 v8hi *ap;
4832 ap = (v8hi *)initial_address;
4834 if OFFSET is not supplied:
4835 initial_address = &a[init];
4836 if OFFSET is supplied:
4837 initial_address = &a[init + OFFSET];
4838 if BYTE_OFFSET is supplied:
4839 initial_address = &a[init] + BYTE_OFFSET;
4841 Return the initial_address in INITIAL_ADDRESS.
4843 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4844 update the pointer in each iteration of the loop.
4846 Return the increment stmt that updates the pointer in PTR_INCR.
4848 3. Return the pointer. */
4850 tree
4851 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4852 tree aggr_type, class loop *at_loop, tree offset,
4853 tree *initial_address, gimple_stmt_iterator *gsi,
4854 gimple **ptr_incr, bool only_init,
4855 tree byte_offset, tree iv_step)
4857 const char *base_name;
4858 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4859 class loop *loop = NULL;
4860 bool nested_in_vect_loop = false;
4861 class loop *containing_loop = NULL;
4862 tree aggr_ptr_type;
4863 tree aggr_ptr;
4864 tree new_temp;
4865 gimple_seq new_stmt_list = NULL;
4866 edge pe = NULL;
4867 basic_block new_bb;
4868 tree aggr_ptr_init;
4869 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4870 struct data_reference *dr = dr_info->dr;
4871 tree aptr;
4872 gimple_stmt_iterator incr_gsi;
4873 bool insert_after;
4874 tree indx_before_incr, indx_after_incr;
4875 gimple *incr;
4876 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4878 gcc_assert (iv_step != NULL_TREE
4879 || TREE_CODE (aggr_type) == ARRAY_TYPE
4880 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4882 if (loop_vinfo)
4884 loop = LOOP_VINFO_LOOP (loop_vinfo);
4885 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4886 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4887 pe = loop_preheader_edge (loop);
4889 else
4891 gcc_assert (bb_vinfo);
4892 only_init = true;
4893 *ptr_incr = NULL;
4896 /* Create an expression for the first address accessed by this load
4897 in LOOP. */
4898 base_name = get_name (DR_BASE_ADDRESS (dr));
4900 if (dump_enabled_p ())
4902 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4903 dump_printf_loc (MSG_NOTE, vect_location,
4904 "create %s-pointer variable to type: %T",
4905 get_tree_code_name (TREE_CODE (aggr_type)),
4906 aggr_type);
4907 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4908 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4909 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4910 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4911 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4912 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4913 else
4914 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4915 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4918 /* (1) Create the new aggregate-pointer variable.
4919 Vector and array types inherit the alias set of their component
4920 type by default so we need to use a ref-all pointer if the data
4921 reference does not conflict with the created aggregated data
4922 reference because it is not addressable. */
4923 bool need_ref_all = false;
4924 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4925 get_alias_set (DR_REF (dr))))
4926 need_ref_all = true;
4927 /* Likewise for any of the data references in the stmt group. */
4928 else if (DR_GROUP_SIZE (stmt_info) > 1)
4930 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4933 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4934 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4935 get_alias_set (DR_REF (sdr))))
4937 need_ref_all = true;
4938 break;
4940 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4942 while (sinfo);
4944 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4945 need_ref_all);
4946 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4949 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4950 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4951 def-use update cycles for the pointer: one relative to the outer-loop
4952 (LOOP), which is what steps (3) and (4) below do. The other is relative
4953 to the inner-loop (which is the inner-most loop containing the dataref),
4954 and this is done by step (5) below.
4956 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4957 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4958 redundant. Steps (3),(4) create the following:
4960 vp0 = &base_addr;
4961 LOOP: vp1 = phi(vp0,vp2)
4964 vp2 = vp1 + step
4965 goto LOOP
4967 If there is an inner-loop nested in loop, then step (5) will also be
4968 applied, and an additional update in the inner-loop will be created:
4970 vp0 = &base_addr;
4971 LOOP: vp1 = phi(vp0,vp2)
4973 inner: vp3 = phi(vp1,vp4)
4974 vp4 = vp3 + inner_step
4975 if () goto inner
4977 vp2 = vp1 + step
4978 if () goto LOOP */
4980 /* (2) Calculate the initial address of the aggregate-pointer, and set
4981 the aggregate-pointer to point to it before the loop. */
4983 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4985 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4986 stmt_info, &new_stmt_list,
4987 offset, byte_offset);
4988 if (new_stmt_list)
4990 if (pe)
4992 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4993 gcc_assert (!new_bb);
4995 else
4996 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4999 *initial_address = new_temp;
5000 aggr_ptr_init = new_temp;
5002 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5003 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5004 inner-loop nested in LOOP (during outer-loop vectorization). */
5006 /* No update in loop is required. */
5007 if (only_init && (!loop_vinfo || at_loop == loop))
5008 aptr = aggr_ptr_init;
5009 else
5011 /* Accesses to invariant addresses should be handled specially
5012 by the caller. */
5013 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5014 gcc_assert (!integer_zerop (step));
5016 if (iv_step == NULL_TREE)
5018 /* The step of the aggregate pointer is the type size,
5019 negated for downward accesses. */
5020 iv_step = TYPE_SIZE_UNIT (aggr_type);
5021 if (tree_int_cst_sgn (step) == -1)
5022 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5025 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5027 create_iv (aggr_ptr_init,
5028 fold_convert (aggr_ptr_type, iv_step),
5029 aggr_ptr, loop, &incr_gsi, insert_after,
5030 &indx_before_incr, &indx_after_incr);
5031 incr = gsi_stmt (incr_gsi);
5033 /* Copy the points-to information if it exists. */
5034 if (DR_PTR_INFO (dr))
5036 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5037 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5039 if (ptr_incr)
5040 *ptr_incr = incr;
5042 aptr = indx_before_incr;
5045 if (!nested_in_vect_loop || only_init)
5046 return aptr;
5049 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5050 nested in LOOP, if it exists. */
5052 gcc_assert (nested_in_vect_loop);
5053 if (!only_init)
5055 standard_iv_increment_position (containing_loop, &incr_gsi,
5056 &insert_after);
5057 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5058 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5059 &indx_after_incr);
5060 incr = gsi_stmt (incr_gsi);
5062 /* Copy the points-to information if it exists. */
5063 if (DR_PTR_INFO (dr))
5065 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5066 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5068 if (ptr_incr)
5069 *ptr_incr = incr;
5071 return indx_before_incr;
5073 else
5074 gcc_unreachable ();
5078 /* Function bump_vector_ptr
5080 Increment a pointer (to a vector type) by vector-size. If requested,
5081 i.e. if PTR-INCR is given, then also connect the new increment stmt
5082 to the existing def-use update-chain of the pointer, by modifying
5083 the PTR_INCR as illustrated below:
5085 The pointer def-use update-chain before this function:
5086 DATAREF_PTR = phi (p_0, p_2)
5087 ....
5088 PTR_INCR: p_2 = DATAREF_PTR + step
5090 The pointer def-use update-chain after this function:
5091 DATAREF_PTR = phi (p_0, p_2)
5092 ....
5093 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5094 ....
5095 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5097 Input:
5098 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5099 in the loop.
5100 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5101 the loop. The increment amount across iterations is expected
5102 to be vector_size.
5103 BSI - location where the new update stmt is to be placed.
5104 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5105 BUMP - optional. The offset by which to bump the pointer. If not given,
5106 the offset is assumed to be vector_size.
5108 Output: Return NEW_DATAREF_PTR as illustrated above.
5112 tree
5113 bump_vector_ptr (vec_info *vinfo,
5114 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5115 stmt_vec_info stmt_info, tree bump)
5117 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5118 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5119 tree update = TYPE_SIZE_UNIT (vectype);
5120 gimple *incr_stmt;
5121 ssa_op_iter iter;
5122 use_operand_p use_p;
5123 tree new_dataref_ptr;
5125 if (bump)
5126 update = bump;
5128 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5129 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5130 else
5131 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5132 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5133 dataref_ptr, update);
5134 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5135 /* Fold the increment, avoiding excessively long use-def chains that
5136 would otherwise cause compile-time issues for passes run before the
5137 next forwprop pass, which would do this folding as well. */
5138 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5139 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5141 incr_stmt = gsi_stmt (fold_gsi);
5142 update_stmt (incr_stmt);
5145 /* Copy the points-to information if it exists. */
5146 if (DR_PTR_INFO (dr))
5148 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5149 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5152 if (!ptr_incr)
5153 return new_dataref_ptr;
5155 /* Update the vector-pointer's cross-iteration increment. */
5156 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5158 tree use = USE_FROM_PTR (use_p);
5160 if (use == dataref_ptr)
5161 SET_USE (use_p, new_dataref_ptr);
5162 else
5163 gcc_assert (operand_equal_p (use, update, 0));
5166 return new_dataref_ptr;
5170 /* Copy memory reference info such as base/clique from the SRC reference
5171 to the DEST MEM_REF. */
5173 void
5174 vect_copy_ref_info (tree dest, tree src)
5176 if (TREE_CODE (dest) != MEM_REF)
5177 return;
5179 tree src_base = src;
5180 while (handled_component_p (src_base))
5181 src_base = TREE_OPERAND (src_base, 0);
5182 if (TREE_CODE (src_base) != MEM_REF
5183 && TREE_CODE (src_base) != TARGET_MEM_REF)
5184 return;
5186 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5187 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5191 /* Function vect_create_destination_var.
5193 Create a new temporary of type VECTYPE. */
5195 tree
5196 vect_create_destination_var (tree scalar_dest, tree vectype)
5198 tree vec_dest;
5199 const char *name;
5200 char *new_name;
5201 tree type;
5202 enum vect_var_kind kind;
5204 kind = vectype
5205 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5206 ? vect_mask_var
5207 : vect_simple_var
5208 : vect_scalar_var;
5209 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5211 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5213 name = get_name (scalar_dest);
5214 if (name)
5215 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5216 else
5217 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5218 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5219 free (new_name);
5221 return vec_dest;
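/* As a naming example (illustrative only): for an SSA scalar destination
   printed as sum_23, get_name returns "sum" and NEW_NAME becomes "sum_23";
   vect_get_new_vect_var then adds its kind-specific prefix when creating
   the temporary.  */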
5224 /* Function vect_grouped_store_supported.
5226 Returns TRUE if interleave high and interleave low permutations
5227 are supported, and FALSE otherwise. */
5229 bool
5230 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5232 machine_mode mode = TYPE_MODE (vectype);
5234 /* vect_permute_store_chain requires the group size to be equal to 3 or
5235 be a power of two. */
5236 if (count != 3 && exact_log2 (count) == -1)
5238 if (dump_enabled_p ())
5239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5240 "the size of the group of accesses"
5241 " is not a power of 2 or not eqaul to 3\n");
5242 return false;
5245 /* Check that the permutation is supported. */
5246 if (VECTOR_MODE_P (mode))
5248 unsigned int i;
5249 if (count == 3)
5251 unsigned int j0 = 0, j1 = 0, j2 = 0;
5252 unsigned int i, j;
5254 unsigned int nelt;
5255 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5257 if (dump_enabled_p ())
5258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5259 "cannot handle groups of 3 stores for"
5260 " variable-length vectors\n");
5261 return false;
5264 vec_perm_builder sel (nelt, nelt, 1);
5265 sel.quick_grow (nelt);
5266 vec_perm_indices indices;
5267 for (j = 0; j < 3; j++)
5269 int nelt0 = ((3 - j) * nelt) % 3;
5270 int nelt1 = ((3 - j) * nelt + 1) % 3;
5271 int nelt2 = ((3 - j) * nelt + 2) % 3;
5272 for (i = 0; i < nelt; i++)
5274 if (3 * i + nelt0 < nelt)
5275 sel[3 * i + nelt0] = j0++;
5276 if (3 * i + nelt1 < nelt)
5277 sel[3 * i + nelt1] = nelt + j1++;
5278 if (3 * i + nelt2 < nelt)
5279 sel[3 * i + nelt2] = 0;
5281 indices.new_vector (sel, 2, nelt);
5282 if (!can_vec_perm_const_p (mode, indices))
5284 if (dump_enabled_p ())
5285 dump_printf (MSG_MISSED_OPTIMIZATION,
5286 "permutation op not supported by target.\n");
5287 return false;
5290 for (i = 0; i < nelt; i++)
5292 if (3 * i + nelt0 < nelt)
5293 sel[3 * i + nelt0] = 3 * i + nelt0;
5294 if (3 * i + nelt1 < nelt)
5295 sel[3 * i + nelt1] = 3 * i + nelt1;
5296 if (3 * i + nelt2 < nelt)
5297 sel[3 * i + nelt2] = nelt + j2++;
5299 indices.new_vector (sel, 2, nelt);
5300 if (!can_vec_perm_const_p (mode, indices))
5302 if (dump_enabled_p ())
5303 dump_printf (MSG_MISSED_OPTIMIZATION,
5304 "permutation op not supported by target.\n");
5305 return false;
5308 return true;
5310 else
5312 /* If length is not equal to 3 then only a power of 2 is supported. */
5313 gcc_assert (pow2p_hwi (count));
5314 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5316 /* The encoding has 2 interleaved stepped patterns. */
5317 vec_perm_builder sel (nelt, 2, 3);
5318 sel.quick_grow (6);
5319 for (i = 0; i < 3; i++)
5321 sel[i * 2] = i;
5322 sel[i * 2 + 1] = i + nelt;
5324 vec_perm_indices indices (sel, 2, nelt);
5325 if (can_vec_perm_const_p (mode, indices))
5327 for (i = 0; i < 6; i++)
5328 sel[i] += exact_div (nelt, 2);
5329 indices.new_vector (sel, 2, nelt);
5330 if (can_vec_perm_const_p (mode, indices))
5331 return true;
5336 if (dump_enabled_p ())
5337 dump_printf (MSG_MISSED_OPTIMIZATION,
5338 "permutation op not supported by target.\n");
5339 return false;
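/* As a concrete instance of the power-of-two check above, with nelt == 8
   the first encoding stands for the interleave-high mask
   { 0, 8, 1, 9, 2, 10, 3, 11 } and, after adding nelt / 2 to each element,
   for the interleave-low mask { 4, 12, 5, 13, 6, 14, 7, 15 }; these are the
   masks that vect_permute_store_chain will later generate.  */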
5343 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5344 type VECTYPE. MASKED_P says whether the masked form is needed. */
5346 bool
5347 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5348 bool masked_p)
5350 if (masked_p)
5351 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5352 vec_mask_store_lanes_optab,
5353 vectype, count);
5354 else
5355 return vect_lanes_optab_supported_p ("vec_store_lanes",
5356 vec_store_lanes_optab,
5357 vectype, count);
5361 /* Function vect_permute_store_chain.
5363 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5364 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5365 the data correctly for the stores. Return the final references for stores
5366 in RESULT_CHAIN.
5368 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5369 The input is 4 vectors each containing 8 elements. We assign a number to
5370 each element; the input sequence is:
5372 1st vec: 0 1 2 3 4 5 6 7
5373 2nd vec: 8 9 10 11 12 13 14 15
5374 3rd vec: 16 17 18 19 20 21 22 23
5375 4th vec: 24 25 26 27 28 29 30 31
5377 The output sequence should be:
5379 1st vec: 0 8 16 24 1 9 17 25
5380 2nd vec: 2 10 18 26 3 11 19 27
5381 3rd vec: 4 12 20 28 5 13 21 29
5382 4th vec: 6 14 22 30 7 15 23 31
5384 i.e., we interleave the contents of the four vectors in their order.
5386 We use interleave_high/low instructions to create such output. The input of
5387 each interleave_high/low operation is two vectors:
5388 1st vec 2nd vec
5389 0 1 2 3 4 5 6 7
5390 the even elements of the result vector are obtained left-to-right from the
5391 high/low elements of the first vector. The odd elements of the result are
5392 obtained left-to-right from the high/low elements of the second vector.
5393 The output of interleave_high will be: 0 4 1 5
5394 and of interleave_low: 2 6 3 7
5397 The permutation is done in log LENGTH stages. In each stage interleave_high
5398 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5399 where the first argument is taken from the first half of DR_CHAIN and the
5400 second argument from its second half.
5401 In our example,
5403 I1: interleave_high (1st vec, 3rd vec)
5404 I2: interleave_low (1st vec, 3rd vec)
5405 I3: interleave_high (2nd vec, 4th vec)
5406 I4: interleave_low (2nd vec, 4th vec)
5408 The output for the first stage is:
5410 I1: 0 16 1 17 2 18 3 19
5411 I2: 4 20 5 21 6 22 7 23
5412 I3: 8 24 9 25 10 26 11 27
5413 I4: 12 28 13 29 14 30 15 31
5415 The output of the second stage, i.e. the final result is:
5417 I1: 0 8 16 24 1 9 17 25
5418 I2: 2 10 18 26 3 11 19 27
5419 I3: 4 12 20 28 5 13 21 29
5420 I4: 6 14 22 30 7 15 23 31. */
5422 void
5423 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5424 unsigned int length,
5425 stmt_vec_info stmt_info,
5426 gimple_stmt_iterator *gsi,
5427 vec<tree> *result_chain)
5429 tree vect1, vect2, high, low;
5430 gimple *perm_stmt;
5431 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5432 tree perm_mask_low, perm_mask_high;
5433 tree data_ref;
5434 tree perm3_mask_low, perm3_mask_high;
5435 unsigned int i, j, n, log_length = exact_log2 (length);
5437 result_chain->quick_grow (length);
5438 memcpy (result_chain->address (), dr_chain.address (),
5439 length * sizeof (tree));
5441 if (length == 3)
5443 /* vect_grouped_store_supported ensures that this is constant. */
5444 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5445 unsigned int j0 = 0, j1 = 0, j2 = 0;
5447 vec_perm_builder sel (nelt, nelt, 1);
5448 sel.quick_grow (nelt);
5449 vec_perm_indices indices;
5450 for (j = 0; j < 3; j++)
5452 int nelt0 = ((3 - j) * nelt) % 3;
5453 int nelt1 = ((3 - j) * nelt + 1) % 3;
5454 int nelt2 = ((3 - j) * nelt + 2) % 3;
5456 for (i = 0; i < nelt; i++)
5458 if (3 * i + nelt0 < nelt)
5459 sel[3 * i + nelt0] = j0++;
5460 if (3 * i + nelt1 < nelt)
5461 sel[3 * i + nelt1] = nelt + j1++;
5462 if (3 * i + nelt2 < nelt)
5463 sel[3 * i + nelt2] = 0;
5465 indices.new_vector (sel, 2, nelt);
5466 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5468 for (i = 0; i < nelt; i++)
5470 if (3 * i + nelt0 < nelt)
5471 sel[3 * i + nelt0] = 3 * i + nelt0;
5472 if (3 * i + nelt1 < nelt)
5473 sel[3 * i + nelt1] = 3 * i + nelt1;
5474 if (3 * i + nelt2 < nelt)
5475 sel[3 * i + nelt2] = nelt + j2++;
5477 indices.new_vector (sel, 2, nelt);
5478 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5480 vect1 = dr_chain[0];
5481 vect2 = dr_chain[1];
5483 /* Create interleaving stmt:
5484 low = VEC_PERM_EXPR <vect1, vect2,
5485 {j, nelt, *, j + 1, nelt + j + 1, *,
5486 j + 2, nelt + j + 2, *, ...}> */
5487 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5488 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5489 vect2, perm3_mask_low);
5490 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5492 vect1 = data_ref;
5493 vect2 = dr_chain[2];
5494 /* Create interleaving stmt:
5495 low = VEC_PERM_EXPR <vect1, vect2,
5496 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5497 6, 7, nelt + j + 2, ...}> */
5498 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5499 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5500 vect2, perm3_mask_high);
5501 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5502 (*result_chain)[j] = data_ref;
5505 else
5507 /* If length is not equal to 3 then only a power of 2 is supported. */
5508 gcc_assert (pow2p_hwi (length));
5510 /* The encoding has 2 interleaved stepped patterns. */
5511 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5512 vec_perm_builder sel (nelt, 2, 3);
5513 sel.quick_grow (6);
5514 for (i = 0; i < 3; i++)
5516 sel[i * 2] = i;
5517 sel[i * 2 + 1] = i + nelt;
5519 vec_perm_indices indices (sel, 2, nelt);
5520 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5522 for (i = 0; i < 6; i++)
5523 sel[i] += exact_div (nelt, 2);
5524 indices.new_vector (sel, 2, nelt);
5525 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5527 for (i = 0, n = log_length; i < n; i++)
5529 for (j = 0; j < length/2; j++)
5531 vect1 = dr_chain[j];
5532 vect2 = dr_chain[j+length/2];
5534 /* Create interleaving stmt:
5535 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5536 ...}> */
5537 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5538 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5539 vect2, perm_mask_high);
5540 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5541 (*result_chain)[2*j] = high;
5543 /* Create interleaving stmt:
5544 low = VEC_PERM_EXPR <vect1, vect2,
5545 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5546 ...}> */
5547 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5548 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5549 vect2, perm_mask_low);
5550 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5551 (*result_chain)[2*j+1] = low;
5553 memcpy (dr_chain.address (), result_chain->address (),
5554 length * sizeof (tree));
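/* The following self-contained sketch (plain C, not vectorizer code) mirrors
   the LENGTH == 3 index construction above so the two masks can be printed
   for a given NELT; the NELT == 4 argument in main is only an example, and
   indices stored as 0 correspond to the don't-care '*' slots in the
   comments above.  */

#include <stdio.h>

static void
print_store3_masks (unsigned int nelt)
{
  unsigned int sel[64];
  unsigned int j0 = 0, j1 = 0, j2 = 0;

  for (unsigned int j = 0; j < 3; j++)
    {
      unsigned int nelt0 = ((3 - j) * nelt) % 3;
      unsigned int nelt1 = ((3 - j) * nelt + 1) % 3;
      unsigned int nelt2 = ((3 - j) * nelt + 2) % 3;
      unsigned int i;

      /* Same construction as perm3_mask_low above.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * i + nelt0 < nelt)
	    sel[3 * i + nelt0] = j0++;
	  if (3 * i + nelt1 < nelt)
	    sel[3 * i + nelt1] = nelt + j1++;
	  if (3 * i + nelt2 < nelt)
	    sel[3 * i + nelt2] = 0;
	}
      printf ("j=%u low: ", j);
      for (i = 0; i < nelt; i++)
	printf (" %u", sel[i]);

      /* Same construction as perm3_mask_high above.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * i + nelt0 < nelt)
	    sel[3 * i + nelt0] = 3 * i + nelt0;
	  if (3 * i + nelt1 < nelt)
	    sel[3 * i + nelt1] = 3 * i + nelt1;
	  if (3 * i + nelt2 < nelt)
	    sel[3 * i + nelt2] = nelt + j2++;
	}
      printf ("  high:");
      for (i = 0; i < nelt; i++)
	printf (" %u", sel[i]);
      printf ("\n");
    }
}

int
main (void)
{
  print_store3_masks (4);
  return 0;
}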
5559 /* Function vect_setup_realignment
5561 This function is called when vectorizing an unaligned load using
5562 the dr_explicit_realign[_optimized] scheme.
5563 This function generates the following code at the loop prolog:
5565 p = initial_addr;
5566 x msq_init = *(floor(p)); # prolog load
5567 realignment_token = call target_builtin;
5568 loop:
5569 x msq = phi (msq_init, ---)
5571 The stmts marked with x are generated only for the case of
5572 dr_explicit_realign_optimized.
5574 The code above sets up a new (vector) pointer, pointing to the first
5575 location accessed by STMT_INFO, and a "floor-aligned" load using that
5576 pointer. It also generates code to compute the "realignment-token"
5577 (if the relevant target hook was defined), and creates a phi-node at the
5578 loop-header bb whose arguments are the result of the prolog-load (created
5579 by this function) and the result of a load that takes place in the loop
5580 (to be created by the caller to this function).
5582 For the case of dr_explicit_realign_optimized:
5583 The caller to this function uses the phi-result (msq) to create the
5584 realignment code inside the loop, and sets up the missing phi argument,
5585 as follows:
5586 loop:
5587 msq = phi (msq_init, lsq)
5588 lsq = *(floor(p')); # load in loop
5589 result = realign_load (msq, lsq, realignment_token);
5591 For the case of dr_explicit_realign:
5592 loop:
5593 msq = *(floor(p)); # load in loop
5594 p' = p + (VS-1);
5595 lsq = *(floor(p')); # load in loop
5596 result = realign_load (msq, lsq, realignment_token);
5598 Input:
5599 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5600 a memory location that may be unaligned.
5601 GSI - place where new code is to be inserted.
5602 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5603 is used.
5605 Output:
5606 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5607 target hook, if defined.
5608 Return value - the result of the loop-header phi node. */
5610 tree
5611 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5612 gimple_stmt_iterator *gsi, tree *realignment_token,
5613 enum dr_alignment_support alignment_support_scheme,
5614 tree init_addr,
5615 class loop **at_loop)
5617 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5618 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5619 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5620 struct data_reference *dr = dr_info->dr;
5621 class loop *loop = NULL;
5622 edge pe = NULL;
5623 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5624 tree vec_dest;
5625 gimple *inc;
5626 tree ptr;
5627 tree data_ref;
5628 basic_block new_bb;
5629 tree msq_init = NULL_TREE;
5630 tree new_temp;
5631 gphi *phi_stmt;
5632 tree msq = NULL_TREE;
5633 gimple_seq stmts = NULL;
5634 bool compute_in_loop = false;
5635 bool nested_in_vect_loop = false;
5636 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5637 class loop *loop_for_initial_load = NULL;
5639 if (loop_vinfo)
5641 loop = LOOP_VINFO_LOOP (loop_vinfo);
5642 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5645 gcc_assert (alignment_support_scheme == dr_explicit_realign
5646 || alignment_support_scheme == dr_explicit_realign_optimized);
5648 /* We need to generate three things:
5649 1. the misalignment computation
5650 2. the extra vector load (for the optimized realignment scheme).
5651 3. the phi node for the two vectors from which the realignment is
5652 done (for the optimized realignment scheme). */
5654 /* 1. Determine where to generate the misalignment computation.
5656 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5657 calculation will be generated by this function, outside the loop (in the
5658 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5659 caller, inside the loop.
5661 Background: If the misalignment remains fixed throughout the iterations of
5662 the loop, then both realignment schemes are applicable, and also the
5663 misalignment computation can be done outside LOOP. This is because we are
5664 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5665 are a multiple of VS (the Vector Size), and therefore the misalignment in
5666 different vectorized LOOP iterations is always the same.
5667 The problem arises only if the memory access is in an inner-loop nested
5668 inside LOOP, which is now being vectorized using outer-loop vectorization.
5669 This is the only case when the misalignment of the memory access may not
5670 remain fixed throughout the iterations of the inner-loop (as explained in
5671 detail in vect_supportable_dr_alignment). In this case, not only is the
5672 optimized realignment scheme not applicable, but also the misalignment
5673 computation (and generation of the realignment token that is passed to
5674 REALIGN_LOAD) have to be done inside the loop.
5676 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5677 or not, which in turn determines if the misalignment is computed inside
5678 the inner-loop, or outside LOOP. */
5680 if (init_addr != NULL_TREE || !loop_vinfo)
5682 compute_in_loop = true;
5683 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5687 /* 2. Determine where to generate the extra vector load.
5689 For the optimized realignment scheme, instead of generating two vector
5690 loads in each iteration, we generate a single extra vector load in the
5691 preheader of the loop, and in each iteration reuse the result of the
5692 vector load from the previous iteration. In case the memory access is in
5693 an inner-loop nested inside LOOP, which is now being vectorized using
5694 outer-loop vectorization, we need to determine whether this initial vector
5695 load should be generated at the preheader of the inner-loop, or can be
5696 generated at the preheader of LOOP. If the memory access has no evolution
5697 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5698 to be generated inside LOOP (in the preheader of the inner-loop). */
5700 if (nested_in_vect_loop)
5702 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5703 bool invariant_in_outerloop =
5704 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5705 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5707 else
5708 loop_for_initial_load = loop;
5709 if (at_loop)
5710 *at_loop = loop_for_initial_load;
5712 if (loop_for_initial_load)
5713 pe = loop_preheader_edge (loop_for_initial_load);
5715 /* 3. For the case of the optimized realignment, create the first vector
5716 load at the loop preheader. */
5718 if (alignment_support_scheme == dr_explicit_realign_optimized)
5720 /* Create msq_init = *(floor(p1)) in the loop preheader */
5721 gassign *new_stmt;
5723 gcc_assert (!compute_in_loop);
5724 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5725 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5726 loop_for_initial_load, NULL_TREE,
5727 &init_addr, NULL, &inc, true);
5728 if (TREE_CODE (ptr) == SSA_NAME)
5729 new_temp = copy_ssa_name (ptr);
5730 else
5731 new_temp = make_ssa_name (TREE_TYPE (ptr));
5732 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5733 tree type = TREE_TYPE (ptr);
5734 new_stmt = gimple_build_assign
5735 (new_temp, BIT_AND_EXPR, ptr,
5736 fold_build2 (MINUS_EXPR, type,
5737 build_int_cst (type, 0),
5738 build_int_cst (type, align)));
5739 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5740 gcc_assert (!new_bb);
5741 data_ref
5742 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5743 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5744 vect_copy_ref_info (data_ref, DR_REF (dr));
5745 new_stmt = gimple_build_assign (vec_dest, data_ref);
5746 new_temp = make_ssa_name (vec_dest, new_stmt);
5747 gimple_assign_set_lhs (new_stmt, new_temp);
5748 if (pe)
5750 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5751 gcc_assert (!new_bb);
5753 else
5754 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5756 msq_init = gimple_assign_lhs (new_stmt);
5759 /* 4. Create realignment token using a target builtin, if available.
5760 It is done either inside the containing loop, or before LOOP (as
5761 determined above). */
5763 if (targetm.vectorize.builtin_mask_for_load)
5765 gcall *new_stmt;
5766 tree builtin_decl;
5768 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5769 if (!init_addr)
5771 /* Generate the INIT_ADDR computation outside LOOP. */
5772 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5773 stmt_info, &stmts,
5774 NULL_TREE);
5775 if (loop)
5777 pe = loop_preheader_edge (loop);
5778 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5779 gcc_assert (!new_bb);
5781 else
5782 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5785 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5786 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5787 vec_dest =
5788 vect_create_destination_var (scalar_dest,
5789 gimple_call_return_type (new_stmt));
5790 new_temp = make_ssa_name (vec_dest, new_stmt);
5791 gimple_call_set_lhs (new_stmt, new_temp);
5793 if (compute_in_loop)
5794 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5795 else
5797 /* Generate the misalignment computation outside LOOP. */
5798 pe = loop_preheader_edge (loop);
5799 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5800 gcc_assert (!new_bb);
5803 *realignment_token = gimple_call_lhs (new_stmt);
5805 /* The result of the CALL_EXPR to this builtin is determined from
5806 the value of the parameter and no global variables are touched,
5807 which makes the builtin a "const" function. Requiring the
5808 builtin to have the "const" attribute makes it unnecessary
5809 to call mark_call_clobbered. */
5810 gcc_assert (TREE_READONLY (builtin_decl));
5813 if (alignment_support_scheme == dr_explicit_realign)
5814 return msq;
5816 gcc_assert (!compute_in_loop);
5817 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5820 /* 5. Create msq = phi <msq_init, lsq> in loop */
5822 pe = loop_preheader_edge (containing_loop);
5823 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5824 msq = make_ssa_name (vec_dest);
5825 phi_stmt = create_phi_node (msq, containing_loop->header);
5826 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5828 return msq;
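/* For instance, with DR_TARGET_ALIGNMENT == 16 the prolog emitted for the
   dr_explicit_realign_optimized case amounts to (a sketch; actual names
   and types will differ):

     floor_p = p & -16;      # BIT_AND_EXPR with (0 - align)
     msq_init = *floor_p;    # the "floor-aligned" prolog load

   with msq_init becoming the preheader argument of the phi created in
   step 5.  */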
5832 /* Function vect_grouped_load_supported.
5834 COUNT is the size of the load group (the number of statements plus the
5835 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5836 only one statement, with a gap of COUNT - 1.
5838 Returns true if a suitable permute exists. */
5840 bool
5841 vect_grouped_load_supported (tree vectype, bool single_element_p,
5842 unsigned HOST_WIDE_INT count)
5844 machine_mode mode = TYPE_MODE (vectype);
5846 /* If this is single-element interleaving with an element distance
5847 that leaves unused vector loads around, punt - we at least create
5848 very sub-optimal code in that case (and blow up memory,
5849 see PR65518). */
5850 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5852 if (dump_enabled_p ())
5853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5854 "single-element interleaving not supported "
5855 "for not adjacent vector loads\n");
5856 return false;
5859 /* vect_permute_load_chain requires the group size to be equal to 3 or
5860 be a power of two. */
5861 if (count != 3 && exact_log2 (count) == -1)
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5865 "the size of the group of accesses"
5866 " is not a power of 2 or not equal to 3\n");
5867 return false;
5870 /* Check that the permutation is supported. */
5871 if (VECTOR_MODE_P (mode))
5873 unsigned int i, j;
5874 if (count == 3)
5876 unsigned int nelt;
5877 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5879 if (dump_enabled_p ())
5880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5881 "cannot handle groups of 3 loads for"
5882 " variable-length vectors\n");
5883 return false;
5886 vec_perm_builder sel (nelt, nelt, 1);
5887 sel.quick_grow (nelt);
5888 vec_perm_indices indices;
5889 unsigned int k;
5890 for (k = 0; k < 3; k++)
5892 for (i = 0; i < nelt; i++)
5893 if (3 * i + k < 2 * nelt)
5894 sel[i] = 3 * i + k;
5895 else
5896 sel[i] = 0;
5897 indices.new_vector (sel, 2, nelt);
5898 if (!can_vec_perm_const_p (mode, indices))
5900 if (dump_enabled_p ())
5901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5902 "shuffle of 3 loads is not supported by"
5903 " target\n");
5904 return false;
5906 for (i = 0, j = 0; i < nelt; i++)
5907 if (3 * i + k < 2 * nelt)
5908 sel[i] = i;
5909 else
5910 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5911 indices.new_vector (sel, 2, nelt);
5912 if (!can_vec_perm_const_p (mode, indices))
5914 if (dump_enabled_p ())
5915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5916 "shuffle of 3 loads is not supported by"
5917 " target\n");
5918 return false;
5921 return true;
5923 else
5925 /* If length is not equal to 3 then only a power of 2 is supported. */
5926 gcc_assert (pow2p_hwi (count));
5927 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5929 /* The encoding has a single stepped pattern. */
5930 vec_perm_builder sel (nelt, 1, 3);
5931 sel.quick_grow (3);
5932 for (i = 0; i < 3; i++)
5933 sel[i] = i * 2;
5934 vec_perm_indices indices (sel, 2, nelt);
5935 if (can_vec_perm_const_p (mode, indices))
5937 for (i = 0; i < 3; i++)
5938 sel[i] = i * 2 + 1;
5939 indices.new_vector (sel, 2, nelt);
5940 if (can_vec_perm_const_p (mode, indices))
5941 return true;
5946 if (dump_enabled_p ())
5947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5948 "extract even/odd not supported by target\n");
5949 return false;
5952 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5953 type VECTYPE. MASKED_P says whether the masked form is needed. */
5955 bool
5956 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5957 bool masked_p)
5959 if (masked_p)
5960 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5961 vec_mask_load_lanes_optab,
5962 vectype, count);
5963 else
5964 return vect_lanes_optab_supported_p ("vec_load_lanes",
5965 vec_load_lanes_optab,
5966 vectype, count);
5969 /* Function vect_permute_load_chain.
5971 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5972 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5973 the input data correctly. Return the final references for loads in
5974 RESULT_CHAIN.
5976 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5977 The input is 4 vectors each containing 8 elements. We assign a number to each
5978 element; the input sequence is:
5980 1st vec: 0 1 2 3 4 5 6 7
5981 2nd vec: 8 9 10 11 12 13 14 15
5982 3rd vec: 16 17 18 19 20 21 22 23
5983 4th vec: 24 25 26 27 28 29 30 31
5985 The output sequence should be:
5987 1st vec: 0 4 8 12 16 20 24 28
5988 2nd vec: 1 5 9 13 17 21 25 29
5989 3rd vec: 2 6 10 14 18 22 26 30
5990 4th vec: 3 7 11 15 19 23 27 31
5992 i.e., the first output vector should contain the first elements of each
5993 interleaving group, etc.
5995 We use extract_even/odd instructions to create such output. The input of
5996 each extract_even/odd operation is two vectors
5997 1st vec 2nd vec
5998 0 1 2 3 4 5 6 7
6000 and the output is the vector of extracted even/odd elements. The output of
6001 extract_even will be: 0 2 4 6
6002 and of extract_odd: 1 3 5 7
6005 The permutation is done in log LENGTH stages. In each stage extract_even
6006 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6007 their order. In our example,
6009 E1: extract_even (1st vec, 2nd vec)
6010 E2: extract_odd (1st vec, 2nd vec)
6011 E3: extract_even (3rd vec, 4th vec)
6012 E4: extract_odd (3rd vec, 4th vec)
6014 The output for the first stage will be:
6016 E1: 0 2 4 6 8 10 12 14
6017 E2: 1 3 5 7 9 11 13 15
6018 E3: 16 18 20 22 24 26 28 30
6019 E4: 17 19 21 23 25 27 29 31
6021 In order to proceed and create the correct sequence for the next stage (or
6022 for the correct output, if the second stage is the last one, as in our
6023 example), we first put the output of extract_even operation and then the
6024 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6025 The input for the second stage is:
6027 1st vec (E1): 0 2 4 6 8 10 12 14
6028 2nd vec (E3): 16 18 20 22 24 26 28 30
6029 3rd vec (E2): 1 3 5 7 9 11 13 15
6030 4th vec (E4): 17 19 21 23 25 27 29 31
6032 The output of the second stage:
6034 E1: 0 4 8 12 16 20 24 28
6035 E2: 2 6 10 14 18 22 26 30
6036 E3: 1 5 9 13 17 21 25 29
6037 E4: 3 7 11 15 19 23 27 31
6039 And RESULT_CHAIN after reordering:
6041 1st vec (E1): 0 4 8 12 16 20 24 28
6042 2nd vec (E3): 1 5 9 13 17 21 25 29
6043 3rd vec (E2): 2 6 10 14 18 22 26 30
6044 4th vec (E4): 3 7 11 15 19 23 27 31. */
6046 static void
6047 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6048 unsigned int length,
6049 stmt_vec_info stmt_info,
6050 gimple_stmt_iterator *gsi,
6051 vec<tree> *result_chain)
6053 tree data_ref, first_vect, second_vect;
6054 tree perm_mask_even, perm_mask_odd;
6055 tree perm3_mask_low, perm3_mask_high;
6056 gimple *perm_stmt;
6057 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6058 unsigned int i, j, log_length = exact_log2 (length);
6060 result_chain->quick_grow (length);
6061 memcpy (result_chain->address (), dr_chain.address (),
6062 length * sizeof (tree));
6064 if (length == 3)
6066 /* vect_grouped_load_supported ensures that this is constant. */
6067 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6068 unsigned int k;
6070 vec_perm_builder sel (nelt, nelt, 1);
6071 sel.quick_grow (nelt);
6072 vec_perm_indices indices;
6073 for (k = 0; k < 3; k++)
6075 for (i = 0; i < nelt; i++)
6076 if (3 * i + k < 2 * nelt)
6077 sel[i] = 3 * i + k;
6078 else
6079 sel[i] = 0;
6080 indices.new_vector (sel, 2, nelt);
6081 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6083 for (i = 0, j = 0; i < nelt; i++)
6084 if (3 * i + k < 2 * nelt)
6085 sel[i] = i;
6086 else
6087 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6088 indices.new_vector (sel, 2, nelt);
6089 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6091 first_vect = dr_chain[0];
6092 second_vect = dr_chain[1];
6094 /* Create interleaving stmt (low part of):
6095 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6096 ...}> */
6097 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6098 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6099 second_vect, perm3_mask_low);
6100 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6102 /* Create interleaving stmt (high part of):
6103 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6104 ...}> */
6105 first_vect = data_ref;
6106 second_vect = dr_chain[2];
6107 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6108 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6109 second_vect, perm3_mask_high);
6110 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6111 (*result_chain)[k] = data_ref;
6114 else
6116 /* If length is not equal to 3 then only a power of 2 is supported. */
6117 gcc_assert (pow2p_hwi (length));
6119 /* The encoding has a single stepped pattern. */
6120 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6121 vec_perm_builder sel (nelt, 1, 3);
6122 sel.quick_grow (3);
6123 for (i = 0; i < 3; ++i)
6124 sel[i] = i * 2;
6125 vec_perm_indices indices (sel, 2, nelt);
6126 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6128 for (i = 0; i < 3; ++i)
6129 sel[i] = i * 2 + 1;
6130 indices.new_vector (sel, 2, nelt);
6131 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6133 for (i = 0; i < log_length; i++)
6135 for (j = 0; j < length; j += 2)
6137 first_vect = dr_chain[j];
6138 second_vect = dr_chain[j+1];
6140 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6141 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6142 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6143 first_vect, second_vect,
6144 perm_mask_even);
6145 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6146 (*result_chain)[j/2] = data_ref;
6148 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6149 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6150 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6151 first_vect, second_vect,
6152 perm_mask_odd);
6153 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6154 (*result_chain)[j/2+length/2] = data_ref;
6156 memcpy (dr_chain.address (), result_chain->address (),
6157 length * sizeof (tree));
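/* A self-contained sketch (plain C, not vectorizer code) of the LENGTH == 3
   shuffle-index construction above, handy for printing the masks for a
   given NELT; NELT == 4 in main is only an example.  */

#include <stdio.h>

static void
print_load3_masks (unsigned int nelt)
{
  unsigned int sel[64];
  unsigned int i, k;

  for (k = 0; k < 3; k++)
    {
      unsigned int j = 0;

      /* Same construction as perm3_mask_low above: pick elements K, K + 3,
	 K + 6, ... from the first two input vectors.  */
      for (i = 0; i < nelt; i++)
	sel[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;
      printf ("k=%u low: ", k);
      for (i = 0; i < nelt; i++)
	printf (" %u", sel[i]);

      /* Same construction as perm3_mask_high above: keep the head of the
	 previous result and complete the tail from the third vector.  */
      for (i = 0; i < nelt; i++)
	sel[i] = (3 * i + k < 2 * nelt
		  ? i : nelt + ((nelt + k) % 3) + 3 * (j++));
      printf ("  high:");
      for (i = 0; i < nelt; i++)
	printf (" %u", sel[i]);
      printf ("\n");
    }
}

int
main (void)
{
  print_load3_masks (4);
  return 0;
}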
6162 /* Function vect_shift_permute_load_chain.
6164 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6165 sequence of stmts to reorder the input data accordingly.
6166 Return the final references for loads in RESULT_CHAIN.
6167 Return true if successful, false otherwise.
6169 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6170 The input is 3 vectors each containing 8 elements. We assign a
6171 number to each element, the input sequence is:
6173 1st vec: 0 1 2 3 4 5 6 7
6174 2nd vec: 8 9 10 11 12 13 14 15
6175 3rd vec: 16 17 18 19 20 21 22 23
6177 The output sequence should be:
6179 1st vec: 0 3 6 9 12 15 18 21
6180 2nd vec: 1 4 7 10 13 16 19 22
6181 3rd vec: 2 5 8 11 14 17 20 23
6183 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6185 First we shuffle all 3 vectors to get correct elements order:
6187 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6188 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6189 3rd vec: (16 19 22) (17 20 23) (18 21)
6191 Next we unite and shift vector 3 times:
6193 1st step:
6194 shift right by 6 the concatenation of:
6195 "1st vec" and "2nd vec"
6196 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6197 "2nd vec" and "3rd vec"
6198 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6199 "3rd vec" and "1st vec"
6200 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6201 | New vectors |
6203 So that now new vectors are:
6205 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6206 2nd vec: (10 13) (16 19 22) (17 20 23)
6207 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6209 2nd step:
6210 shift right by 5 the concatenation of:
6211 "1st vec" and "3rd vec"
6212 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6213 "2nd vec" and "1st vec"
6214 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6215 "3rd vec" and "2nd vec"
6216 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6217 | New vectors |
6219 So that now new vectors are:
6221 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6222 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6223 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6225 3rd step:
6226 shift right by 5 the concatenation of:
6227 "1st vec" and "1st vec"
6228 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6229 shift right by 3 the concatenation of:
6230 "2nd vec" and "2nd vec"
6231 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6232 | New vectors |
6234 So that now all vectors are READY:
6235 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6236 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6237 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6239 This algorithm is faster than the one in vect_permute_load_chain if:
6240 1. "shift of a concatenation" is faster than general permutation.
6241 This is usually so.
6242 2. The TARGET machine can't execute vector instructions in parallel.
6243 This is because each step of the algorithm depends on the previous one.
6244 The algorithm in vect_permute_load_chain is much more parallel.
6246 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6249 static bool
6250 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6251 unsigned int length,
6252 stmt_vec_info stmt_info,
6253 gimple_stmt_iterator *gsi,
6254 vec<tree> *result_chain)
6256 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6257 tree perm2_mask1, perm2_mask2, perm3_mask;
6258 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6259 gimple *perm_stmt;
6261 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6262 unsigned int i;
6263 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6265 unsigned HOST_WIDE_INT nelt, vf;
6266 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6267 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6268 /* Not supported for variable-length vectors. */
6269 return false;
6271 vec_perm_builder sel (nelt, nelt, 1);
6272 sel.quick_grow (nelt);
6274 result_chain->quick_grow (length);
6275 memcpy (result_chain->address (), dr_chain.address (),
6276 length * sizeof (tree));
6278 if (pow2p_hwi (length) && vf > 4)
6280 unsigned int j, log_length = exact_log2 (length);
6281 for (i = 0; i < nelt / 2; ++i)
6282 sel[i] = i * 2;
6283 for (i = 0; i < nelt / 2; ++i)
6284 sel[nelt / 2 + i] = i * 2 + 1;
6285 vec_perm_indices indices (sel, 2, nelt);
6286 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6288 if (dump_enabled_p ())
6289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6290 "shuffle of 2 fields structure is not \
6291 supported by target\n");
6292 return false;
6294 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6296 for (i = 0; i < nelt / 2; ++i)
6297 sel[i] = i * 2 + 1;
6298 for (i = 0; i < nelt / 2; ++i)
6299 sel[nelt / 2 + i] = i * 2;
6300 indices.new_vector (sel, 2, nelt);
6301 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6303 if (dump_enabled_p ())
6304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6305 "shuffle of 2 fields structure is not \
6306 supported by target\n");
6307 return false;
6309 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6311 /* Generating permutation constant to shift all elements.
6312 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6313 for (i = 0; i < nelt; i++)
6314 sel[i] = nelt / 2 + i;
6315 indices.new_vector (sel, 2, nelt);
6316 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6318 if (dump_enabled_p ())
6319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6320 "shift permutation is not supported by target\n");
6321 return false;
6323 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6325 /* Generating permutation constant to select the result vector from
6326 the 2 inputs. For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6327 for (i = 0; i < nelt / 2; i++)
6328 sel[i] = i;
6329 for (i = nelt / 2; i < nelt; i++)
6330 sel[i] = nelt + i;
6331 indices.new_vector (sel, 2, nelt);
6332 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6334 if (dump_enabled_p ())
6335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6336 "select is not supported by target\n");
6337 return false;
6339 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6341 for (i = 0; i < log_length; i++)
6343 for (j = 0; j < length; j += 2)
6345 first_vect = dr_chain[j];
6346 second_vect = dr_chain[j + 1];
6348 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6349 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6350 first_vect, first_vect,
6351 perm2_mask1);
6352 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6353 vect[0] = data_ref;
6355 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6356 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6357 second_vect, second_vect,
6358 perm2_mask2);
6359 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6360 vect[1] = data_ref;
6362 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6363 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6364 vect[0], vect[1], shift1_mask);
6365 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6366 (*result_chain)[j/2 + length/2] = data_ref;
6368 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6369 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6370 vect[0], vect[1], select_mask);
6371 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6372 (*result_chain)[j/2] = data_ref;
6374 memcpy (dr_chain.address (), result_chain->address (),
6375 length * sizeof (tree));
6377 return true;
6379 if (length == 3 && vf > 2)
6381 unsigned int k = 0, l = 0;
6383 /* Generating permutation constant to get all elements in right order.
6384 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6385 for (i = 0; i < nelt; i++)
6387 if (3 * k + (l % 3) >= nelt)
6389 k = 0;
6390 l += (3 - (nelt % 3));
6392 sel[i] = 3 * k + (l % 3);
6393 k++;
6395 vec_perm_indices indices (sel, 2, nelt);
6396 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6398 if (dump_enabled_p ())
6399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6400 "shuffle of 3 fields structure is not \
6401 supported by target\n");
6402 return false;
6404 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6406 /* Generating permutation constant to shift all elements.
6407 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6408 for (i = 0; i < nelt; i++)
6409 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6410 indices.new_vector (sel, 2, nelt);
6411 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6413 if (dump_enabled_p ())
6414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6415 "shift permutation is not supported by target\n");
6416 return false;
6418 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6420 /* Generating permutation constant to shift all elements.
6421 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6422 for (i = 0; i < nelt; i++)
6423 sel[i] = 2 * (nelt / 3) + 1 + i;
6424 indices.new_vector (sel, 2, nelt);
6425 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6427 if (dump_enabled_p ())
6428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6429 "shift permutation is not supported by target\n");
6430 return false;
6432 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6434 /* Generating permutation constant to shift all elements.
6435 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6436 for (i = 0; i < nelt; i++)
6437 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6438 indices.new_vector (sel, 2, nelt);
6439 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6441 if (dump_enabled_p ())
6442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6443 "shift permutation is not supported by target\n");
6444 return false;
6446 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6448 /* Generating permutation constant to shift all elements.
6449 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6450 for (i = 0; i < nelt; i++)
6451 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6452 indices.new_vector (sel, 2, nelt);
6453 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6455 if (dump_enabled_p ())
6456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6457 "shift permutation is not supported by target\n");
6458 return false;
6460 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6462 for (k = 0; k < 3; k++)
6464 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6465 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6466 dr_chain[k], dr_chain[k],
6467 perm3_mask);
6468 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6469 vect[k] = data_ref;
6472 for (k = 0; k < 3; k++)
6474 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6475 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6476 vect[k % 3], vect[(k + 1) % 3],
6477 shift1_mask);
6478 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6479 vect_shift[k] = data_ref;
6482 for (k = 0; k < 3; k++)
6484 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6485 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6486 vect_shift[(4 - k) % 3],
6487 vect_shift[(3 - k) % 3],
6488 shift2_mask);
6489 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6490 vect[k] = data_ref;
6493 (*result_chain)[3 - (nelt % 3)] = vect[2];
6495 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6496 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6497 vect[0], shift3_mask);
6498 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6499 (*result_chain)[nelt % 3] = data_ref;
6501 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6502 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6503 vect[1], shift4_mask);
6504 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6505 (*result_chain)[0] = data_ref;
6506 return true;
6508 return false;
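/* As a worked instance of the pow2p_hwi path above, for nelt == 8:
   perm2_mask1 is { 0, 2, 4, 6, 1, 3, 5, 7 }, perm2_mask2 is
   { 1, 3, 5, 7, 0, 2, 4, 6 }, shift1_mask is { 4, 5, 6, 7, 8, 9, 10, 11 }
   and select_mask is { 0, 1, 2, 3, 12, 13, 14, 15 }; shift1_mask and
   select_mask match the "vector length 8" examples given in the comments
   above.  */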
6511 /* Function vect_transform_grouped_load.
6513 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6514 to perform their permutation and ascribe the resulting vectorized statements to
6515 the scalar statements.
6518 void
6519 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6520 vec<tree> dr_chain,
6521 int size, gimple_stmt_iterator *gsi)
6523 machine_mode mode;
6524 vec<tree> result_chain = vNULL;
6526 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6527 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6528 vectors, that are ready for vector computation. */
6529 result_chain.create (size);
6531 /* If the reassociation width for the vector type is 2 or greater, the target
6532 machine can execute 2 or more vector instructions in parallel. Otherwise
6533 try to get the chain for the load group using vect_shift_permute_load_chain. */
6534 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6535 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6536 || pow2p_hwi (size)
6537 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6538 gsi, &result_chain))
6539 vect_permute_load_chain (vinfo, dr_chain,
6540 size, stmt_info, gsi, &result_chain);
6541 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6542 result_chain.release ();
6545 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6546 generated as part of the vectorization of STMT_INFO. Assign the statement
6547 for each vector to the associated scalar statement. */
6549 void
6550 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6551 vec<tree> result_chain)
6553 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6554 unsigned int i, gap_count;
6555 tree tmp_data_ref;
6557 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6558 Since we scan the chain starting from its first node, their order
6559 corresponds to the order of data-refs in RESULT_CHAIN. */
6560 stmt_vec_info next_stmt_info = first_stmt_info;
6561 gap_count = 1;
6562 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6564 if (!next_stmt_info)
6565 break;
6567 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6568 code elimination pass later. No need to check for the first stmt in
6569 the group, since it always exists.
6570 DR_GROUP_GAP is the number of steps in elements from the previous
6571 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6572 correspond to the gaps. */
6573 if (next_stmt_info != first_stmt_info
6574 && gap_count < DR_GROUP_GAP (next_stmt_info))
6576 gap_count++;
6577 continue;
6580 /* ??? The following needs cleanup after the removal of
6581 DR_GROUP_SAME_DR_STMT. */
6582 if (next_stmt_info)
6584 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6585 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6586 copies, and we put the new vector statement last. */
6587 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6589 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6590 gap_count = 1;
6595 /* Function vect_can_force_dr_alignment_p.
6597 Returns whether the alignment of a DECL can be forced to be aligned
6598 on an ALIGNMENT-bit boundary. */
6600 bool
6601 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6603 if (!VAR_P (decl))
6604 return false;
6606 if (decl_in_symtab_p (decl)
6607 && !symtab_node::get (decl)->can_increase_alignment_p ())
6608 return false;
6610 if (TREE_STATIC (decl))
6611 return (known_le (alignment,
6612 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6613 else
6614 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6618 /* Return whether the data reference DR_INFO is supported with respect to its
6619 alignment.
6620 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6621 if it is aligned, i.e., check if it is possible to vectorize it with a
6622 different alignment. */
6624 enum dr_alignment_support
6625 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6626 tree vectype, bool check_aligned_accesses)
6628 data_reference *dr = dr_info->dr;
6629 stmt_vec_info stmt_info = dr_info->stmt;
6630 machine_mode mode = TYPE_MODE (vectype);
6631 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6632 class loop *vect_loop = NULL;
6633 bool nested_in_vect_loop = false;
6635 if (aligned_access_p (dr_info, vectype) && !check_aligned_accesses)
6636 return dr_aligned;
6638 /* For now assume all conditional loads/stores support unaligned
6639 access without any special code. */
6640 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6641 if (gimple_call_internal_p (stmt)
6642 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6643 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6644 return dr_unaligned_supported;
6646 if (loop_vinfo)
6648 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6649 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6652 /* Possibly unaligned access. */
6654 /* We can choose between using the implicit realignment scheme (generating
6655 a misaligned_move stmt) and the explicit realignment scheme (generating
6656 aligned loads with a REALIGN_LOAD). There are two variants to the
6657 explicit realignment scheme: optimized, and unoptimized.
6658 We can optimize the realignment only if the step between consecutive
6659 vector loads is equal to the vector size. Since the vector memory
6660 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6661 is guaranteed that the misalignment amount remains the same throughout the
6662 execution of the vectorized loop. Therefore, we can create the
6663 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6664 at the loop preheader.
6666 However, in the case of outer-loop vectorization, when vectorizing a
6667 memory access in the inner-loop nested within the LOOP that is now being
6668 vectorized, while it is guaranteed that the misalignment of the
6669 vectorized memory access will remain the same in different outer-loop
6670 iterations, it is *not* guaranteed that it will remain the same throughout
6671 the execution of the inner-loop. This is because the inner-loop advances
6672 with the original scalar step (and not in steps of VS). If the inner-loop
6673 step happens to be a multiple of VS, then the misalignment remains fixed
6674 and we can use the optimized realignment scheme. For example:
6676 for (i=0; i<N; i++)
6677 for (j=0; j<M; j++)
6678 s += a[i+j];
6680 When vectorizing the i-loop in the above example, the step between
6681 consecutive vector loads is 1, and so the misalignment does not remain
6682 fixed across the execution of the inner-loop, and the realignment cannot
6683 be optimized (as illustrated in the following pseudo vectorized loop):
6685 for (i=0; i<N; i+=4)
6686 for (j=0; j<M; j++){
6687 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6688 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6689 // (assuming that we start from an aligned address).
6692 We therefore have to use the unoptimized realignment scheme:
6694 for (i=0; i<N; i+=4)
6695 for (j=k; j<M; j+=4)
6696 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6697 // that the misalignment of the initial address is
6698 // 0).
6700 The loop can then be vectorized as follows:
6702 for (k=0; k<4; k++){
6703 rt = get_realignment_token (&vp[k]);
6704 for (i=0; i<N; i+=4){
6705 v1 = vp[i+k];
6706 for (j=k; j<M; j+=4){
6707 v2 = vp[i+j+VS-1];
6708 va = REALIGN_LOAD <v1,v2,rt>;
6709 vs += va;
6710 v1 = v2;
6713 } */
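/* Summarizing the decision below: dr_aligned was already returned above for
   known-aligned accesses; reads may use one of the explicit realignment
   schemes when the target provides vec_realign_load (and, if it defines
   builtin_mask_for_load, that builtin is available); otherwise both reads
   and writes return dr_unaligned_supported if the target's
   support_vector_misalignment hook accepts the access, and
   dr_unaligned_unsupported as a last resort.  */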
6715 if (DR_IS_READ (dr))
6717 bool is_packed = false;
6718 tree type = (TREE_TYPE (DR_REF (dr)));
6720 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6721 && (!targetm.vectorize.builtin_mask_for_load
6722 || targetm.vectorize.builtin_mask_for_load ()))
6724 /* If we are doing SLP then the accesses need not have the
6725 same alignment, instead it depends on the SLP group size. */
6726 if (loop_vinfo
6727 && STMT_SLP_TYPE (stmt_info)
6728 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6729 * (DR_GROUP_SIZE
6730 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6731 TYPE_VECTOR_SUBPARTS (vectype)))
6733 else if (!loop_vinfo
6734 || (nested_in_vect_loop
6735 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6736 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6737 return dr_explicit_realign;
6738 else
6739 return dr_explicit_realign_optimized;
6741 if (!known_alignment_for_access_p (dr_info, vectype))
6742 is_packed = not_size_aligned (DR_REF (dr));
6744 if (targetm.vectorize.support_vector_misalignment
6745 (mode, type, dr_misalignment (dr_info, vectype), is_packed))
6746 /* Can't software pipeline the loads, but can at least do them. */
6747 return dr_unaligned_supported;
6749 else
6751 bool is_packed = false;
6752 tree type = (TREE_TYPE (DR_REF (dr)));
6754 if (!known_alignment_for_access_p (dr_info, vectype))
6755 is_packed = not_size_aligned (DR_REF (dr));
6757 if (targetm.vectorize.support_vector_misalignment
6758 (mode, type, dr_misalignment (dr_info, vectype), is_packed))
6759 return dr_unaligned_supported;
6762 /* Unsupported. */
6763 return dr_unaligned_unsupported;