gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
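/* A hedged illustration (the array mode name is a target assumption): for
   VECTYPE of mode V4SI and COUNT == 2 the target may provide a 32-byte
   array mode; if the optab handler for that (array mode, V4SI) pair is
   defined the dump reports "can use <NAME><array mode><V4SI>", otherwise
   "cannot use ...".  */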
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
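/* A worked example, for illustration only: for the cast
   'int_x = (int) short_y' the lhs size is 4 and the rhs size is 2, so the
   function returns the short type; a loop containing only such promotions
   is then vectorized with the VF implied by the 2-byte element.  */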
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn))
155 /* For loads the LHS type does the trick. */
156 i = ~0U;
157 else if (internal_store_fn_p (ifn))
 159 /* For stores use the type of the stored value. */
160 i = internal_fn_stored_value_index (ifn);
161 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
162 i = ~0U;
164 else if (internal_fn_mask_index (ifn) == 0)
165 i = 1;
167 if (i < gimple_call_num_args (call))
169 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
170 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
172 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
173 if (rhs < lhs)
174 scalar_type = rhs_type;
179 return scalar_type;
183 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
184 tested at run-time. Return TRUE if DDR was successfully inserted.
185 Return false if versioning is not supported. */
187 static opt_result
188 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
190 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
192 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
193 return opt_result::failure_at (vect_location,
194 "will not create alias checks, as"
195 " --param vect-max-version-for-alias-checks"
196 " == 0\n");
198 opt_result res
199 = runtime_alias_check_p (ddr, loop,
200 optimize_loop_nest_for_speed_p (loop));
201 if (!res)
202 return res;
204 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
205 return opt_result::success ();
208 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
210 static void
211 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
213 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 214 for (unsigned int i = 0; i < checks.length (); ++i)
215 if (checks[i] == value)
216 return;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location,
220 "need run-time check that %T is nonzero\n",
221 value);
222 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 /* Return true if we know that the order of vectorized DR_INFO_A and
226 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
227 DR_INFO_B. At least one of the accesses is a write. */
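/* For illustration (an assumed scenario): given
   '.. = a[i]; *p = x; .. = a[i+1];' the two loads form a group that is
   emitted at the position of the first scalar load, so the vectorized
   load of a[i+1] moves before the store to *p; for that (load, store)
   pair the function therefore returns false.  */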
229 static bool
230 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
232 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
233 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
235 /* Single statements are always kept in their original order. */
236 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
237 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
238 return true;
240 /* STMT_A and STMT_B belong to overlapping groups. All loads are
241 emitted at the position of the first scalar load.
242 Stores in a group are emitted at the position of the last scalar store.
243 Compute that position and check whether the resulting order matches
244 the current one. */
245 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (il_a)
248 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 il_a = get_later_stmt (il_a, s);
252 else /* DR_IS_READ */
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
258 else
259 il_a = stmtinfo_a;
260 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
261 if (il_b)
263 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
264 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
265 s = DR_GROUP_NEXT_ELEMENT (s))
266 il_b = get_later_stmt (il_b, s);
267 else /* DR_IS_READ */
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 if (get_later_stmt (il_b, s) == il_b)
271 il_b = s;
273 else
274 il_b = stmtinfo_b;
275 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
276 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 /* A subroutine of vect_analyze_data_ref_dependence. Handle
280 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
281 distances. These distances are conservatively correct but they don't
282 reflect a guaranteed dependence.
284 Return true if this function does all the work necessary to avoid
285 an alias or false if the caller should use the dependence distances
286 to limit the vectorization factor in the usual way. LOOP_DEPTH is
287 the depth of the loop described by LOOP_VINFO and the other arguments
288 are as for vect_analyze_data_ref_dependence. */
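/* A hedged example of the situation handled here: a write to p[i] and a
   read from q[i + 2] where p and q might refer to the same array.  The
   distance vector {2} is conservatively correct but only binding if the
   bases coincide, so instead of limiting the VF to 2 we prefer to emit a
   runtime alias check (unless safelen already covers the distance or a
   gather/scatter access is involved).  */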
290 static bool
291 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 int loop_depth, unsigned int *max_vf)
295 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
296 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
298 int dist = dist_v[loop_depth];
299 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
301 /* If the user asserted safelen >= DIST consecutive iterations
302 can be executed concurrently, assume independence.
304 ??? An alternative would be to add the alias check even
305 in this case, and vectorize the fallback loop with the
306 maximum VF set to safelen. However, if the user has
307 explicitly given a length, it's less likely that that
308 would be a win. */
309 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
311 if ((unsigned int) loop->safelen < *max_vf)
312 *max_vf = loop->safelen;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
314 continue;
317 /* For dependence distances of 2 or more, we have the option
318 of limiting VF or checking for an alias at runtime.
319 Prefer to check at runtime if we can, to avoid limiting
320 the VF unnecessarily when the bases are in fact independent.
322 Note that the alias checks will be removed if the VF ends up
323 being small enough. */
324 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
325 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
326 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
327 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
328 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 return true;
335 /* Function vect_analyze_data_ref_dependence.
337 FIXME: I needed to change the sense of the returned flag.
 339 Return failure if there (might) exist an unhandled dependence between
 340 memory-reference DRA and memory-reference DRB. When versioning for
 341 alias can check the dependence at run-time, record the check and return
 342 success. Adjust *MAX_VF according to the data dependence. */
344 static opt_result
345 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
346 loop_vec_info loop_vinfo,
347 unsigned int *max_vf)
349 unsigned int i;
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 struct data_reference *dra = DDR_A (ddr);
352 struct data_reference *drb = DDR_B (ddr);
353 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
354 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
355 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
356 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
357 lambda_vector dist_v;
358 unsigned int loop_depth;
360 /* If user asserted safelen consecutive iterations can be
361 executed concurrently, assume independence. */
362 auto apply_safelen = [&]()
364 if (loop->safelen >= 2)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 return true;
371 return false;
374 /* In loop analysis all data references should be vectorizable. */
375 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
376 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
377 gcc_unreachable ();
379 /* Independent data accesses. */
380 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
381 return opt_result::success ();
383 if (dra == drb
384 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
385 return opt_result::success ();
387 /* We do not have to consider dependences between accesses that belong
388 to the same group, unless the stride could be smaller than the
389 group size. */
390 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
391 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
393 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
394 return opt_result::success ();
396 /* Even if we have an anti-dependence then, as the vectorized loop covers at
397 least two scalar iterations, there is always also a true dependence.
398 As the vectorizer does not re-order loads and stores we can ignore
399 the anti-dependence if TBAA can disambiguate both DRs similar to the
400 case with known negative distance anti-dependences (positive
401 distance anti-dependences would violate TBAA constraints). */
402 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
403 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
404 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
405 get_alias_set (DR_REF (drb))))
406 return opt_result::success ();
408 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
409 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 if (apply_safelen ())
412 return opt_result::success ();
414 return opt_result::failure_at
415 (stmtinfo_a->stmt,
416 "possible alias involving gather/scatter between %T and %T\n",
417 DR_REF (dra), DR_REF (drb));
420 /* Unknown data dependence. */
421 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
423 if (apply_safelen ())
424 return opt_result::success ();
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
428 "versioning for alias required: "
429 "can't determine dependence between %T and %T\n",
430 DR_REF (dra), DR_REF (drb));
432 /* Add to list of ddrs that need to be tested at run-time. */
433 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 /* Known data dependence. */
437 if (DDR_NUM_DIST_VECTS (ddr) == 0)
439 if (apply_safelen ())
440 return opt_result::success ();
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
444 "versioning for alias required: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
447 /* Add to list of ddrs that need to be tested at run-time. */
448 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
453 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
454 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
455 loop_depth, max_vf))
456 return opt_result::success ();
458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
460 int dist = dist_v[loop_depth];
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "dependence distance = %d.\n", dist);
466 if (dist == 0)
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance == 0 between %T and %T\n",
471 DR_REF (dra), DR_REF (drb));
473 /* When we perform grouped accesses and perform implicit CSE
474 by detecting equal accesses and doing disambiguation with
475 runtime alias tests like for
476 .. = a[i];
477 .. = a[i+1];
478 a[i] = ..;
479 a[i+1] = ..;
480 *p = ..;
481 .. = a[i];
482 .. = a[i+1];
483 where we will end up loading { a[i], a[i+1] } once, make
484 sure that inserting group loads before the first load and
485 stores after the last store will do the right thing.
486 Similar for groups like
487 a[i] = ...;
488 ... = a[i];
489 a[i+1] = ...;
490 where loads from the group interleave with the store. */
491 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
492 return opt_result::failure_at (stmtinfo_a->stmt,
493 "READ_WRITE dependence"
494 " in interleaving.\n");
496 if (loop->safelen < 2)
498 tree indicator = dr_zero_step_indicator (dra);
499 if (!indicator || integer_zerop (indicator))
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "access also has a zero step\n");
502 else if (TREE_CODE (indicator) != INTEGER_CST)
503 vect_check_nonzero_value (loop_vinfo, indicator);
505 continue;
508 if (dist > 0 && DDR_REVERSED_P (ddr))
510 /* If DDR_REVERSED_P the order of the data-refs in DDR was
511 reversed (to make distance vector positive), and the actual
512 distance is negative. */
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance negative.\n");
516 /* When doing outer loop vectorization, we need to check if there is
517 a backward dependence at the inner loop level if the dependence
518 at the outer loop is reversed. See PR81740. */
519 if (nested_in_vect_loop_p (loop, stmtinfo_a)
520 || nested_in_vect_loop_p (loop, stmtinfo_b))
522 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
523 DDR_LOOP_NEST (ddr));
524 if (dist_v[inner_depth] < 0)
525 return opt_result::failure_at (stmtinfo_a->stmt,
526 "not vectorized, dependence "
527 "between data-refs %T and %T\n",
528 DR_REF (dra), DR_REF (drb));
530 /* Record a negative dependence distance to later limit the
531 amount of stmt copying / unrolling we can perform.
532 Only need to handle read-after-write dependence. */
533 if (DR_IS_READ (drb)
534 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
535 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
536 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
537 continue;
540 unsigned int abs_dist = abs (dist);
541 if (abs_dist >= 2 && abs_dist < *max_vf)
543 /* The dependence distance requires reduction of the maximal
544 vectorization factor. */
545 *max_vf = abs_dist;
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location,
548 "adjusting maximal vectorization factor to %i\n",
549 *max_vf);
552 if (abs_dist >= *max_vf)
554 /* Dependence distance does not create dependence, as far as
555 vectorization is concerned, in this case. */
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "dependence distance >= VF.\n");
559 continue;
562 return opt_result::failure_at (stmtinfo_a->stmt,
563 "not vectorized, possible dependence "
564 "between data-refs %T and %T\n",
565 DR_REF (dra), DR_REF (drb));
568 return opt_result::success ();
571 /* Function vect_analyze_data_ref_dependences.
573 Examine all the data references in the loop, and make sure there do not
574 exist any data dependences between them. Set *MAX_VF according to
575 the maximum vectorization factor the data dependences allow. */
577 opt_result
578 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
579 unsigned int *max_vf)
581 unsigned int i;
582 struct data_dependence_relation *ddr;
584 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
586 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
588 LOOP_VINFO_DDRS (loop_vinfo)
589 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
590 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
591 /* We do not need read-read dependences. */
592 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
593 &LOOP_VINFO_DDRS (loop_vinfo),
594 LOOP_VINFO_LOOP_NEST (loop_vinfo),
595 false);
596 gcc_assert (res);
599 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
601 /* For epilogues we either have no aliases or alias versioning
602 was applied to original loop. Therefore we may just get max_vf
603 using VF of original loop. */
604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
605 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
606 else
607 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
609 opt_result res
610 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
611 if (!res)
612 return res;
615 return opt_result::success ();
619 /* Function vect_slp_analyze_data_ref_dependence.
 621 Return TRUE if there (might) exist a dependence between a memory-reference
 622 DRA and a memory-reference DRB for VINFO that prevents the accesses from
 623 being reordered; return FALSE if the references are independent or the
 624 dependence is harmless (e.g. read-read or within one interleaving chain). */
626 static bool
627 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
628 struct data_dependence_relation *ddr)
630 struct data_reference *dra = DDR_A (ddr);
631 struct data_reference *drb = DDR_B (ddr);
632 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
633 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
635 /* We need to check dependences of statements marked as unvectorizable
636 as well, they still can prohibit vectorization. */
638 /* Independent data accesses. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
640 return false;
642 if (dra == drb)
643 return false;
645 /* Read-read is OK. */
646 if (DR_IS_READ (dra) && DR_IS_READ (drb))
647 return false;
649 /* If dra and drb are part of the same interleaving chain consider
650 them independent. */
651 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
652 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
653 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
654 return false;
656 /* Unknown data dependence. */
657 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
661 "can't determine dependence between %T and %T\n",
662 DR_REF (dra), DR_REF (drb));
664 else if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "determined dependence between %T and %T\n",
667 DR_REF (dra), DR_REF (drb));
669 return true;
673 /* Analyze dependences involved in the transform of SLP NODE. STORES
674 contain the vector of scalar stores of this instance if we are
675 disambiguating the loads. */
677 static bool
678 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
679 vec<stmt_vec_info> stores,
680 stmt_vec_info last_store_info)
682 /* This walks over all stmts involved in the SLP load/store done
683 in NODE verifying we can sink them up to the last stmt in the
684 group. */
685 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
687 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
688 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
690 stmt_vec_info access_info
691 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
692 if (access_info == last_access_info)
693 continue;
694 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
695 ao_ref ref;
696 bool ref_initialized_p = false;
697 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
698 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
700 gimple *stmt = gsi_stmt (gsi);
701 if (! gimple_vuse (stmt))
702 continue;
704 /* If we couldn't record a (single) data reference for this
705 stmt we have to resort to the alias oracle. */
706 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
707 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
708 if (!dr_b)
710 /* We are moving a store - this means
711 we cannot use TBAA for disambiguation. */
712 if (!ref_initialized_p)
713 ao_ref_init (&ref, DR_REF (dr_a));
714 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
715 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
716 return false;
717 continue;
720 bool dependent = false;
721 /* If we run into a store of this same instance (we've just
722 marked those) then delay dependence checking until we run
723 into the last store because this is where it will have
724 been sunk to (and we verify if we can do that as well). */
725 if (gimple_visited_p (stmt))
727 if (stmt_info != last_store_info)
728 continue;
730 for (stmt_vec_info &store_info : stores)
732 data_reference *store_dr
733 = STMT_VINFO_DATA_REF (store_info);
734 ddr_p ddr = initialize_data_dependence_relation
735 (dr_a, store_dr, vNULL);
736 dependent
737 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
738 free_dependence_relation (ddr);
739 if (dependent)
740 break;
743 else
745 ddr_p ddr = initialize_data_dependence_relation (dr_a,
746 dr_b, vNULL);
747 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
748 free_dependence_relation (ddr);
750 if (dependent)
751 return false;
755 else /* DR_IS_READ */
757 stmt_vec_info first_access_info
758 = vect_find_first_scalar_stmt_in_slp (node);
759 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
761 stmt_vec_info access_info
762 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
763 if (access_info == first_access_info)
764 continue;
765 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
766 ao_ref ref;
767 bool ref_initialized_p = false;
768 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
769 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
771 gimple *stmt = gsi_stmt (gsi);
772 if (! gimple_vdef (stmt))
773 continue;
775 /* If we couldn't record a (single) data reference for this
776 stmt we have to resort to the alias oracle. */
777 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
778 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
780 /* We are hoisting a load - this means we can use
781 TBAA for disambiguation. */
782 if (!ref_initialized_p)
783 ao_ref_init (&ref, DR_REF (dr_a));
784 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
786 if (!dr_b)
787 return false;
788 /* Resort to dependence checking below. */
790 else
791 /* No dependence. */
792 continue;
794 bool dependent = false;
795 /* If we run into a store of this same instance (we've just
796 marked those) then delay dependence checking until we run
797 into the last store because this is where it will have
798 been sunk to (and we verify if we can do that as well). */
799 if (gimple_visited_p (stmt))
801 if (stmt_info != last_store_info)
802 continue;
804 for (stmt_vec_info &store_info : stores)
806 data_reference *store_dr
807 = STMT_VINFO_DATA_REF (store_info);
808 ddr_p ddr = initialize_data_dependence_relation
809 (dr_a, store_dr, vNULL);
810 dependent
811 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
812 free_dependence_relation (ddr);
813 if (dependent)
814 break;
817 else
819 ddr_p ddr = initialize_data_dependence_relation (dr_a,
820 dr_b, vNULL);
821 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
822 free_dependence_relation (ddr);
824 if (dependent)
825 return false;
829 return true;
 833 /* Function vect_slp_analyze_instance_dependence.
 835 Examine the data references in SLP instance INSTANCE of VINFO and verify
 836 that no data dependence prevents sinking its stores or hoisting its loads
 837 to the vectorized statement insert locations. Return TRUE if so. */
839 bool
840 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
842 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
844 /* The stores of this instance are at the root of the SLP tree. */
845 slp_tree store = NULL;
846 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
847 store = SLP_INSTANCE_TREE (instance);
849 /* Verify we can sink stores to the vectorized stmt insert location. */
850 stmt_vec_info last_store_info = NULL;
851 if (store)
853 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
854 return false;
856 /* Mark stores in this instance and remember the last one. */
857 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
858 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
859 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
862 bool res = true;
864 /* Verify we can sink loads to the vectorized stmt insert location,
865 special-casing stores of this instance. */
866 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
867 if (! vect_slp_analyze_node_dependences (vinfo, load,
868 store
869 ? SLP_TREE_SCALAR_STMTS (store)
870 : vNULL, last_store_info))
872 res = false;
873 break;
876 /* Unset the visited flag. */
877 if (store)
878 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
879 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
881 return res;
884 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
885 applied. */
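/* Worked example (values assumed for illustration): with a 16-byte target
   alignment, a group whose first element has misalignment 4 and DR_INIT 0,
   and this DR having DR_INIT 8 and OFFSET 0, the result is
   (4 + 8 + 0) % 16 == 12.  */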
888 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
890 HOST_WIDE_INT diff = 0;
891 /* Alignment is only analyzed for the first element of a DR group,
892 use that but adjust misalignment by the offset of the access. */
893 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
895 dr_vec_info *first_dr
896 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
897 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
898 INTEGER_CSTs and the first element in the group has the lowest
899 address. */
900 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
901 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
902 gcc_assert (diff >= 0);
903 dr_info = first_dr;
906 int misalign = dr_info->misalignment;
907 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
908 if (misalign == DR_MISALIGNMENT_UNKNOWN)
909 return misalign;
911 /* If the access is only aligned for a vector type with smaller alignment
912 requirement the access has unknown misalignment. */
913 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
914 targetm.vectorize.preferred_vector_alignment (vectype)))
915 return DR_MISALIGNMENT_UNKNOWN;
917 /* Apply the offset from the DR group start and the externally supplied
918 offset which can for example result from a negative stride access. */
919 poly_int64 misalignment = misalign + diff + offset;
921 /* vect_compute_data_ref_alignment will have ensured that target_alignment
922 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
923 unsigned HOST_WIDE_INT target_alignment_c
924 = dr_info->target_alignment.to_constant ();
925 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
926 return DR_MISALIGNMENT_UNKNOWN;
927 return misalign;
930 /* Record the base alignment guarantee given by DRB, which occurs
931 in STMT_INFO. */
933 static void
934 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
935 innermost_loop_behavior *drb)
937 bool existed;
938 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
939 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
940 if (!existed || entry.second->base_alignment < drb->base_alignment)
942 entry = std::make_pair (stmt_info, drb);
943 if (dump_enabled_p ())
944 dump_printf_loc (MSG_NOTE, vect_location,
945 "recording new base alignment for %T\n"
946 " alignment: %d\n"
947 " misalignment: %d\n"
948 " based on: %G",
949 drb->base_address,
950 drb->base_alignment,
951 drb->base_misalignment,
952 stmt_info->stmt);
956 /* If the region we're going to vectorize is reached, all unconditional
957 data references occur at least once. We can therefore pool the base
958 alignment guarantees from each unconditional reference. Do this by
959 going through all the data references in VINFO and checking whether
960 the containing statement makes the reference unconditionally. If so,
961 record the alignment of the base address in VINFO so that it can be
962 used for all other references with the same base. */
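/* For example (a hedged illustration): if an unconditional store through
   base B guarantees a 32-byte base alignment, that guarantee is recorded
   once here and later reused when computing the misalignment of any other
   reference with the same base address, even a conditional one.  */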
964 void
965 vect_record_base_alignments (vec_info *vinfo)
967 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
968 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
969 for (data_reference *dr : vinfo->shared->datarefs)
971 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
972 stmt_vec_info stmt_info = dr_info->stmt;
973 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
974 && STMT_VINFO_VECTORIZABLE (stmt_info)
975 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
977 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
979 /* If DR is nested in the loop that is being vectorized, we can also
980 record the alignment of the base wrt the outer loop. */
981 if (loop && nested_in_vect_loop_p (loop, stmt_info))
982 vect_record_base_alignment
983 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
988 /* Function vect_compute_data_ref_alignment
990 Compute the misalignment of the data reference DR_INFO when vectorizing
991 with VECTYPE.
993 Output:
994 1. initialized misalignment info for DR_INFO
996 FOR NOW: No analysis is actually performed. Misalignment is calculated
997 only for trivial cases. TODO. */
999 static void
1000 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1001 tree vectype)
1003 stmt_vec_info stmt_info = dr_info->stmt;
1004 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1005 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1006 class loop *loop = NULL;
1007 tree ref = DR_REF (dr_info->dr);
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_NOTE, vect_location,
1011 "vect_compute_data_ref_alignment:\n");
1013 if (loop_vinfo)
1014 loop = LOOP_VINFO_LOOP (loop_vinfo);
1016 /* Initialize misalignment to unknown. */
1017 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1019 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1020 return;
1022 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1023 bool step_preserves_misalignment_p;
1025 poly_uint64 vector_alignment
1026 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1027 BITS_PER_UNIT);
1028 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1030 /* If the main loop has peeled for alignment we have no way of knowing
1031 whether the data accesses in the epilogues are aligned. We can't at
1032 compile time answer the question whether we have entered the main loop or
1033 not. Fixes PR 92351. */
1034 if (loop_vinfo)
1036 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1037 if (orig_loop_vinfo
1038 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1039 return;
1042 unsigned HOST_WIDE_INT vect_align_c;
1043 if (!vector_alignment.is_constant (&vect_align_c))
1044 return;
1046 /* No step for BB vectorization. */
1047 if (!loop)
1049 gcc_assert (integer_zerop (drb->step));
1050 step_preserves_misalignment_p = true;
1053 /* In case the dataref is in an inner-loop of the loop that is being
1054 vectorized (LOOP), we use the base and misalignment information
1055 relative to the outer-loop (LOOP). This is ok only if the misalignment
1056 stays the same throughout the execution of the inner-loop, which is why
 1057 we have to check that the stride of the dataref in the inner-loop is a
 1058 multiple of the vector alignment. */
1059 else if (nested_in_vect_loop_p (loop, stmt_info))
1061 step_preserves_misalignment_p
1062 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1064 if (dump_enabled_p ())
1066 if (step_preserves_misalignment_p)
1067 dump_printf_loc (MSG_NOTE, vect_location,
1068 "inner step divides the vector alignment.\n");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "inner step doesn't divide the vector"
1072 " alignment.\n");
1076 /* Similarly we can only use base and misalignment information relative to
1077 an innermost loop if the misalignment stays the same throughout the
 1078 execution of the loop. As above, this is the case if the stride of
 1079 the dataref times the vectorization factor is a multiple of the alignment. */
1080 else
1082 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1083 step_preserves_misalignment_p
1084 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1086 if (!step_preserves_misalignment_p && dump_enabled_p ())
1087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1088 "step doesn't divide the vector alignment.\n");
1091 unsigned int base_alignment = drb->base_alignment;
1092 unsigned int base_misalignment = drb->base_misalignment;
1094 /* Calculate the maximum of the pooled base address alignment and the
1095 alignment that we can compute for DR itself. */
1096 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1097 = base_alignments->get (drb->base_address);
1098 if (entry
1099 && base_alignment < (*entry).second->base_alignment
1100 && (loop_vinfo
1101 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1102 gimple_bb (entry->first->stmt))
1103 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1104 || (entry->first->dr_aux.group <= dr_info->group)))))
1106 base_alignment = entry->second->base_alignment;
1107 base_misalignment = entry->second->base_misalignment;
1110 if (drb->offset_alignment < vect_align_c
1111 || !step_preserves_misalignment_p
1112 /* We need to know whether the step wrt the vectorized loop is
1113 negative when computing the starting misalignment below. */
1114 || TREE_CODE (drb->step) != INTEGER_CST)
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Unknown alignment for access: %T\n", ref);
1119 return;
1122 if (base_alignment < vect_align_c)
1124 unsigned int max_alignment;
1125 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1126 if (max_alignment < vect_align_c
1127 || !vect_can_force_dr_alignment_p (base,
1128 vect_align_c * BITS_PER_UNIT))
1130 if (dump_enabled_p ())
1131 dump_printf_loc (MSG_NOTE, vect_location,
1132 "can't force alignment of ref: %T\n", ref);
1133 return;
1136 /* Force the alignment of the decl.
1137 NOTE: This is the only change to the code we make during
1138 the analysis phase, before deciding to vectorize the loop. */
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_NOTE, vect_location,
1141 "force alignment of %T\n", ref);
1143 dr_info->base_decl = base;
1144 dr_info->base_misaligned = true;
1145 base_misalignment = 0;
1147 poly_int64 misalignment
1148 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1150 unsigned int const_misalignment;
1151 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "Non-constant misalignment for access: %T\n", ref);
1156 return;
1159 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163 "misalign = %d bytes of ref %T\n",
1164 const_misalignment, ref);
1166 return;
1169 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1170 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1171 is made aligned via peeling. */
1173 static bool
1174 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1175 dr_vec_info *dr_peel_info)
1177 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1178 DR_TARGET_ALIGNMENT (dr_info)))
1180 poly_offset_int diff
1181 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1182 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1183 if (known_eq (diff, 0)
1184 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1185 return true;
1187 return false;
1190 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1191 aligned via peeling. */
1193 static bool
1194 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1195 dr_vec_info *dr_peel_info)
1197 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1198 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1199 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1200 DR_OFFSET (dr_peel_info->dr), 0)
1201 || !operand_equal_p (DR_STEP (dr_info->dr),
1202 DR_STEP (dr_peel_info->dr), 0))
1203 return false;
1205 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 /* Compute the value for dr_info->misalign so that the access appears
1209 aligned. This is used by peeling to compensate for dr_misalignment
1210 applying the offset for negative step. */
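/* Worked example (assumed numbers): for a negative-step access with a V4SI
   vector type, 4-byte elements and a 16-byte target alignment the value is
   (4 - 1) * 4 == 12; dr_misalignment later applies the matching -12 offset
   for the negative step, so the access reads back as aligned (0).  */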
1213 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1215 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1216 return 0;
1218 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1219 poly_int64 misalignment
1220 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1221 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1223 unsigned HOST_WIDE_INT target_alignment_c;
1224 int misalign;
1225 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1226 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1227 return DR_MISALIGNMENT_UNKNOWN;
1228 return misalign;
1231 /* Function vect_update_misalignment_for_peel.
1232 Sets DR_INFO's misalignment
1233 - to 0 if it has the same alignment as DR_PEEL_INFO,
 1234 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1235 - to -1 (unknown) otherwise.
1237 DR_INFO - the data reference whose misalignment is to be adjusted.
1238 DR_PEEL_INFO - the data reference whose misalignment is being made
1239 zero in the vector loop by the peel.
1240 NPEEL - the number of iterations in the peel loop if the misalignment
1241 of DR_PEEL_INFO is known at compile time. */
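/* Worked example for the known-alignment case below (numbers assumed):
   with DR_STEP 4, a current misalignment of 4 and a 16-byte target
   alignment, peeling NPEEL == 3 iterations gives (4 + 3 * 4) & 15 == 0,
   i.e. the access becomes aligned.  */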
1243 static void
1244 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1245 dr_vec_info *dr_peel_info, int npeel)
 1247 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1248 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1250 SET_DR_MISALIGNMENT (dr_info,
1251 vect_dr_misalign_for_aligned_access (dr_peel_info));
1252 return;
1255 unsigned HOST_WIDE_INT alignment;
1256 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1257 && known_alignment_for_access_p (dr_info,
1258 STMT_VINFO_VECTYPE (dr_info->stmt))
1259 && known_alignment_for_access_p (dr_peel_info,
1260 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1262 int misal = dr_info->misalignment;
1263 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1264 misal &= alignment - 1;
1265 set_dr_misalignment (dr_info, misal);
1266 return;
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1271 "to unknown (-1).\n");
1272 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1275 /* Return true if alignment is relevant for DR_INFO. */
1277 static bool
1278 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1280 stmt_vec_info stmt_info = dr_info->stmt;
1282 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1283 return false;
1285 /* For interleaving, only the alignment of the first access matters. */
1286 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1287 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1288 return false;
1290 /* Scatter-gather and invariant accesses continue to address individual
1291 scalars, so vector-level alignment is irrelevant. */
1292 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1293 || integer_zerop (DR_STEP (dr_info->dr)))
1294 return false;
1296 /* Strided accesses perform only component accesses, alignment is
1297 irrelevant for them. */
1298 if (STMT_VINFO_STRIDED_P (stmt_info)
1299 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1300 return false;
1302 return true;
 1305 /* Given a memory reference EXP return whether its alignment is less
1306 than its size. */
1308 static bool
1309 not_size_aligned (tree exp)
1311 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1312 return true;
1314 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1315 > get_object_alignment (exp));
1318 /* Function vector_alignment_reachable_p
1320 Return true if vector alignment for DR_INFO is reachable by peeling
1321 a few loop iterations. Return false otherwise. */
1323 static bool
1324 vector_alignment_reachable_p (dr_vec_info *dr_info)
1326 stmt_vec_info stmt_info = dr_info->stmt;
1327 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1329 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1331 /* For interleaved accesses we peel only if the number of iterations in
 1332 the prolog loop ({VF - misalignment}) is a multiple of the number of
 1333 interleaved accesses. */
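/* For example (assumed numbers): with 8 elements per vector, a
   misalignment of 2 elements and a group size of 3, 8 - 2 == 6 is a
   multiple of 3, so the check below passes.  */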
1334 int elem_size, mis_in_elements;
1336 /* FORNOW: handle only known alignment. */
1337 if (!known_alignment_for_access_p (dr_info, vectype))
1338 return false;
1340 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1341 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1342 elem_size = vector_element_size (vector_size, nelements);
1343 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1345 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1346 return false;
 1349 /* If misalignment is known at compile time then allow peeling
1350 only if natural alignment is reachable through peeling. */
1351 if (known_alignment_for_access_p (dr_info, vectype)
1352 && !aligned_access_p (dr_info, vectype))
1354 HOST_WIDE_INT elmsize =
1355 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1356 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_NOTE, vect_location,
1359 "data size = %wd. misalignment = %d.\n", elmsize,
1360 dr_misalignment (dr_info, vectype));
1362 if (dr_misalignment (dr_info, vectype) % elmsize)
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 "data size does not divide the misalignment.\n");
1367 return false;
1371 if (!known_alignment_for_access_p (dr_info, vectype))
1373 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1374 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1375 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "Unknown misalignment, %snaturally aligned\n",
1378 is_packed ? "not " : "");
1379 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382 return true;
1386 /* Calculate the cost of the memory access represented by DR_INFO. */
1388 static void
1389 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1390 dr_alignment_support alignment_support_scheme,
1391 int misalignment,
1392 unsigned int *inside_cost,
1393 unsigned int *outside_cost,
1394 stmt_vector_for_cost *body_cost_vec,
1395 stmt_vector_for_cost *prologue_cost_vec)
1397 stmt_vec_info stmt_info = dr_info->stmt;
1398 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1399 int ncopies;
1401 if (PURE_SLP_STMT (stmt_info))
1402 ncopies = 1;
1403 else
1404 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1406 if (DR_IS_READ (dr_info->dr))
1407 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1408 misalignment, true, inside_cost,
1409 outside_cost, prologue_cost_vec, body_cost_vec, false);
1410 else
 1411 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1412 misalignment, inside_cost, body_cost_vec);
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "vect_get_data_access_cost: inside_cost = %d, "
1417 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1421 typedef struct _vect_peel_info
1423 dr_vec_info *dr_info;
1424 int npeel;
1425 unsigned int count;
1426 } *vect_peel_info;
1428 typedef struct _vect_peel_extended_info
1430 vec_info *vinfo;
1431 struct _vect_peel_info peel_info;
1432 unsigned int inside_cost;
1433 unsigned int outside_cost;
1434 } *vect_peel_extended_info;
1437 /* Peeling hashtable helpers. */
1439 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1441 static inline hashval_t hash (const _vect_peel_info *);
1442 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 inline hashval_t
1446 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1448 return (hashval_t) peel_info->npeel;
1451 inline bool
1452 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1454 return (a->npeel == b->npeel);
1458 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
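/* A small hedged example: if two data refs both become aligned after
   peeling 2 iterations, both insertions land in the slot keyed by
   npeel == 2 and its count reaches 2; with the cost model disabled a DR
   that would be unsupported when left misaligned additionally biases its
   slot by VECT_MAX_COST.  */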
1460 static void
1461 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1462 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1463 int npeel, bool supportable_if_not_aligned)
1465 struct _vect_peel_info elem, *slot;
1466 _vect_peel_info **new_slot;
1468 elem.npeel = npeel;
1469 slot = peeling_htab->find (&elem);
1470 if (slot)
1471 slot->count++;
1472 else
1474 slot = XNEW (struct _vect_peel_info);
1475 slot->npeel = npeel;
1476 slot->dr_info = dr_info;
1477 slot->count = 1;
1478 new_slot = peeling_htab->find_slot (slot, INSERT);
1479 *new_slot = slot;
1482 /* If this DR is not supported with unknown misalignment then bias
1483 this slot when the cost model is disabled. */
1484 if (!supportable_if_not_aligned
1485 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1486 slot->count += VECT_MAX_COST;
 1490 /* Traverse the peeling hash table to find the peeling option that aligns
 1491 the maximum number of data accesses. */
1494 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1495 _vect_peel_extended_info *max)
1497 vect_peel_info elem = *slot;
1499 if (elem->count > max->peel_info.count
1500 || (elem->count == max->peel_info.count
1501 && max->peel_info.npeel > elem->npeel))
1503 max->peel_info.npeel = elem->npeel;
1504 max->peel_info.count = elem->count;
1505 max->peel_info.dr_info = elem->dr_info;
1508 return 1;
 1511 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
 1512 data access costs for all data refs. If NPEEL is zero the current
 1513 misalignments are used; otherwise DR0_INFO (and any data ref that is
 1514 aligned whenever DR0_INFO is) is treated as aligned after the peel. */
1516 static void
1517 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1518 dr_vec_info *dr0_info,
1519 unsigned int *inside_cost,
1520 unsigned int *outside_cost,
1521 stmt_vector_for_cost *body_cost_vec,
1522 stmt_vector_for_cost *prologue_cost_vec,
1523 unsigned int npeel)
1525 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1527 bool dr0_alignment_known_p
1528 = (dr0_info
1529 && known_alignment_for_access_p (dr0_info,
1530 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1532 for (data_reference *dr : datarefs)
1534 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1535 if (!vect_relevant_for_alignment_p (dr_info))
1536 continue;
1538 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1539 dr_alignment_support alignment_support_scheme;
1540 int misalignment;
1541 unsigned HOST_WIDE_INT alignment;
1543 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1544 size_zero_node) < 0;
1545 poly_int64 off = 0;
1546 if (negative)
1547 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1548 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1550 if (npeel == 0)
1551 misalignment = dr_misalignment (dr_info, vectype, off);
1552 else if (dr_info == dr0_info
1553 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1554 misalignment = 0;
1555 else if (!dr0_alignment_known_p
1556 || !known_alignment_for_access_p (dr_info, vectype)
1557 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1558 misalignment = DR_MISALIGNMENT_UNKNOWN;
1559 else
1561 misalignment = dr_misalignment (dr_info, vectype, off);
1562 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1563 misalignment &= alignment - 1;
1565 alignment_support_scheme
1566 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1567 misalignment);
1569 vect_get_data_access_cost (loop_vinfo, dr_info,
1570 alignment_support_scheme, misalignment,
1571 inside_cost, outside_cost,
1572 body_cost_vec, prologue_cost_vec);
1576 /* Traverse peeling hash table and calculate cost for each peeling option.
1577 Find the one with the lowest cost. */
1580 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1581 _vect_peel_extended_info *min)
1583 vect_peel_info elem = *slot;
1584 int dummy;
1585 unsigned int inside_cost = 0, outside_cost = 0;
1586 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1587 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1588 epilogue_cost_vec;
1590 prologue_cost_vec.create (2);
1591 body_cost_vec.create (2);
1592 epilogue_cost_vec.create (2);
1594 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1595 &outside_cost, &body_cost_vec,
1596 &prologue_cost_vec, elem->npeel);
1598 body_cost_vec.release ();
1600 outside_cost += vect_get_known_peeling_cost
1601 (loop_vinfo, elem->npeel, &dummy,
1602 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1603 &prologue_cost_vec, &epilogue_cost_vec);
1605 /* Prologue and epilogue costs are added to the target model later.
1606 These costs depend only on the scalar iteration cost, the
1607 number of peeling iterations finally chosen, and the number of
1608 misaligned statements. So discard the information found here. */
1609 prologue_cost_vec.release ();
1610 epilogue_cost_vec.release ();
1612 if (inside_cost < min->inside_cost
1613 || (inside_cost == min->inside_cost
1614 && outside_cost < min->outside_cost))
1616 min->inside_cost = inside_cost;
1617 min->outside_cost = outside_cost;
1618 min->peel_info.dr_info = elem->dr_info;
1619 min->peel_info.npeel = elem->npeel;
1620 min->peel_info.count = elem->count;
1623 return 1;
1627 /* Choose best peeling option by traversing peeling hash table and either
1628 choosing an option with the lowest cost (if cost model is enabled) or the
1629 option that aligns as many accesses as possible. */
1631 static struct _vect_peel_extended_info
1632 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1633 loop_vec_info loop_vinfo)
1635 struct _vect_peel_extended_info res;
1637 res.peel_info.dr_info = NULL;
1638 res.vinfo = loop_vinfo;
1640 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1642 res.inside_cost = INT_MAX;
1643 res.outside_cost = INT_MAX;
1644 peeling_htab->traverse <_vect_peel_extended_info *,
1645 vect_peeling_hash_get_lowest_cost> (&res);
1647 else
1649 res.peel_info.count = 0;
1650 peeling_htab->traverse <_vect_peel_extended_info *,
1651 vect_peeling_hash_get_most_frequent> (&res);
1652 res.inside_cost = 0;
1653 res.outside_cost = 0;
1656 return res;
1659 /* Return true if the new peeling NPEEL is supported. */
1661 static bool
1662 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1663 unsigned npeel)
1665 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1666 enum dr_alignment_support supportable_dr_alignment;
1668 bool dr0_alignment_known_p
1669 = known_alignment_for_access_p (dr0_info,
1670 STMT_VINFO_VECTYPE (dr0_info->stmt));
1672 /* Ensure that all data refs can be vectorized after the peel. */
1673 for (data_reference *dr : datarefs)
1675 if (dr == dr0_info->dr)
1676 continue;
1678 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1679 if (!vect_relevant_for_alignment_p (dr_info)
1680 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1681 continue;
1683 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1684 int misalignment;
1685 unsigned HOST_WIDE_INT alignment;
1686 if (!dr0_alignment_known_p
1687 || !known_alignment_for_access_p (dr_info, vectype)
1688 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1689 misalignment = DR_MISALIGNMENT_UNKNOWN;
1690 else
1692 misalignment = dr_misalignment (dr_info, vectype);
1693 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1694 misalignment &= alignment - 1;
1696 supportable_dr_alignment
1697 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1698 misalignment);
1699 if (supportable_dr_alignment == dr_unaligned_unsupported)
1700 return false;
1703 return true;
1706 /* Compare two data-references DRA and DRB to group them into chunks
1707 with related alignment. */
1709 static int
1710 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1712 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1713 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1714 int cmp;
1716 /* Stabilize sort. */
1717 if (dra == drb)
1718 return 0;
1720 /* Ordering of DRs according to base. */
1721 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1722 DR_BASE_ADDRESS (drb));
1723 if (cmp != 0)
1724 return cmp;
1726 /* And according to DR_OFFSET. */
1727 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1728 if (cmp != 0)
1729 return cmp;
 1731 /* And according to DR_STEP. */
1732 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1733 if (cmp != 0)
1734 return cmp;
 1736 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
1737 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1738 if (cmp == 0)
1739 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1740 return cmp;
1743 /* Function vect_enhance_data_refs_alignment
1745 This pass will use loop versioning and loop peeling in order to enhance
1746 the alignment of data references in the loop.
1748 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1749 original loop is to be vectorized. Any other loops that are created by
1750 the transformations performed in this pass are not supposed to be
1751 vectorized. This restriction will be relaxed.
1753 This pass will require a cost model to guide it whether to apply peeling
1754 or versioning or a combination of the two. For example, the scheme that
1755 Intel uses when given a loop with several memory accesses, is as follows:
1756 choose one memory access ('p') whose alignment you want to force by doing
1757 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1758 other accesses are not necessarily aligned, or (2) use loop versioning to
1759 generate one loop in which all accesses are aligned, and another loop in
1760 which only 'p' is necessarily aligned.
1762 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1763 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1764 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1766 Devising a cost model is the most critical aspect of this work. It will
1767 guide us on which access to peel for, whether to use loop versioning, how
1768 many versions to create, etc. The cost model will probably consist of
1769 generic considerations as well as target specific considerations (on
1770 powerpc for example, misaligned stores are more painful than misaligned
1771 loads).
1773 Here are the general steps involved in alignment enhancements:
1775 -- original loop, before alignment analysis:
1776 for (i=0; i<N; i++){
1777 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1778 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1781 -- After vect_compute_data_refs_alignment:
1782 for (i=0; i<N; i++){
1783 x = q[i]; # DR_MISALIGNMENT(q) = 3
1784 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1787 -- Possibility 1: we do loop versioning:
1788 if (p is aligned) {
1789 for (i=0; i<N; i++){ # loop 1A
1790 x = q[i]; # DR_MISALIGNMENT(q) = 3
1791 p[i] = y; # DR_MISALIGNMENT(p) = 0
1794 else {
1795 for (i=0; i<N; i++){ # loop 1B
1796 x = q[i]; # DR_MISALIGNMENT(q) = 3
1797 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1801 -- Possibility 2: we do loop peeling:
1802 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1803 x = q[i];
1804 p[i] = y;
1806 for (i = 3; i < N; i++){ # loop 2A
1807 x = q[i]; # DR_MISALIGNMENT(q) = 0
1808 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1811 -- Possibility 3: combination of loop peeling and versioning:
1812 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1813 x = q[i];
1814 p[i] = y;
1816 if (p is aligned) {
1817 for (i = 3; i<N; i++){ # loop 3A
1818 x = q[i]; # DR_MISALIGNMENT(q) = 0
1819 p[i] = y; # DR_MISALIGNMENT(p) = 0
1822 else {
1823 for (i = 3; i<N; i++){ # loop 3B
1824 x = q[i]; # DR_MISALIGNMENT(q) = 0
1825 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1829 These loops are later passed to loop_transform to be vectorized. The
1830 vectorizer will use the alignment information to guide the transformation
1831 (whether to generate regular loads/stores, or with special handling for
1832 misalignment). */
1834 opt_result
1835 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838 dr_vec_info *first_store = NULL;
1839 dr_vec_info *dr0_info = NULL;
1840 struct data_reference *dr;
1841 unsigned int i;
1842 bool do_peeling = false;
1843 bool do_versioning = false;
1844 unsigned int npeel = 0;
1845 bool one_misalignment_known = false;
1846 bool one_misalignment_unknown = false;
1847 bool one_dr_unsupportable = false;
1848 dr_vec_info *unsupportable_dr_info = NULL;
1849 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1850 hash_table<peel_info_hasher> peeling_htab (1);
1852 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1854 /* Reset data so we can safely be called multiple times. */
1855 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1856 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1858 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1859 return opt_result::success ();
1861 /* Sort the vector of datarefs so DRs that have the same or dependent
1862 alignment are next to each other. */
1863 auto_vec<data_reference_p> datarefs
1864 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1865 datarefs.qsort (dr_align_group_sort_cmp);
1867 /* Compute the number of DRs that become aligned when we peel
1868 a dataref so it becomes aligned. */
1869 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1870 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1871 unsigned i0;
1872 for (i0 = 0; i0 < datarefs.length (); ++i0)
1873 if (DR_BASE_ADDRESS (datarefs[i0]))
1874 break;
1875 for (i = i0 + 1; i <= datarefs.length (); ++i)
1877 if (i == datarefs.length ()
1878 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1879 DR_BASE_ADDRESS (datarefs[i]), 0)
1880 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1881 DR_OFFSET (datarefs[i]), 0)
1882 || !operand_equal_p (DR_STEP (datarefs[i0]),
1883 DR_STEP (datarefs[i]), 0))
1885 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1886 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1887 will get known misalignment if we align one of the refs
1888 with the largest DR_TARGET_ALIGNMENT. */
1889 for (unsigned j = i0; j < i; ++j)
1891 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1892 for (unsigned k = i0; k < i; ++k)
1894 if (k == j)
1895 continue;
1896 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1897 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1898 dr_infoj))
1899 n_same_align_refs[j]++;
1902 i0 = i;
1906 /* While cost model enhancements are expected in the future, the high level
1907 view of the code at this time is as follows:
1909 A) If there is a misaligned access then see if peeling to align
1910 this access can make all data references satisfy
1911 vect_supportable_dr_alignment. If so, update data structures
1912 as needed and return true.
1914 B) If peeling wasn't possible and there is a data reference with an
1915 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1916 then see if loop versioning checks can be used to make all data
1917 references satisfy vect_supportable_dr_alignment. If so, update
1918 data structures as needed and return true.
1920 C) If neither peeling nor versioning were successful then return false if
1921 any data reference does not satisfy vect_supportable_dr_alignment.
1923 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1925 Note, Possibility 3 above (which is peeling and versioning together) is not
1926 being done at this time. */
1928 /* (1) Peeling to force alignment. */
1930 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1931 Considerations:
1932 + How many accesses will become aligned due to the peeling
1933 - How many accesses will become unaligned due to the peeling,
1934 and the cost of misaligned accesses.
1935 - The cost of peeling (the extra runtime checks, the increase
1936 in code size). */
1938 FOR_EACH_VEC_ELT (datarefs, i, dr)
1940 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1941 if (!vect_relevant_for_alignment_p (dr_info))
1942 continue;
1944 stmt_vec_info stmt_info = dr_info->stmt;
1945 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1946 do_peeling = vector_alignment_reachable_p (dr_info);
1947 if (do_peeling)
1949 if (known_alignment_for_access_p (dr_info, vectype))
1951 unsigned int npeel_tmp = 0;
1952 bool negative = tree_int_cst_compare (DR_STEP (dr),
1953 size_zero_node) < 0;
1955 /* If known_alignment_for_access_p then we have set
1956 DR_MISALIGNMENT, which is only done if we know it at compile
1957 time, so it is safe to assume the target alignment is constant.  */
1959 unsigned int target_align =
1960 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1961 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1962 poly_int64 off = 0;
1963 if (negative)
1964 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1965 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1966 mis = negative ? mis : -mis;
1967 if (mis != 0)
1968 npeel_tmp = (mis & (target_align - 1)) / dr_size;
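/* Worked example with invented numbers: for a forward access with
   target_align = 16, dr_size = 4 and dr_misalignment = 4, the negated
   misalignment masked by 15 is 12, so npeel_tmp = 12 / 4 = 3; peeling
   three scalar iterations advances the start address by 12 bytes onto a
   16-byte boundary.  */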
1970 /* For multiple types, it is possible that the bigger type access
1971 will have more than one peeling option. E.g., a loop with two
1972 types: one of size (vector size / 4), and the other one of
1973 size (vector size / 8). The vectorization factor will be 8. If both
1974 accesses are misaligned by 3, the first one needs one scalar
1975 iteration to be aligned, and the second one needs 5. But the
1976 first one will be aligned also by peeling 5 scalar
1977 iterations, and in that case both accesses will be aligned.
1978 Hence, except for the immediate peeling amount, we also want
1979 to try to add full vector size, while we don't exceed
1980 vectorization factor.
1981 We do this automatically for the cost model, since we calculate
1982 the cost of every peeling option. */
1983 poly_uint64 nscalars = npeel_tmp;
1984 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1986 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1987 nscalars = (STMT_SLP_TYPE (stmt_info)
1988 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991 /* Save info about DR in the hash table. Also include peeling
1992 amounts according to the explanation above. Indicate
1993 the alignment status when the ref is not aligned.
1994 ??? Rather than using unknown alignment here we should
1995 prune all entries from the peeling hashtable which cause
1996 DRs to be not supported. */
1997 bool supportable_if_not_aligned
1998 = vect_supportable_dr_alignment
1999 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2000 while (known_le (npeel_tmp, nscalars))
2002 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2003 dr_info, npeel_tmp,
2004 supportable_if_not_aligned);
2005 npeel_tmp += MAX (1, target_align / dr_size);
2008 one_misalignment_known = true;
2010 else
2012 /* If we don't know any misalignment values, we prefer
2013 peeling for the data-ref that has the maximum number of data-refs
2014 with the same alignment, unless the target prefers to align
2015 stores over loads. */
2016 unsigned same_align_drs = n_same_align_refs[i];
2017 if (!dr0_info
2018 || dr0_same_align_drs < same_align_drs)
2020 dr0_same_align_drs = same_align_drs;
2021 dr0_info = dr_info;
2023 /* For data-refs with the same number of related
2024 accesses prefer the one where the misalign
2025 computation will be invariant in the outermost loop. */
2026 else if (dr0_same_align_drs == same_align_drs)
2028 class loop *ivloop0, *ivloop;
2029 ivloop0 = outermost_invariant_loop_for_expr
2030 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2031 ivloop = outermost_invariant_loop_for_expr
2032 (loop, DR_BASE_ADDRESS (dr));
2033 if ((ivloop && !ivloop0)
2034 || (ivloop && ivloop0
2035 && flow_loop_nested_p (ivloop, ivloop0)))
2036 dr0_info = dr_info;
2039 one_misalignment_unknown = true;
2041 /* Check for data refs with unsupportable alignment that
2042 can be peeled. */
2043 enum dr_alignment_support supportable_dr_alignment
2044 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2045 DR_MISALIGNMENT_UNKNOWN);
2046 if (supportable_dr_alignment == dr_unaligned_unsupported)
2048 one_dr_unsupportable = true;
2049 unsupportable_dr_info = dr_info;
2052 if (!first_store && DR_IS_WRITE (dr))
2054 first_store = dr_info;
2055 first_store_same_align_drs = same_align_drs;
2059 else
2061 if (!aligned_access_p (dr_info, vectype))
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2065 "vector alignment may not be reachable\n");
2066 break;
2071 /* Check if we can possibly peel the loop. */
2072 if (!vect_can_advance_ivs_p (loop_vinfo)
2073 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2074 || loop->inner)
2075 do_peeling = false;
2077 struct _vect_peel_extended_info peel_for_known_alignment;
2078 struct _vect_peel_extended_info peel_for_unknown_alignment;
2079 struct _vect_peel_extended_info best_peel;
2081 peel_for_unknown_alignment.inside_cost = INT_MAX;
2082 peel_for_unknown_alignment.outside_cost = INT_MAX;
2083 peel_for_unknown_alignment.peel_info.count = 0;
2085 if (do_peeling
2086 && one_misalignment_unknown)
2088 /* Check whether the target prefers aligning stores over loads, i.e.,
2089 whether misaligned stores are more expensive than misaligned loads
2090 (taking DRs with the same alignment into account). */
2091 unsigned int load_inside_cost = 0;
2092 unsigned int load_outside_cost = 0;
2093 unsigned int store_inside_cost = 0;
2094 unsigned int store_outside_cost = 0;
2095 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2097 stmt_vector_for_cost dummy;
2098 dummy.create (2);
2099 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2100 &load_inside_cost,
2101 &load_outside_cost,
2102 &dummy, &dummy, estimated_npeels);
2103 dummy.release ();
2105 if (first_store)
2107 dummy.create (2);
2108 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2109 &store_inside_cost,
2110 &store_outside_cost,
2111 &dummy, &dummy,
2112 estimated_npeels);
2113 dummy.release ();
2115 else
2117 store_inside_cost = INT_MAX;
2118 store_outside_cost = INT_MAX;
2121 if (load_inside_cost > store_inside_cost
2122 || (load_inside_cost == store_inside_cost
2123 && load_outside_cost > store_outside_cost))
2125 dr0_info = first_store;
2126 dr0_same_align_drs = first_store_same_align_drs;
2127 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2128 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2130 else
2132 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2133 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2137 prologue_cost_vec.create (2);
2138 epilogue_cost_vec.create (2);
2140 int dummy2;
2141 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2142 (loop_vinfo, estimated_npeels, &dummy2,
2143 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2144 &prologue_cost_vec, &epilogue_cost_vec);
2146 prologue_cost_vec.release ();
2147 epilogue_cost_vec.release ();
2149 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152 peel_for_unknown_alignment.peel_info.npeel = 0;
2153 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2155 best_peel = peel_for_unknown_alignment;
2157 peel_for_known_alignment.inside_cost = INT_MAX;
2158 peel_for_known_alignment.outside_cost = INT_MAX;
2159 peel_for_known_alignment.peel_info.count = 0;
2160 peel_for_known_alignment.peel_info.dr_info = NULL;
2162 if (do_peeling && one_misalignment_known)
2164 /* Peeling is possible, and no data access requires alignment in order
2165 to be supported. So we try to choose the best possible peeling from
2166 the hash table. */
2167 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2168 (&peeling_htab, loop_vinfo);
2171 /* Compare costs of peeling for known and unknown alignment. */
2172 if (peel_for_known_alignment.peel_info.dr_info != NULL
2173 && peel_for_unknown_alignment.inside_cost
2174 >= peel_for_known_alignment.inside_cost)
2176 best_peel = peel_for_known_alignment;
2178 /* If the best peeling for known alignment has NPEEL == 0, perform no
2179 peeling at all except if there is an unsupportable dr that we can
2180 align. */
2181 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2182 do_peeling = false;
2185 /* If there is an unsupportable data ref, prefer this over all choices so far
2186 since we'd have to discard a chosen peeling except when it accidentally
2187 aligned the unsupportable data ref. */
2188 if (one_dr_unsupportable)
2189 dr0_info = unsupportable_dr_info;
2190 else if (do_peeling)
2192 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2193 TODO: Use nopeel_outside_cost or get rid of it? */
2194 unsigned nopeel_inside_cost = 0;
2195 unsigned nopeel_outside_cost = 0;
2197 stmt_vector_for_cost dummy;
2198 dummy.create (2);
2199 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2200 &nopeel_outside_cost, &dummy, &dummy, 0);
2201 dummy.release ();
2203 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2204 costs will be recorded. */
2205 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2206 prologue_cost_vec.create (2);
2207 epilogue_cost_vec.create (2);
2209 int dummy2;
2210 nopeel_outside_cost += vect_get_known_peeling_cost
2211 (loop_vinfo, 0, &dummy2,
2212 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2213 &prologue_cost_vec, &epilogue_cost_vec);
2215 prologue_cost_vec.release ();
2216 epilogue_cost_vec.release ();
2218 npeel = best_peel.peel_info.npeel;
2219 dr0_info = best_peel.peel_info.dr_info;
2221 /* If not peeling is no more expensive than the best peeling found
2222 so far, don't perform any peeling. */
2223 if (nopeel_inside_cost <= best_peel.inside_cost)
2224 do_peeling = false;
2227 if (do_peeling)
2229 stmt_vec_info stmt_info = dr0_info->stmt;
2230 if (known_alignment_for_access_p (dr0_info,
2231 STMT_VINFO_VECTYPE (stmt_info)))
2233 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2234 size_zero_node) < 0;
2235 if (!npeel)
2237 /* Since it's known at compile time, compute the number of
2238 iterations in the peeled loop (the peeling factor) for use in
2239 updating DR_MISALIGNMENT values. The peeling factor is the
2240 vectorization factor minus the misalignment as an element
2241 count. */
2242 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2243 poly_int64 off = 0;
2244 if (negative)
2245 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2246 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2247 unsigned int mis
2248 = dr_misalignment (dr0_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 /* If known_alignment_for_access_p then we have set
2251 DR_MISALIGNMENT, which is only done if we know it at compile
2252 time, so it is safe to assume the target alignment is constant.  */
2254 unsigned int target_align =
2255 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2256 npeel = ((mis & (target_align - 1))
2257 / vect_get_scalar_dr_size (dr0_info));
2260 /* For interleaved data access every iteration accesses all the
2261 members of the group, therefore we divide the number of iterations
2262 by the group size. */
2263 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2264 npeel /= DR_GROUP_SIZE (stmt_info);
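/* Worked example with invented numbers: if the element-count peel computed
   above is 6 and DR0 leads an interleaved group of size 2, each scalar
   iteration covers both group members, so only 6 / 2 = 3 loop iterations
   need to be peeled.  */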
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Try peeling by %d\n", npeel);
2271 /* Ensure that all datarefs can be vectorized after the peel. */
2272 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2273 do_peeling = false;
2275 /* Check if all datarefs are supportable and log. */
2276 if (do_peeling
2277 && npeel == 0
2278 && known_alignment_for_access_p (dr0_info,
2279 STMT_VINFO_VECTYPE (stmt_info)))
2280 return opt_result::success ();
2282 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2283 if (do_peeling)
2285 unsigned max_allowed_peel
2286 = param_vect_max_peeling_for_alignment;
2287 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2288 max_allowed_peel = 0;
2289 if (max_allowed_peel != (unsigned)-1)
2291 unsigned max_peel = npeel;
2292 if (max_peel == 0)
2294 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2295 unsigned HOST_WIDE_INT target_align_c;
2296 if (target_align.is_constant (&target_align_c))
2297 max_peel =
2298 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2299 else
2301 do_peeling = false;
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "Disable peeling, max peels set and vector"
2305 " alignment unknown\n");
2308 if (max_peel > max_allowed_peel)
2310 do_peeling = false;
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "Disable peeling, max peels reached: %d\n", max_peel);
2318 /* Cost model #2 - if peeling may result in a remaining loop not
2319 iterating enough to be vectorized then do not peel. Since this
2320 is a cost heuristic rather than a correctness decision, use the
2321 most likely runtime value for variable vectorization factors. */
2322 if (do_peeling
2323 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2325 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2326 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2327 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2328 < assumed_vf + max_peel)
2329 do_peeling = false;
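/* Worked example with invented numbers: with an assumed VF of 4 and
   npeel == 0 the worst case is max_peel = 3; a loop known to run only
   6 iterations fails the check because 6 < 4 + 3, i.e. after peeling up
   to 3 iterations fewer than one full vector iteration could remain, so
   peeling is rejected.  */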
2332 if (do_peeling)
2334 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2335 If the misalignment of DR_i is identical to that of dr0 then set
2336 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2337 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2338 by the peeling factor times the element size of DR_i (MOD the
2339 vectorization factor times the size). Otherwise, the
2340 misalignment of DR_i must be set to unknown. */
2341 FOR_EACH_VEC_ELT (datarefs, i, dr)
2342 if (dr != dr0_info->dr)
2344 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2345 if (!vect_relevant_for_alignment_p (dr_info))
2346 continue;
2348 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2352 if (npeel)
2353 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2354 else
2355 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2356 SET_DR_MISALIGNMENT (dr0_info,
2357 vect_dr_misalign_for_aligned_access (dr0_info));
2358 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_NOTE, vect_location,
2361 "Alignment of access forced using peeling.\n");
2362 dump_printf_loc (MSG_NOTE, vect_location,
2363 "Peeling for alignment will be applied.\n");
2366 /* The inside-loop cost will be accounted for in vectorizable_load
2367 and vectorizable_store correctly with adjusted alignments.
2368 Drop the body_cst_vec on the floor here. */
2369 return opt_result::success ();
2373 /* (2) Versioning to force alignment. */
2375 /* Try versioning if:
2376 1) optimize loop for speed and the cost-model is not cheap
2377 2) there is at least one unsupported misaligned data ref with an unknown
2378 misalignment, and
2379 3) all misaligned data refs with a known misalignment are supported, and
2380 4) the number of runtime alignment checks is within reason. */
2382 do_versioning
2383 = (optimize_loop_nest_for_speed_p (loop)
2384 && !loop->inner /* FORNOW */
2385 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2387 if (do_versioning)
2389 FOR_EACH_VEC_ELT (datarefs, i, dr)
2391 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2392 if (!vect_relevant_for_alignment_p (dr_info))
2393 continue;
2395 stmt_vec_info stmt_info = dr_info->stmt;
2396 if (STMT_VINFO_STRIDED_P (stmt_info))
2398 do_versioning = false;
2399 break;
2402 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2403 bool negative = tree_int_cst_compare (DR_STEP (dr),
2404 size_zero_node) < 0;
2405 poly_int64 off = 0;
2406 if (negative)
2407 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2408 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2409 int misalignment;
2410 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2411 continue;
2413 enum dr_alignment_support supportable_dr_alignment
2414 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2415 misalignment);
2416 if (supportable_dr_alignment == dr_unaligned_unsupported)
2418 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2419 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2420 >= (unsigned) param_vect_max_version_for_alignment_checks))
2422 do_versioning = false;
2423 break;
2426 /* At present we don't support versioning for alignment
2427 with variable VF, since there's no guarantee that the
2428 VF is a power of two. We could relax this if we added
2429 a way of enforcing a power-of-two size. */
2430 unsigned HOST_WIDE_INT size;
2431 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2433 do_versioning = false;
2434 break;
2437 /* Forcing alignment in the first iteration is no good if
2438 we don't keep it across iterations. For now, just disable
2439 versioning in this case.
2440 ??? We could actually unroll the loop to achieve the required
2441 overall step alignment, and forcing the alignment could be
2442 done by doing some iterations of the non-vectorized loop. */
2443 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2444 * DR_STEP_ALIGNMENT (dr),
2445 DR_TARGET_ALIGNMENT (dr_info)))
2447 do_versioning = false;
2448 break;
2451 /* The rightmost bits of an aligned address must be zeros.
2452 Construct the mask needed for this test. For example,
2453 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2454 mask must be 15 = 0xf. */
2455 int mask = size - 1;
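/* As a sketch of where MASK ends up (the actual guard is built later when
   the versioning condition is generated, see
   vect_create_cond_for_align_checks): the run-time test conceptually is
     ((addr_1 | addr_2 | ... | addr_n) & mask) == 0
   i.e. a single AND checks that every recorded address has its low
   log2 (size) bits clear, e.g. (addr & 0xf) == 0 for 16-byte vectors.  */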
2457 /* FORNOW: use the same mask to test all potentially unaligned
2458 references in the loop. */
2459 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2460 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2462 do_versioning = false;
2463 break;
2466 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2467 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2471 /* Versioning requires at least one misaligned data reference. */
2472 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2473 do_versioning = false;
2474 else if (!do_versioning)
2475 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478 if (do_versioning)
2480 const vec<stmt_vec_info> &may_misalign_stmts
2481 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2482 stmt_vec_info stmt_info;
2484 /* It can now be assumed that the data references in the statements
2485 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2486 of the loop being vectorized. */
2487 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2489 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2490 SET_DR_MISALIGNMENT (dr_info,
2491 vect_dr_misalign_for_aligned_access (dr_info));
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_NOTE, vect_location,
2494 "Alignment of access forced using versioning.\n");
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "Versioning for alignment will be applied.\n");
2501 /* Peeling and versioning can't be done together at this time. */
2502 gcc_assert (! (do_peeling && do_versioning));
2504 return opt_result::success ();
2507 /* This point is reached if neither peeling nor versioning is being done. */
2508 gcc_assert (! (do_peeling || do_versioning));
2510 return opt_result::success ();
2514 /* Function vect_analyze_data_refs_alignment
2516 Analyze the alignment of the data-references in the loop.
2517 Return FALSE if a data reference is found that cannot be vectorized. */
2519 opt_result
2520 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2522 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2524 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2525 struct data_reference *dr;
2526 unsigned int i;
2528 vect_record_base_alignments (loop_vinfo);
2529 FOR_EACH_VEC_ELT (datarefs, i, dr)
2531 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2532 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2534 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2535 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2536 continue;
2537 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2538 STMT_VINFO_VECTYPE (dr_info->stmt));
2542 return opt_result::success ();
2546 /* Analyze alignment of DRs of stmts in NODE. */
2548 static bool
2549 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2551 /* Alignment is maintained in the first element of the group. */
2552 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2553 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2554 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2555 tree vectype = SLP_TREE_VECTYPE (node);
2556 poly_uint64 vector_alignment
2557 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2558 BITS_PER_UNIT);
2559 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2560 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2561 /* Re-analyze alignment when we're facing a vectorization with a bigger
2562 alignment requirement. */
2563 else if (known_lt (dr_info->target_alignment, vector_alignment))
2565 poly_uint64 old_target_alignment = dr_info->target_alignment;
2566 int old_misalignment = dr_info->misalignment;
2567 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2568 /* But keep knowledge about a smaller alignment. */
2569 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2570 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2572 dr_info->target_alignment = old_target_alignment;
2573 dr_info->misalignment = old_misalignment;
2576 /* If we ever face unordered target alignments, the first one analyzed wins
2577 and the other will be treated as unknown by dr_misalignment. */
2578 return true;
2581 /* Function vect_slp_analyze_instance_alignment
2583 Analyze the alignment of the data-references in the SLP instance.
2584 Return FALSE if a data reference is found that cannot be vectorized. */
2586 bool
2587 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2588 slp_instance instance)
2590 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2592 slp_tree node;
2593 unsigned i;
2594 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2595 if (! vect_slp_analyze_node_alignment (vinfo, node))
2596 return false;
2598 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2599 && ! vect_slp_analyze_node_alignment
2600 (vinfo, SLP_INSTANCE_TREE (instance)))
2601 return false;
2603 return true;
2607 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2608 accesses of legal size, step, etc. Detect gaps, single element
2609 interleaving, and other special cases. Set grouped access info.
2610 Collect groups of strided stores for further use in SLP analysis.
2611 Worker for vect_analyze_group_access. */
2613 static bool
2614 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2616 data_reference *dr = dr_info->dr;
2617 tree step = DR_STEP (dr);
2618 tree scalar_type = TREE_TYPE (DR_REF (dr));
2619 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2620 stmt_vec_info stmt_info = dr_info->stmt;
2621 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2622 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2623 HOST_WIDE_INT dr_step = -1;
2624 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2625 bool slp_impossible = false;
2627 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2628 size of the interleaving group (including gaps). */
2629 if (tree_fits_shwi_p (step))
2631 dr_step = tree_to_shwi (step);
2632 /* Check that STEP is a multiple of type size. Otherwise there is
2633 a non-element-sized gap at the end of the group which we
2634 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2635 ??? As we can handle non-constant step fine here we should
2636 simply remove uses of DR_GROUP_GAP between the last and first
2637 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2638 simply not include that gap. */
2639 if ((dr_step % type_size) != 0)
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_NOTE, vect_location,
2643 "Step %T is not a multiple of the element size"
2644 " for %T\n",
2645 step, DR_REF (dr));
2646 return false;
2648 groupsize = absu_hwi (dr_step) / type_size;
2650 else
2651 groupsize = 0;
2653 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2654 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2656 /* Check whether this DR is part of an interleaving group and is the single
2657 element of the group that is accessed in the loop. */
2659 /* Gaps are supported only for loads. STEP must be a multiple of the type
2660 size. */
2661 if (DR_IS_READ (dr)
2662 && (dr_step % type_size) == 0
2663 && groupsize > 0
2664 /* This could be UINT_MAX but as we are generating code in a very
2665 inefficient way we have to cap earlier.
2666 See PR91403 for example. */
2667 && groupsize <= 4096)
2669 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2670 DR_GROUP_SIZE (stmt_info) = groupsize;
2671 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_NOTE, vect_location,
2674 "Detected single element interleaving %T"
2675 " step %T\n",
2676 DR_REF (dr), step);
2678 return true;
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2683 "not consecutive access %G", stmt_info->stmt);
2685 if (bb_vinfo)
2687 /* Mark the statement as unvectorizable. */
2688 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2689 return true;
2692 if (dump_enabled_p ())
2693 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2694 STMT_VINFO_STRIDED_P (stmt_info) = true;
2695 return true;
2698 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2700 /* First stmt in the interleaving chain. Check the chain. */
2701 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2702 struct data_reference *data_ref = dr;
2703 unsigned int count = 1;
2704 tree prev_init = DR_INIT (data_ref);
2705 HOST_WIDE_INT diff, gaps = 0;
2707 /* By construction, all group members have INTEGER_CST DR_INITs. */
2708 while (next)
2710 /* We never have the same DR multiple times. */
2711 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2712 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2714 data_ref = STMT_VINFO_DATA_REF (next);
2716 /* All group members have the same STEP by construction. */
2717 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2719 /* Check that the distance between two accesses is equal to the type
2720 size. Otherwise, we have gaps. */
2721 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2722 - TREE_INT_CST_LOW (prev_init)) / type_size;
2723 if (diff < 1 || diff > UINT_MAX)
2725 /* For artificial testcases with array accesses with large
2726 constant indices we can run into overflow issues which
2727 can end up fooling the groupsize constraint below so
2728 check the individual gaps (which are represented as
2729 unsigned int) as well. */
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2732 "interleaved access with gap larger "
2733 "than representable\n");
2734 return false;
2736 if (diff != 1)
2738 /* FORNOW: SLP of accesses with gaps is not supported. */
2739 slp_impossible = true;
2740 if (DR_IS_WRITE (data_ref))
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744 "interleaved store with gaps\n");
2745 return false;
2748 gaps += diff - 1;
2751 last_accessed_element += diff;
2753 /* Store the gap from the previous member of the group. If there is no
2754 gap in the access, DR_GROUP_GAP is always 1. */
2755 DR_GROUP_GAP (next) = diff;
2757 prev_init = DR_INIT (data_ref);
2758 next = DR_GROUP_NEXT_ELEMENT (next);
2759 /* Count the number of data-refs in the chain. */
2760 count++;
2763 if (groupsize == 0)
2764 groupsize = count + gaps;
2766 /* This could be UINT_MAX but as we are generating code in a very
2767 inefficient way we have to cap earlier. See PR78699 for example. */
2768 if (groupsize > 4096)
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2772 "group is too large\n");
2773 return false;
2776 /* Check that the size of the interleaving is equal to count for stores,
2777 i.e., that there are no gaps. */
2778 if (groupsize != count
2779 && !DR_IS_READ (dr))
2781 groupsize = count;
2782 STMT_VINFO_STRIDED_P (stmt_info) = true;
2785 /* If there is a gap after the last load in the group it is the
2786 difference between the groupsize and the last accessed
2787 element.
2788 When there is no gap, this difference should be 0. */
2789 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
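/* Worked example with invented numbers: 4-byte loads with DR_STEP = 16 at
   inits 0, 8 and 12 give groupsize = 16 / 4 = 4, per-member diffs of 2 and
   1 (so DR_GROUP_GAP of the second and third member are 2 and 1), count = 3,
   gaps = 1 and last_accessed_element = 4, hence no trailing gap; with inits
   0, 4 and 8 instead, last_accessed_element = 3 and the group leader gets
   DR_GROUP_GAP = 1 for the unused trailing element.  */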
2791 DR_GROUP_SIZE (stmt_info) = groupsize;
2792 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE, vect_location,
2795 "Detected interleaving ");
2796 if (DR_IS_READ (dr))
2797 dump_printf (MSG_NOTE, "load ");
2798 else if (STMT_VINFO_STRIDED_P (stmt_info))
2799 dump_printf (MSG_NOTE, "strided store ");
2800 else
2801 dump_printf (MSG_NOTE, "store ");
2802 dump_printf (MSG_NOTE, "of size %u\n",
2803 (unsigned)groupsize);
2804 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2805 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2806 while (next)
2808 if (DR_GROUP_GAP (next) != 1)
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "\t<gap of %d elements>\n",
2811 DR_GROUP_GAP (next) - 1);
2812 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2813 next = DR_GROUP_NEXT_ELEMENT (next);
2815 if (DR_GROUP_GAP (stmt_info) != 0)
2816 dump_printf_loc (MSG_NOTE, vect_location,
2817 "\t<gap of %d elements>\n",
2818 DR_GROUP_GAP (stmt_info));
2821 /* SLP: create an SLP data structure for every interleaving group of
2822 stores for further analysis in vect_analyze_slp. */
2823 if (DR_IS_WRITE (dr) && !slp_impossible)
2825 if (loop_vinfo)
2826 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2827 if (bb_vinfo)
2828 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2832 return true;
2835 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2836 accesses of legal size, step, etc. Detect gaps, single element
2837 interleaving, and other special cases. Set grouped access info.
2838 Collect groups of strided stores for further use in SLP analysis. */
2840 static bool
2841 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2843 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2845 /* Dissolve the group if present. */
2846 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2847 while (stmt_info)
2849 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2850 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2851 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2852 stmt_info = next;
2854 return false;
2856 return true;
2859 /* Analyze the access pattern of the data-reference DR_INFO.
2860 In case of non-consecutive accesses call vect_analyze_group_access() to
2861 analyze groups of accesses. */
2863 static bool
2864 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2866 data_reference *dr = dr_info->dr;
2867 tree step = DR_STEP (dr);
2868 tree scalar_type = TREE_TYPE (DR_REF (dr));
2869 stmt_vec_info stmt_info = dr_info->stmt;
2870 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2871 class loop *loop = NULL;
2873 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2874 return true;
2876 if (loop_vinfo)
2877 loop = LOOP_VINFO_LOOP (loop_vinfo);
2879 if (loop_vinfo && !step)
2881 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2883 "bad data-ref access in loop\n");
2884 return false;
2887 /* Allow loads with zero step in inner-loop vectorization. */
2888 if (loop_vinfo && integer_zerop (step))
2890 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2891 if (!nested_in_vect_loop_p (loop, stmt_info))
2892 return DR_IS_READ (dr);
2893 /* Allow references with zero step for outer loops marked
2894 with pragma omp simd only - it guarantees absence of
2895 loop-carried dependencies between inner loop iterations. */
2896 if (loop->safelen < 2)
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location,
2900 "zero step in inner loop of nest\n");
2901 return false;
2905 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2907 /* Interleaved accesses are not yet supported within outer-loop
2908 vectorization for references in the inner-loop. */
2909 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2911 /* For the rest of the analysis we use the outer-loop step. */
2912 step = STMT_VINFO_DR_STEP (stmt_info);
2913 if (integer_zerop (step))
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "zero step in outer loop.\n");
2918 return DR_IS_READ (dr);
2922 /* Consecutive? */
2923 if (TREE_CODE (step) == INTEGER_CST)
2925 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2926 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2927 || (dr_step < 0
2928 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2930 /* Mark that it is not interleaving. */
2931 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2932 return true;
2936 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 "grouped access in outer loop.\n");
2941 return false;
2945 /* Assume this is a DR handled by non-constant strided load case. */
2946 if (TREE_CODE (step) != INTEGER_CST)
2947 return (STMT_VINFO_STRIDED_P (stmt_info)
2948 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2949 || vect_analyze_group_access (vinfo, dr_info)));
2951 /* Not a consecutive access - check whether it is part of an interleaving group. */
2952 return vect_analyze_group_access (vinfo, dr_info);
2955 /* Compare two data-references DRA and DRB so that they can be sorted into
2956 chunks that are candidates for forming access groups. */
2958 static int
2959 dr_group_sort_cmp (const void *dra_, const void *drb_)
2961 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2962 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2963 data_reference_p dra = dra_info->dr;
2964 data_reference_p drb = drb_info->dr;
2965 int cmp;
2967 /* Stabilize sort. */
2968 if (dra == drb)
2969 return 0;
2971 /* DRs with different group IDs never belong to the same group. */
2972 if (dra_info->group != drb_info->group)
2973 return dra_info->group < drb_info->group ? -1 : 1;
2975 /* Ordering of DRs according to base. */
2976 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2977 DR_BASE_ADDRESS (drb));
2978 if (cmp != 0)
2979 return cmp;
2981 /* And according to DR_OFFSET. */
2982 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2983 if (cmp != 0)
2984 return cmp;
2986 /* Put reads before writes. */
2987 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2988 return DR_IS_READ (dra) ? -1 : 1;
2990 /* Then sort by access size. */
2991 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2992 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2993 if (cmp != 0)
2994 return cmp;
2996 /* And by DR_STEP. */
2997 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2998 if (cmp != 0)
2999 return cmp;
3001 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
3002 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3003 if (cmp == 0)
3004 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3005 return cmp;
3008 /* If OP is the result of a conversion, return the unconverted value,
3009 otherwise return null. */
3011 static tree
3012 strip_conversion (tree op)
3014 if (TREE_CODE (op) != SSA_NAME)
3015 return NULL_TREE;
3016 gimple *stmt = SSA_NAME_DEF_STMT (op);
3017 if (!is_gimple_assign (stmt)
3018 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3019 return NULL_TREE;
3020 return gimple_assign_rhs1 (stmt);
3023 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3024 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3025 be grouped in SLP mode. */
3027 static bool
3028 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3029 bool allow_slp_p)
3031 if (gimple_assign_single_p (stmt1_info->stmt))
3032 return gimple_assign_single_p (stmt2_info->stmt);
3034 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3035 if (call1 && gimple_call_internal_p (call1))
3037 /* Check for two masked loads or two masked stores. */
3038 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3039 if (!call2 || !gimple_call_internal_p (call2))
3040 return false;
3041 internal_fn ifn = gimple_call_internal_fn (call1);
3042 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3043 return false;
3044 if (ifn != gimple_call_internal_fn (call2))
3045 return false;
3047 /* Check that the masks are the same. Cope with casts of masks,
3048 like those created by build_mask_conversion. */
3049 tree mask1 = gimple_call_arg (call1, 2);
3050 tree mask2 = gimple_call_arg (call2, 2);
3051 if (!operand_equal_p (mask1, mask2, 0)
3052 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3054 mask1 = strip_conversion (mask1);
3055 if (!mask1)
3056 return false;
3057 mask2 = strip_conversion (mask2);
3058 if (!mask2)
3059 return false;
3060 if (!operand_equal_p (mask1, mask2, 0))
3061 return false;
3063 return true;
3066 return false;
3069 /* Function vect_analyze_data_ref_accesses.
3071 Analyze the access pattern of all the data references in the loop.
3073 FORNOW: the only access pattern that is considered vectorizable is a
3074 simple step 1 (consecutive) access.
3076 FORNOW: handle only arrays and pointer accesses. */
3078 opt_result
3079 vect_analyze_data_ref_accesses (vec_info *vinfo,
3080 vec<int> *dataref_groups)
3082 unsigned int i;
3083 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3085 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3087 if (datarefs.is_empty ())
3088 return opt_result::success ();
3090 /* Sort the array of datarefs to make building the interleaving chains
3091 linear. Don't modify the original vector's order; it is needed for
3092 determining what dependencies are reversed. */
3093 vec<dr_vec_info *> datarefs_copy;
3094 datarefs_copy.create (datarefs.length ());
3095 for (unsigned i = 0; i < datarefs.length (); i++)
3097 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3098 /* If the caller computed DR grouping use that, otherwise group by
3099 basic blocks. */
3100 if (dataref_groups)
3101 dr_info->group = (*dataref_groups)[i];
3102 else
3103 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3104 datarefs_copy.quick_push (dr_info);
3106 datarefs_copy.qsort (dr_group_sort_cmp);
3107 hash_set<stmt_vec_info> to_fixup;
3109 /* Build the interleaving chains. */
3110 for (i = 0; i < datarefs_copy.length () - 1;)
3112 dr_vec_info *dr_info_a = datarefs_copy[i];
3113 data_reference_p dra = dr_info_a->dr;
3114 int dra_group_id = dr_info_a->group;
3115 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3116 stmt_vec_info lastinfo = NULL;
3117 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3118 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3120 ++i;
3121 continue;
3123 for (i = i + 1; i < datarefs_copy.length (); ++i)
3125 dr_vec_info *dr_info_b = datarefs_copy[i];
3126 data_reference_p drb = dr_info_b->dr;
3127 int drb_group_id = dr_info_b->group;
3128 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3129 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3130 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3131 break;
3133 /* ??? Imperfect sorting (non-compatible types, non-modulo
3134 accesses, same accesses) can lead to a group being artificially
3135 split here as we don't just skip over those. If it really
3136 matters we can push those to a worklist and re-iterate
3137 over them. Then we can just skip ahead to the next DR here. */
3139 /* DRs in a different DR group should not be put into the same
3140 interleaving group. */
3141 if (dra_group_id != drb_group_id)
3142 break;
3144 /* Check that the data-refs have the same first location (except init)
3145 and they are both either store or load (not load and store,
3146 not masked loads or stores). */
3147 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3148 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3149 DR_BASE_ADDRESS (drb)) != 0
3150 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3151 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3152 break;
3154 /* Check that the data-refs have the same constant size. */
3155 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3156 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3157 if (!tree_fits_uhwi_p (sza)
3158 || !tree_fits_uhwi_p (szb)
3159 || !tree_int_cst_equal (sza, szb))
3160 break;
3162 /* Check that the data-refs have the same step. */
3163 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3164 break;
3166 /* Check the types are compatible.
3167 ??? We don't distinguish this during sorting. */
3168 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3169 TREE_TYPE (DR_REF (drb))))
3170 break;
3172 /* Check that the DR_INITs are compile-time constants. */
3173 if (!tree_fits_shwi_p (DR_INIT (dra))
3174 || !tree_fits_shwi_p (DR_INIT (drb)))
3175 break;
3177 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3178 just hold extra information. */
3179 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3180 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3181 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3182 break;
3184 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3185 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3186 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3187 HOST_WIDE_INT init_prev
3188 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3189 gcc_assert (init_a <= init_b
3190 && init_a <= init_prev
3191 && init_prev <= init_b);
3193 /* Do not place the same access in the interleaving chain twice. */
3194 if (init_b == init_prev)
3196 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3197 < gimple_uid (DR_STMT (drb)));
3198 /* Simply link in duplicates and fix up the chain below. */
3200 else
3202 /* If init_b == init_a + the size of the type * k, we have an
3203 interleaving, and DRA is accessed before DRB. */
3204 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3205 if (type_size_a == 0
3206 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3207 % type_size_a != 0))
3208 break;
3210 /* If we have a store, the accesses are adjacent. This splits
3211 groups into chunks we support (we don't support vectorization
3212 of stores with gaps). */
3213 if (!DR_IS_READ (dra)
3214 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3215 != type_size_a))
3216 break;
3218 /* If the step (when constant and nonzero) does not exceed the
3219 difference between the data-refs' inits, this splits groups into
3220 suitable sizes. */
3221 if (tree_fits_shwi_p (DR_STEP (dra)))
3223 unsigned HOST_WIDE_INT step
3224 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3225 if (step != 0
3226 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3227 break;
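/* Worked example with invented numbers: for 4-byte stores at inits 0, 4 and
   12 the chain is cut before the third store because 12 - 4 != 4 (stores
   with gaps are not supported); for 4-byte loads with a constant step of 8
   at inits 0, 4 and 12 the chain is cut before the third load because
   12 - 0 >= 8, keeping each group within one step.  */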
3231 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 DR_IS_READ (dra)
3234 ? "Detected interleaving load %T and %T\n"
3235 : "Detected interleaving store %T and %T\n",
3236 DR_REF (dra), DR_REF (drb));
3238 /* Link the found element into the group list. */
3239 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3241 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3242 lastinfo = stmtinfo_a;
3244 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3245 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3246 lastinfo = stmtinfo_b;
3248 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3249 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3251 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3252 dump_printf_loc (MSG_NOTE, vect_location,
3253 "Load suitable for SLP vectorization only.\n");
3255 if (init_b == init_prev
3256 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3257 && dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "Queuing group with duplicate access for fixup\n");
3263 /* Fixup groups with duplicate entries by splitting them. */
3264 while (1)
3266 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3267 if (!(it != to_fixup.end ()))
3268 break;
3269 stmt_vec_info grp = *it;
3270 to_fixup.remove (grp);
3272 /* Find the earliest duplicate group member. */
3273 unsigned first_duplicate = -1u;
3274 stmt_vec_info next, g = grp;
3275 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3277 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3278 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3279 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3280 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3281 g = next;
3283 if (first_duplicate == -1U)
3284 continue;
3286 /* Then move all stmts after the first duplicate to a new group.
3287 Note this is a heuristic but one with the property that *it
3288 is fixed up completely. */
3289 g = grp;
3290 stmt_vec_info newgroup = NULL, ng = grp;
3291 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3293 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3295 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3296 if (!newgroup)
3297 newgroup = next;
3298 else
3299 DR_GROUP_NEXT_ELEMENT (ng) = next;
3300 ng = next;
3301 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3303 else
3304 g = DR_GROUP_NEXT_ELEMENT (g);
3306 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3308 /* Fixup the new group, which may still contain duplicates. */
3309 to_fixup.add (newgroup);
3312 dr_vec_info *dr_info;
3313 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3315 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3316 && !vect_analyze_data_ref_access (vinfo, dr_info))
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "not vectorized: complicated access pattern.\n");
3322 if (is_a <bb_vec_info> (vinfo))
3324 /* Mark the statement as not vectorizable. */
3325 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3326 continue;
3328 else
3330 datarefs_copy.release ();
3331 return opt_result::failure_at (dr_info->stmt->stmt,
3332 "not vectorized:"
3333 " complicated access pattern.\n");
3338 datarefs_copy.release ();
3339 return opt_result::success ();
3342 /* Function vect_vfa_segment_size.
3344 Input:
3345 DR_INFO: The data reference.
3346 LENGTH_FACTOR: segment length to consider.
3348 Return a value suitable for the dr_with_seg_len::seg_len field.
3349 This is the "distance travelled" by the pointer from the first
3350 iteration in the segment to the last. Note that it does not include
3351 the size of the access; in effect it only describes the first byte. */
3353 static tree
3354 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3356 length_factor = size_binop (MINUS_EXPR,
3357 fold_convert (sizetype, length_factor),
3358 size_one_node);
3359 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3360 length_factor);
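/* Worked example with invented numbers: for DR_STEP = 4 bytes and
   LENGTH_FACTOR = 16 iterations the segment length is 4 * (16 - 1) = 60
   bytes, i.e. the distance from the first to the last iteration's access;
   the size of the access itself is accounted for separately by
   vect_vfa_access_size.  */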
3363 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3364 gives the worst-case number of bytes covered by the segment. */
3366 static unsigned HOST_WIDE_INT
3367 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3369 stmt_vec_info stmt_vinfo = dr_info->stmt;
3370 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3371 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3372 unsigned HOST_WIDE_INT access_size = ref_size;
3373 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3375 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3376 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3378 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3379 int misalignment;
3380 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3381 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3382 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3383 == dr_explicit_realign_optimized))
3385 /* We might access a full vector's worth. */
3386 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3388 return access_size;
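/* Worked example with invented numbers: a 4-byte scalar ref that leads an
   interleaved group of size 4 with DR_GROUP_GAP = 1 yields
   access_size = 4 * (4 - 1) = 12 bytes; if the access ends up using the
   dr_explicit_realign_optimized scheme, a further 16 - 4 = 12 bytes are
   added for a 16-byte vector, since a full vector's worth may be touched.  */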
3391 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3392 describes. */
3394 static unsigned int
3395 vect_vfa_align (dr_vec_info *dr_info)
3397 return dr_alignment (dr_info->dr);
3400 /* Function vect_no_alias_p.
3402 Given data references A and B with equal base and offset, see whether
3403 the alias relation can be decided at compilation time. Return 1 if
3404 it can and the references alias, 0 if it can and the references do
3405 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3406 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3407 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3409 static int
3410 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3411 tree segment_length_a, tree segment_length_b,
3412 unsigned HOST_WIDE_INT access_size_a,
3413 unsigned HOST_WIDE_INT access_size_b)
3415 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3416 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3417 poly_uint64 const_length_a;
3418 poly_uint64 const_length_b;
3420 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3421 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3422 [a, a+12) */
3423 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3425 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3426 offset_a -= const_length_a;
3428 else
3429 const_length_a = tree_to_poly_uint64 (segment_length_a);
3430 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3432 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3433 offset_b -= const_length_b;
3435 else
3436 const_length_b = tree_to_poly_uint64 (segment_length_b);
3438 const_length_a += access_size_a;
3439 const_length_b += access_size_b;
3441 if (ranges_known_overlap_p (offset_a, const_length_a,
3442 offset_b, const_length_b))
3443 return 1;
3445 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3446 offset_b, const_length_b))
3447 return 0;
3449 return -1;
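/* Illustrative example: with equal bases, DR_INIT offsets 0 and 32,
   positive steps, segment lengths of 16 and access sizes of 4, the
   ranges compared above are [0, 20) and [32, 52); they cannot overlap,
   so 0 (no alias) is returned.  If the second offset were 8 instead,
   the ranges [0, 20) and [8, 28) are known to overlap and 1 is
   returned.  */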
3452 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3453 in DDR is >= VF. */
3455 static bool
3456 dependence_distance_ge_vf (data_dependence_relation *ddr,
3457 unsigned int loop_depth, poly_uint64 vf)
3459 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3460 || DDR_NUM_DIST_VECTS (ddr) == 0)
3461 return false;
3463 /* If the dependence is exact, we should have limited the VF instead. */
3464 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3466 unsigned int i;
3467 lambda_vector dist_v;
3468 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3470 HOST_WIDE_INT dist = dist_v[loop_depth];
3471 if (dist != 0
3472 && !(dist > 0 && DDR_REVERSED_P (ddr))
3473 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3474 return false;
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "dependence distance between %T and %T is >= VF\n",
3480 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3482 return true;
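/* Illustrative example: with a single distance vector whose entry at
   LOOP_DEPTH is 3 and a vectorization factor of 8, the loop above sees
   3 < 8 and returns false, so a runtime alias check is still required;
   a distance of 8 or more would allow the check to be dropped.  */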
3485 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3487 static void
3488 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3490 dump_printf (dump_kind, "%s (%T) >= ",
3491 lower_bound.unsigned_p ? "unsigned" : "abs",
3492 lower_bound.expr);
3493 dump_dec (dump_kind, lower_bound.min_value);
3496 /* Record that the vectorized loop requires the vec_lower_bound described
3497 by EXPR, UNSIGNED_P and MIN_VALUE. */
3499 static void
3500 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3501 poly_uint64 min_value)
3503 vec<vec_lower_bound> &lower_bounds
3504 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3505 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3506 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3508 unsigned_p &= lower_bounds[i].unsigned_p;
3509 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3510 if (lower_bounds[i].unsigned_p != unsigned_p
3511 || maybe_lt (lower_bounds[i].min_value, min_value))
3513 lower_bounds[i].unsigned_p = unsigned_p;
3514 lower_bounds[i].min_value = min_value;
3515 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_NOTE, vect_location,
3518 "updating run-time check to ");
3519 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3520 dump_printf (MSG_NOTE, "\n");
3523 return;
3526 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3527 if (dump_enabled_p ())
3529 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3530 dump_lower_bound (MSG_NOTE, lower_bound);
3531 dump_printf (MSG_NOTE, "\n");
3533 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
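/* Illustrative example: if "unsigned (x) >= 4" was recorded earlier and
   a new request "abs (x) >= 8" arrives for the same EXPR, the entry is
   merged into a single check that implies both requirements: unsigned_p
   becomes false and min_value becomes the maximum of the two values,
   i.e. the loop is versioned on "abs (x) >= 8".  */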
3536 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3537 will span fewer than GAP bytes. */
3539 static bool
3540 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3541 poly_int64 gap)
3543 stmt_vec_info stmt_info = dr_info->stmt;
3544 HOST_WIDE_INT count
3545 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3546 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3547 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3548 return (estimated_poly_value (gap)
3549 <= count * vect_get_scalar_dr_size (dr_info));
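/* Illustrative example: with an estimated vectorization factor of 4, a
   group of 2 accesses and a scalar access size of 4 bytes, the
   vectorized step is expected to span about 4 * 2 * 4 = 32 bytes, so
   any GAP of at most 32 bytes counts as small and a step-based bound
   check is considered worthwhile by the caller.  */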
3552 /* Return true if we know that there is no alias between DR_INFO_A and
3553 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3554 When returning true, set *LOWER_BOUND_OUT to this N. */
3556 static bool
3557 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3558 poly_uint64 *lower_bound_out)
3560 /* Check that there is a constant gap of known sign between DR_A
3561 and DR_B. */
3562 data_reference *dr_a = dr_info_a->dr;
3563 data_reference *dr_b = dr_info_b->dr;
3564 poly_int64 init_a, init_b;
3565 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3566 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3567 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3568 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3569 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3570 || !ordered_p (init_a, init_b))
3571 return false;
3573 /* Sort DR_A and DR_B by the address they access. */
3574 if (maybe_lt (init_b, init_a))
3576 std::swap (init_a, init_b);
3577 std::swap (dr_info_a, dr_info_b);
3578 std::swap (dr_a, dr_b);
3581 /* If the two accesses could be dependent within a scalar iteration,
3582 make sure that we'd retain their order. */
3583 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3584 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3585 return false;
3587 /* There is no alias if abs (DR_STEP) is greater than or equal to
3588 the bytes spanned by the combination of the two accesses. */
3589 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3590 return true;
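/* Illustrative example: for accesses a[i] and a[i + 2] with 4-byte
   elements and matching base, offset and step, init_a = 0 and
   init_b = 8, so *LOWER_BOUND_OUT = 8 + 4 - 0 = 12: the references
   cannot alias once abs (DR_STEP) >= 12 bytes.  */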
3593 /* Function vect_prune_runtime_alias_test_list.
3595 Prune a list of ddrs to be tested at run-time by versioning for alias.
3596 Merge several alias checks into one if possible.
3597 Return FALSE if the resulting list of ddrs is longer than allowed by
3598 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3600 opt_result
3601 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3603 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3604 hash_set <tree_pair_hash> compared_objects;
3606 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3607 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3608 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3609 const vec<vec_object_pair> &check_unequal_addrs
3610 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3611 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3612 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3614 ddr_p ddr;
3615 unsigned int i;
3616 tree length_factor;
3618 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3620 /* Step values are irrelevant for aliasing if the number of vector
3621 iterations is equal to the number of scalar iterations (which can
3622 happen for fully-SLP loops). */
3623 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3625 if (!vf_one_p)
3627 /* Convert the checks for nonzero steps into bound tests. */
3628 tree value;
3629 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3630 vect_check_lower_bound (loop_vinfo, value, true, 1);
3633 if (may_alias_ddrs.is_empty ())
3634 return opt_result::success ();
3636 comp_alias_ddrs.create (may_alias_ddrs.length ());
3638 unsigned int loop_depth
3639 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3640 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3642 /* First, we collect all data ref pairs for aliasing checks. */
3643 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3645 poly_uint64 lower_bound;
3646 tree segment_length_a, segment_length_b;
3647 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3648 unsigned int align_a, align_b;
3650 /* Ignore the alias if the VF we chose ended up being no greater
3651 than the dependence distance. */
3652 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3653 continue;
3655 if (DDR_OBJECT_A (ddr))
3657 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3658 if (!compared_objects.add (new_pair))
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "checking that %T and %T"
3663 " have different addresses\n",
3664 new_pair.first, new_pair.second);
3665 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3667 continue;
3670 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3671 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3673 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3674 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3676 bool preserves_scalar_order_p
3677 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3678 bool ignore_step_p
3679 = (vf_one_p
3680 && (preserves_scalar_order_p
3681 || operand_equal_p (DR_STEP (dr_info_a->dr),
3682 DR_STEP (dr_info_b->dr))));
3684 /* Skip the pair if inter-iteration dependencies are irrelevant
3685 and intra-iteration dependencies are guaranteed to be honored. */
3686 if (ignore_step_p
3687 && (preserves_scalar_order_p
3688 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3689 &lower_bound)))
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_NOTE, vect_location,
3693 "no need for alias check between "
3694 "%T and %T when VF is 1\n",
3695 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3696 continue;
3699 /* See whether we can handle the alias using a bounds check on
3700 the step, and whether that's likely to be the best approach.
3701 (It might not be, for example, if the minimum step is much larger
3702 than the number of bytes handled by one vector iteration.) */
3703 if (!ignore_step_p
3704 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3705 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3706 &lower_bound)
3707 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3708 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3710 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3711 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3714 "%T and %T when the step %T is outside ",
3715 DR_REF (dr_info_a->dr),
3716 DR_REF (dr_info_b->dr),
3717 DR_STEP (dr_info_a->dr));
3718 if (unsigned_p)
3719 dump_printf (MSG_NOTE, "[0");
3720 else
3722 dump_printf (MSG_NOTE, "(");
3723 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3725 dump_printf (MSG_NOTE, ", ");
3726 dump_dec (MSG_NOTE, lower_bound);
3727 dump_printf (MSG_NOTE, ")\n");
3729 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3730 unsigned_p, lower_bound);
3731 continue;
3734 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3735 if (dr_group_first_a)
3737 stmt_info_a = dr_group_first_a;
3738 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3742 if (dr_group_first_b)
3744 stmt_info_b = dr_group_first_b;
3745 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748 if (ignore_step_p)
3750 segment_length_a = size_zero_node;
3751 segment_length_b = size_zero_node;
3753 else
3755 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3756 DR_STEP (dr_info_b->dr), 0))
3757 length_factor = scalar_loop_iters;
3758 else
3759 length_factor = size_int (vect_factor);
3760 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3761 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3763 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3764 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3765 align_a = vect_vfa_align (dr_info_a);
3766 align_b = vect_vfa_align (dr_info_b);
3768 /* See whether the alias is known at compilation time. */
3769 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3770 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3771 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3772 DR_OFFSET (dr_info_b->dr), 0)
3773 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3774 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3775 && poly_int_tree_p (segment_length_a)
3776 && poly_int_tree_p (segment_length_b))
3778 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3779 segment_length_a,
3780 segment_length_b,
3781 access_size_a,
3782 access_size_b);
3783 if (res >= 0 && dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "can tell at compile time that %T and %T",
3787 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3788 if (res == 0)
3789 dump_printf (MSG_NOTE, " do not alias\n");
3790 else
3791 dump_printf (MSG_NOTE, " alias\n");
3794 if (res == 0)
3795 continue;
3797 if (res == 1)
3798 return opt_result::failure_at (stmt_info_b->stmt,
3799 "not vectorized:"
3800 " compilation time alias: %G%G",
3801 stmt_info_a->stmt,
3802 stmt_info_b->stmt);
3805 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3806 access_size_a, align_a);
3807 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3808 access_size_b, align_b);
3809 /* Canonicalize the order to be the one that's needed for accurate
3810 RAW, WAR and WAW flags, in cases where the data references are
3811 well-ordered. The order doesn't really matter otherwise,
3812 but we might as well be consistent. */
3813 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3814 std::swap (dr_a, dr_b);
3816 dr_with_seg_len_pair_t dr_with_seg_len_pair
3817 (dr_a, dr_b, (preserves_scalar_order_p
3818 ? dr_with_seg_len_pair_t::WELL_ORDERED
3819 : dr_with_seg_len_pair_t::REORDERED));
3821 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3826 unsigned int count = (comp_alias_ddrs.length ()
3827 + check_unequal_addrs.length ());
3829 if (count
3830 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3831 == VECT_COST_MODEL_VERY_CHEAP))
3832 return opt_result::failure_at
3833 (vect_location, "would need a runtime alias check\n");
3835 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_NOTE, vect_location,
3837 "improved number of alias checks from %d to %d\n",
3838 may_alias_ddrs.length (), count);
3839 unsigned limit = param_vect_max_version_for_alias_checks;
3840 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3841 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3842 if (count > limit)
3843 return opt_result::failure_at
3844 (vect_location,
3845 "number of versioning for alias run-time tests exceeds %d "
3846 "(--param vect-max-version-for-alias-checks)\n", limit);
3848 return opt_result::success ();
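/* Note on the limits above (illustrative arithmetic only): with
   --param vect-max-version-for-alias-checks=10, the cheap cost model
   reduces the number of permitted checks to 10 * 6 / 10 = 6, while the
   very-cheap cost model rejects vectorization as soon as any runtime
   alias or address check would be needed.  */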
3851 /* Check whether we can use an internal function for a gather load
3852 or scatter store. READ_P is true for loads and false for stores.
3853 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3854 the type of the memory elements being loaded or stored. OFFSET_TYPE
3855 is the type of the offset that is being applied to the invariant
3856 base address. SCALE is the amount by which the offset should
3857 be multiplied *after* it has been converted to address width.
3859 Return true if the function is supported, storing the function id in
3860 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3862 bool
3863 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3864 tree vectype, tree memory_type, tree offset_type,
3865 int scale, internal_fn *ifn_out,
3866 tree *offset_vectype_out)
3868 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3869 unsigned int element_bits = vector_element_bits (vectype);
3870 if (element_bits != memory_bits)
3871 /* For now the vector elements must be the same width as the
3872 memory elements. */
3873 return false;
3875 /* Work out which function we need. */
3876 internal_fn ifn, alt_ifn, alt_ifn2;
3877 if (read_p)
3879 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3880 alt_ifn = IFN_MASK_GATHER_LOAD;
3881 /* When the target supports LEN_MASK_GATHER_LOAD, we always
3882 use LEN_MASK_GATHER_LOAD regardless of whether the len and
3883 mask operands are valid. */
3884 alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
3886 else
3888 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3889 alt_ifn = IFN_MASK_SCATTER_STORE;
3890 /* When the target supports LEN_MASK_SCATTER_STORE, we always
3891 use LEN_MASK_SCATTER_STORE regardless of whether the len and
3892 mask operands are valid. */
3893 alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
3896 for (;;)
3898 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3899 if (!offset_vectype)
3900 return false;
3902 /* Test whether the target supports this combination. */
3903 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3904 offset_vectype, scale))
3906 *ifn_out = ifn;
3907 *offset_vectype_out = offset_vectype;
3908 return true;
3910 else if (!masked_p
3911 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3912 memory_type,
3913 offset_vectype,
3914 scale))
3916 *ifn_out = alt_ifn;
3917 *offset_vectype_out = offset_vectype;
3918 return true;
3920 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
3921 memory_type,
3922 offset_vectype, scale))
3924 *ifn_out = alt_ifn2;
3925 *offset_vectype_out = offset_vectype;
3926 return true;
3929 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3930 && TYPE_PRECISION (offset_type) >= element_bits)
3931 return false;
3933 offset_type = build_nonstandard_integer_type
3934 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
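/* Illustrative example of the retry loop above: starting from a 16-bit
   offset type that the target cannot handle, the code widens the offset
   type to 32 and then 64 bits of the same signedness, giving up once the
   precision reaches both POINTER_SIZE and the vector element width
   without finding a supported combination.  */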
3938 /* STMT_INFO is a call to an internal gather load or scatter store function.
3939 Describe the operation in INFO. */
3941 static void
3942 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3943 gather_scatter_info *info)
3945 gcall *call = as_a <gcall *> (stmt_info->stmt);
3946 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3947 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3949 info->ifn = gimple_call_internal_fn (call);
3950 info->decl = NULL_TREE;
3951 info->base = gimple_call_arg (call, 0);
3952 info->offset = gimple_call_arg (call, 1);
3953 info->offset_dt = vect_unknown_def_type;
3954 info->offset_vectype = NULL_TREE;
3955 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3956 info->element_type = TREE_TYPE (vectype);
3957 info->memory_type = TREE_TYPE (DR_REF (dr));
3960 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3961 gather load or scatter store. Describe the operation in *INFO if so. */
3963 bool
3964 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3965 gather_scatter_info *info)
3967 HOST_WIDE_INT scale = 1;
3968 poly_int64 pbitpos, pbitsize;
3969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3970 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3971 tree offtype = NULL_TREE;
3972 tree decl = NULL_TREE, base, off;
3973 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3974 tree memory_type = TREE_TYPE (DR_REF (dr));
3975 machine_mode pmode;
3976 int punsignedp, reversep, pvolatilep = 0;
3977 internal_fn ifn;
3978 tree offset_vectype;
3979 bool masked_p = false;
3981 /* See whether this is already a call to a gather/scatter internal function.
3982 If not, see whether it's a masked load or store. */
3983 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3984 if (call && gimple_call_internal_p (call))
3986 ifn = gimple_call_internal_fn (call);
3987 if (internal_gather_scatter_fn_p (ifn))
3989 vect_describe_gather_scatter_call (stmt_info, info);
3990 return true;
3992 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3995 /* True if we should aim to use internal functions rather than
3996 built-in functions. */
3997 bool use_ifn_p = (DR_IS_READ (dr)
3998 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3999 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4001 base = DR_REF (dr);
4002 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4003 see if we can use the def stmt of the address. */
4004 if (masked_p
4005 && TREE_CODE (base) == MEM_REF
4006 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4007 && integer_zerop (TREE_OPERAND (base, 1))
4008 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4010 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4011 if (is_gimple_assign (def_stmt)
4012 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4013 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4016 /* The gather and scatter builtins need address of the form
4017 loop_invariant + vector * {1, 2, 4, 8} or
4019 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4020 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4021 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4022 multiplications and additions in it. To get a vector, we need
4023 a single SSA_NAME that will be defined in the loop and will
4024 contain everything that is not loop invariant and that can be
4025 vectorized. The following code attempts to find such a preexisting
4026 SSA_NAME OFF and put the loop invariants into a tree BASE
4027 that can be gimplified before the loop. */
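/* Illustrative example (hypothetical code, not from the source): for a
   gather-style read of a[ind[i]] with 4-byte elements, the decomposition
   below would typically end up with BASE = &a, OFF = the SSA_NAME
   holding (a possibly sign-extended) ind[i] and SCALE = 4, matching the
   form described above.  */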
4028 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4029 &punsignedp, &reversep, &pvolatilep);
4030 if (reversep)
4031 return false;
4033 /* PR 107346. Packed structs can have fields at offsets that are not
4034 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4035 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4036 return false;
4038 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4040 if (TREE_CODE (base) == MEM_REF)
4042 if (!integer_zerop (TREE_OPERAND (base, 1)))
4044 if (off == NULL_TREE)
4045 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4046 else
4047 off = size_binop (PLUS_EXPR, off,
4048 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4050 base = TREE_OPERAND (base, 0);
4052 else
4053 base = build_fold_addr_expr (base);
4055 if (off == NULL_TREE)
4056 off = size_zero_node;
4058 /* If base is not loop invariant, then either off is 0, in which case
4059 we start with just the constant offset in the loop invariant BASE
4060 and continue with base as OFF, or we give up.
4061 We could handle the latter case by gimplifying the addition of
4062 base + off into some SSA_NAME and using that as off, but for now punt. */
4063 if (!expr_invariant_in_loop_p (loop, base))
4065 if (!integer_zerop (off))
4066 return false;
4067 off = base;
4068 base = size_int (pbytepos);
4070 /* Otherwise put base + constant offset into the loop invariant BASE
4071 and continue with OFF. */
4072 else
4074 base = fold_convert (sizetype, base);
4075 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4078 /* OFF at this point may be either a SSA_NAME or some tree expression
4079 from get_inner_reference. Try to peel off loop invariants from it
4080 into BASE as long as possible. */
4081 STRIP_NOPS (off);
4082 while (offtype == NULL_TREE)
4084 enum tree_code code;
4085 tree op0, op1, add = NULL_TREE;
4087 if (TREE_CODE (off) == SSA_NAME)
4089 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4091 if (expr_invariant_in_loop_p (loop, off))
4092 return false;
4094 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4095 break;
4097 op0 = gimple_assign_rhs1 (def_stmt);
4098 code = gimple_assign_rhs_code (def_stmt);
4099 op1 = gimple_assign_rhs2 (def_stmt);
4101 else
4103 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4104 return false;
4105 code = TREE_CODE (off);
4106 extract_ops_from_tree (off, &code, &op0, &op1);
4108 switch (code)
4110 case POINTER_PLUS_EXPR:
4111 case PLUS_EXPR:
4112 if (expr_invariant_in_loop_p (loop, op0))
4114 add = op0;
4115 off = op1;
4116 do_add:
4117 add = fold_convert (sizetype, add);
4118 if (scale != 1)
4119 add = size_binop (MULT_EXPR, add, size_int (scale));
4120 base = size_binop (PLUS_EXPR, base, add);
4121 continue;
4123 if (expr_invariant_in_loop_p (loop, op1))
4125 add = op1;
4126 off = op0;
4127 goto do_add;
4129 break;
4130 case MINUS_EXPR:
4131 if (expr_invariant_in_loop_p (loop, op1))
4133 add = fold_convert (sizetype, op1);
4134 add = size_binop (MINUS_EXPR, size_zero_node, add);
4135 off = op0;
4136 goto do_add;
4138 break;
4139 case MULT_EXPR:
4140 if (scale == 1 && tree_fits_shwi_p (op1))
4142 int new_scale = tree_to_shwi (op1);
4143 /* Only treat this as a scaling operation if the target
4144 supports it for at least some offset type. */
4145 if (use_ifn_p
4146 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4147 masked_p, vectype, memory_type,
4148 signed_char_type_node,
4149 new_scale, &ifn,
4150 &offset_vectype)
4151 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4152 masked_p, vectype, memory_type,
4153 unsigned_char_type_node,
4154 new_scale, &ifn,
4155 &offset_vectype))
4156 break;
4157 scale = new_scale;
4158 off = op0;
4159 continue;
4161 break;
4162 case SSA_NAME:
4163 off = op0;
4164 continue;
4165 CASE_CONVERT:
4166 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4167 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4168 break;
4170 /* Don't include the conversion if the target is happy with
4171 the current offset type. */
4172 if (use_ifn_p
4173 && TREE_CODE (off) == SSA_NAME
4174 && !POINTER_TYPE_P (TREE_TYPE (off))
4175 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4176 masked_p, vectype, memory_type,
4177 TREE_TYPE (off), scale, &ifn,
4178 &offset_vectype))
4179 break;
4181 if (TYPE_PRECISION (TREE_TYPE (op0))
4182 == TYPE_PRECISION (TREE_TYPE (off)))
4184 off = op0;
4185 continue;
4188 /* Include the conversion if it is widening and either we're using
4189 the IFN path, the target can handle the converted-from offset,
4190 or the current offset size is not already the same as the
4191 data vector element size. */
4192 if ((TYPE_PRECISION (TREE_TYPE (op0))
4193 < TYPE_PRECISION (TREE_TYPE (off)))
4194 && (use_ifn_p
4195 || (DR_IS_READ (dr)
4196 ? (targetm.vectorize.builtin_gather
4197 && targetm.vectorize.builtin_gather (vectype,
4198 TREE_TYPE (op0),
4199 scale))
4200 : (targetm.vectorize.builtin_scatter
4201 && targetm.vectorize.builtin_scatter (vectype,
4202 TREE_TYPE (op0),
4203 scale)))
4204 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4205 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4207 off = op0;
4208 offtype = TREE_TYPE (off);
4209 STRIP_NOPS (off);
4210 continue;
4212 break;
4213 default:
4214 break;
4216 break;
4219 /* If at the end OFF still isn't a SSA_NAME or isn't
4220 defined in the loop, punt. */
4221 if (TREE_CODE (off) != SSA_NAME
4222 || expr_invariant_in_loop_p (loop, off))
4223 return false;
4225 if (offtype == NULL_TREE)
4226 offtype = TREE_TYPE (off);
4228 if (use_ifn_p)
4230 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4231 vectype, memory_type, offtype, scale,
4232 &ifn, &offset_vectype))
4233 ifn = IFN_LAST;
4234 decl = NULL_TREE;
4236 else
4238 if (DR_IS_READ (dr))
4240 if (targetm.vectorize.builtin_gather)
4241 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4243 else
4245 if (targetm.vectorize.builtin_scatter)
4246 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4248 ifn = IFN_LAST;
4249 /* The offset vector type will be read from DECL when needed. */
4250 offset_vectype = NULL_TREE;
4253 info->ifn = ifn;
4254 info->decl = decl;
4255 info->base = base;
4256 info->offset = off;
4257 info->offset_dt = vect_unknown_def_type;
4258 info->offset_vectype = offset_vectype;
4259 info->scale = scale;
4260 info->element_type = TREE_TYPE (vectype);
4261 info->memory_type = memory_type;
4262 return true;
4265 /* Find the data references in STMT, analyze them with respect to LOOP and
4266 append them to DATAREFS. Return false if datarefs in this stmt cannot
4267 be handled. */
4269 opt_result
4270 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4271 vec<data_reference_p> *datarefs,
4272 vec<int> *dataref_groups, int group_id)
4274 /* We can ignore clobbers for dataref analysis - they are removed during
4275 loop vectorization and BB vectorization checks dependences with a
4276 stmt walk. */
4277 if (gimple_clobber_p (stmt))
4278 return opt_result::success ();
4280 if (gimple_has_volatile_ops (stmt))
4281 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4282 stmt);
4284 if (stmt_can_throw_internal (cfun, stmt))
4285 return opt_result::failure_at (stmt,
4286 "not vectorized:"
4287 " statement can throw an exception: %G",
4288 stmt);
4290 auto_vec<data_reference_p, 2> refs;
4291 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4292 if (!res)
4293 return res;
4295 if (refs.is_empty ())
4296 return opt_result::success ();
4298 if (refs.length () > 1)
4300 while (!refs.is_empty ())
4301 free_data_ref (refs.pop ());
4302 return opt_result::failure_at (stmt,
4303 "not vectorized: more than one "
4304 "data ref in stmt: %G", stmt);
4307 data_reference_p dr = refs.pop ();
4308 if (gcall *call = dyn_cast <gcall *> (stmt))
4309 if (!gimple_call_internal_p (call)
4310 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4311 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4313 free_data_ref (dr);
4314 return opt_result::failure_at (stmt,
4315 "not vectorized: dr in a call %G", stmt);
4318 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4319 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4321 free_data_ref (dr);
4322 return opt_result::failure_at (stmt,
4323 "not vectorized:"
4324 " statement is an unsupported"
4325 " bitfield access %G", stmt);
4328 if (DR_BASE_ADDRESS (dr)
4329 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4331 free_data_ref (dr);
4332 return opt_result::failure_at (stmt,
4333 "not vectorized:"
4334 " base addr of dr is a constant\n");
4337 /* Check whether this may be a SIMD lane access and adjust the
4338 DR to make it easier for us to handle it. */
4339 if (loop
4340 && loop->simduid
4341 && (!DR_BASE_ADDRESS (dr)
4342 || !DR_OFFSET (dr)
4343 || !DR_INIT (dr)
4344 || !DR_STEP (dr)))
4346 struct data_reference *newdr
4347 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4348 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4349 if (DR_BASE_ADDRESS (newdr)
4350 && DR_OFFSET (newdr)
4351 && DR_INIT (newdr)
4352 && DR_STEP (newdr)
4353 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4354 && integer_zerop (DR_STEP (newdr)))
4356 tree base_address = DR_BASE_ADDRESS (newdr);
4357 tree off = DR_OFFSET (newdr);
4358 tree step = ssize_int (1);
4359 if (integer_zerop (off)
4360 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4362 off = TREE_OPERAND (base_address, 1);
4363 base_address = TREE_OPERAND (base_address, 0);
4365 STRIP_NOPS (off);
4366 if (TREE_CODE (off) == MULT_EXPR
4367 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4369 step = TREE_OPERAND (off, 1);
4370 off = TREE_OPERAND (off, 0);
4371 STRIP_NOPS (off);
4373 if (CONVERT_EXPR_P (off)
4374 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4375 < TYPE_PRECISION (TREE_TYPE (off))))
4376 off = TREE_OPERAND (off, 0);
4377 if (TREE_CODE (off) == SSA_NAME)
4379 gimple *def = SSA_NAME_DEF_STMT (off);
4380 /* Look through widening conversion. */
4381 if (is_gimple_assign (def)
4382 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4384 tree rhs1 = gimple_assign_rhs1 (def);
4385 if (TREE_CODE (rhs1) == SSA_NAME
4386 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4387 && (TYPE_PRECISION (TREE_TYPE (off))
4388 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4389 def = SSA_NAME_DEF_STMT (rhs1);
4391 if (is_gimple_call (def)
4392 && gimple_call_internal_p (def)
4393 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4395 tree arg = gimple_call_arg (def, 0);
4396 tree reft = TREE_TYPE (DR_REF (newdr));
4397 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4398 arg = SSA_NAME_VAR (arg);
4399 if (arg == loop->simduid
4400 /* For now. */
4401 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4403 DR_BASE_ADDRESS (newdr) = base_address;
4404 DR_OFFSET (newdr) = ssize_int (0);
4405 DR_STEP (newdr) = step;
4406 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4407 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4408 /* Mark as simd-lane access. */
4409 tree arg2 = gimple_call_arg (def, 1);
4410 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4411 free_data_ref (dr);
4412 datarefs->safe_push (newdr);
4413 if (dataref_groups)
4414 dataref_groups->safe_push (group_id);
4415 return opt_result::success ();
4420 free_data_ref (newdr);
4423 datarefs->safe_push (dr);
4424 if (dataref_groups)
4425 dataref_groups->safe_push (group_id);
4426 return opt_result::success ();
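/* Illustrative example of the SIMD lane handling above: for a store to
   foo[_1] where _1 is (a widening conversion of) the result of a
   .GOMP_SIMD_LANE call whose first argument is the loop's simduid and
   whose second argument is N, and where foo's element size matches the
   extracted step, the data reference is rewritten to a zero offset and
   a step equal to the element size, and its aux field is set to -1 - N
   to mark it as a SIMD lane access.  */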
4429 /* Function vect_analyze_data_refs.
4431 Find all the data references in the loop or basic block.
4433 The general structure of the analysis of data refs in the vectorizer is as
4434 follows:
4435 1- vect_analyze_data_refs(loop/bb): call
4436 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4437 in the loop/bb and their dependences.
4438 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4439 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4440 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4444 opt_result
4445 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4447 class loop *loop = NULL;
4448 unsigned int i;
4449 struct data_reference *dr;
4450 tree scalar_type;
4452 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4454 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4455 loop = LOOP_VINFO_LOOP (loop_vinfo);
4457 /* Go through the data-refs, check that the analysis succeeded. Update
4458 pointer from stmt_vec_info struct to DR and vectype. */
4460 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4461 FOR_EACH_VEC_ELT (datarefs, i, dr)
4463 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4464 poly_uint64 vf;
4466 gcc_assert (DR_REF (dr));
4467 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4468 gcc_assert (!stmt_info->dr_aux.dr);
4469 stmt_info->dr_aux.dr = dr;
4470 stmt_info->dr_aux.stmt = stmt_info;
4472 /* Check that analysis of the data-ref succeeded. */
4473 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4474 || !DR_STEP (dr))
4476 bool maybe_gather
4477 = DR_IS_READ (dr)
4478 && !TREE_THIS_VOLATILE (DR_REF (dr));
4479 bool maybe_scatter
4480 = DR_IS_WRITE (dr)
4481 && !TREE_THIS_VOLATILE (DR_REF (dr));
4483 /* If the target supports vector gather loads or scatter stores,
4484 see if they can be used. */
4485 if (is_a <loop_vec_info> (vinfo)
4486 && !nested_in_vect_loop_p (loop, stmt_info))
4488 if (maybe_gather || maybe_scatter)
4490 if (maybe_gather)
4491 gatherscatter = GATHER;
4492 else
4493 gatherscatter = SCATTER;
4497 if (gatherscatter == SG_NONE)
4499 if (dump_enabled_p ())
4500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4501 "not vectorized: data ref analysis "
4502 "failed %G", stmt_info->stmt);
4503 if (is_a <bb_vec_info> (vinfo))
4505 /* In BB vectorization the ref can still participate
4506 in dependence analysis, we just can't vectorize it. */
4507 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4508 continue;
4510 return opt_result::failure_at (stmt_info->stmt,
4511 "not vectorized:"
4512 " data ref analysis failed: %G",
4513 stmt_info->stmt);
4517 /* See if this was detected as SIMD lane access. */
4518 if (dr->aux == (void *)-1
4519 || dr->aux == (void *)-2
4520 || dr->aux == (void *)-3
4521 || dr->aux == (void *)-4)
4523 if (nested_in_vect_loop_p (loop, stmt_info))
4524 return opt_result::failure_at (stmt_info->stmt,
4525 "not vectorized:"
4526 " data ref analysis failed: %G",
4527 stmt_info->stmt);
4528 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4529 = -(uintptr_t) dr->aux;
4532 tree base = get_base_address (DR_REF (dr));
4533 if (base && VAR_P (base) && DECL_NONALIASED (base))
4535 if (dump_enabled_p ())
4536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4537 "not vectorized: base object not addressable "
4538 "for stmt: %G", stmt_info->stmt);
4539 if (is_a <bb_vec_info> (vinfo))
4541 /* In BB vectorization the ref can still participate
4542 in dependence analysis, we just can't vectorize it. */
4543 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4544 continue;
4546 return opt_result::failure_at (stmt_info->stmt,
4547 "not vectorized: base object not"
4548 " addressable for stmt: %G",
4549 stmt_info->stmt);
4552 if (is_a <loop_vec_info> (vinfo)
4553 && DR_STEP (dr)
4554 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4556 if (nested_in_vect_loop_p (loop, stmt_info))
4557 return opt_result::failure_at (stmt_info->stmt,
4558 "not vectorized: "
4559 "not suitable for strided load %G",
4560 stmt_info->stmt);
4561 STMT_VINFO_STRIDED_P (stmt_info) = true;
4564 /* Update DR field in stmt_vec_info struct. */
4566 /* If the dataref is in an inner-loop of the loop that is considered for
4567 vectorization, we also want to analyze the access relative to
4568 the outer-loop (DR contains information only relative to the
4569 inner-most enclosing loop). We do that by building a reference to the
4570 first location accessed by the inner-loop, and analyze it relative to
4571 the outer-loop. */
4572 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4574 /* Build a reference to the first location accessed by the
4575 inner loop: *(BASE + INIT + OFFSET). By construction,
4576 this address must be invariant in the inner loop, so we
4577 can consider it as being used in the outer loop. */
4578 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4579 tree offset = unshare_expr (DR_OFFSET (dr));
4580 tree init = unshare_expr (DR_INIT (dr));
4581 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4582 init, offset);
4583 tree init_addr = fold_build_pointer_plus (base, init_offset);
4584 tree init_ref = build_fold_indirect_ref (init_addr);
4586 if (dump_enabled_p ())
4587 dump_printf_loc (MSG_NOTE, vect_location,
4588 "analyze in outer loop: %T\n", init_ref);
4590 opt_result res
4591 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4592 init_ref, loop, stmt_info->stmt);
4593 if (!res)
4594 /* dr_analyze_innermost already explained the failure. */
4595 return res;
4597 if (dump_enabled_p ())
4598 dump_printf_loc (MSG_NOTE, vect_location,
4599 "\touter base_address: %T\n"
4600 "\touter offset from base address: %T\n"
4601 "\touter constant offset from base address: %T\n"
4602 "\touter step: %T\n"
4603 "\touter base alignment: %d\n\n"
4604 "\touter base misalignment: %d\n"
4605 "\touter offset alignment: %d\n"
4606 "\touter step alignment: %d\n",
4607 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4608 STMT_VINFO_DR_OFFSET (stmt_info),
4609 STMT_VINFO_DR_INIT (stmt_info),
4610 STMT_VINFO_DR_STEP (stmt_info),
4611 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4612 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4613 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4614 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4617 /* Set vectype for STMT. */
4618 scalar_type = TREE_TYPE (DR_REF (dr));
4619 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4620 if (!vectype)
4622 if (dump_enabled_p ())
4624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4625 "not vectorized: no vectype for stmt: %G",
4626 stmt_info->stmt);
4627 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4628 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4629 scalar_type);
4630 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4633 if (is_a <bb_vec_info> (vinfo))
4635 /* No vector type is fine, the ref can still participate
4636 in dependence analysis, we just can't vectorize it. */
4637 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4638 continue;
4640 if (fatal)
4641 *fatal = false;
4642 return opt_result::failure_at (stmt_info->stmt,
4643 "not vectorized:"
4644 " no vectype for stmt: %G"
4645 " scalar_type: %T\n",
4646 stmt_info->stmt, scalar_type);
4648 else
4650 if (dump_enabled_p ())
4651 dump_printf_loc (MSG_NOTE, vect_location,
4652 "got vectype for stmt: %G%T\n",
4653 stmt_info->stmt, vectype);
4656 /* Adjust the minimal vectorization factor according to the
4657 vector type. */
4658 vf = TYPE_VECTOR_SUBPARTS (vectype);
4659 *min_vf = upper_bound (*min_vf, vf);
4661 /* Leave the BB vectorizer to pick the vector type later, based on
4662 the final dataref group size and SLP node size. */
4663 if (is_a <loop_vec_info> (vinfo))
4664 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4666 if (gatherscatter != SG_NONE)
4668 gather_scatter_info gs_info;
4669 if (!vect_check_gather_scatter (stmt_info,
4670 as_a <loop_vec_info> (vinfo),
4671 &gs_info)
4672 || !get_vectype_for_scalar_type (vinfo,
4673 TREE_TYPE (gs_info.offset)))
4675 if (fatal)
4676 *fatal = false;
4677 return opt_result::failure_at
4678 (stmt_info->stmt,
4679 (gatherscatter == GATHER)
4680 ? "not vectorized: not suitable for gather load %G"
4681 : "not vectorized: not suitable for scatter store %G",
4682 stmt_info->stmt);
4684 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4688 /* We used to stop processing and prune the list here. Verify we no
4689 longer need to. */
4690 gcc_assert (i == datarefs.length ());
4692 return opt_result::success ();
4696 /* Function vect_get_new_vect_var.
4698 Returns a name for a new variable. The current naming scheme appends the
4699 prefix "vect_" or "vectp_" (depending on the value of VAR_KIND) to
4700 the name of vectorizer generated variables, and appends that to NAME if
4701 provided. */
4703 tree
4704 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4706 const char *prefix;
4707 tree new_vect_var;
4709 switch (var_kind)
4711 case vect_simple_var:
4712 prefix = "vect";
4713 break;
4714 case vect_scalar_var:
4715 prefix = "stmp";
4716 break;
4717 case vect_mask_var:
4718 prefix = "mask";
4719 break;
4720 case vect_pointer_var:
4721 prefix = "vectp";
4722 break;
4723 default:
4724 gcc_unreachable ();
4727 if (name)
4729 char* tmp = concat (prefix, "_", name, NULL);
4730 new_vect_var = create_tmp_reg (type, tmp);
4731 free (tmp);
4733 else
4734 new_vect_var = create_tmp_reg (type, prefix);
4736 return new_vect_var;
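/* Illustrative example: vect_get_new_vect_var (ptr_type, vect_pointer_var,
   "a") creates a temporary register whose name is based on "vectp_a",
   while passing a NULL name uses just the "vectp" prefix.  */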
4739 /* Like vect_get_new_vect_var but return an SSA name. */
4741 tree
4742 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4744 const char *prefix;
4745 tree new_vect_var;
4747 switch (var_kind)
4749 case vect_simple_var:
4750 prefix = "vect";
4751 break;
4752 case vect_scalar_var:
4753 prefix = "stmp";
4754 break;
4755 case vect_pointer_var:
4756 prefix = "vectp";
4757 break;
4758 default:
4759 gcc_unreachable ();
4762 if (name)
4764 char* tmp = concat (prefix, "_", name, NULL);
4765 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4766 free (tmp);
4768 else
4769 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4771 return new_vect_var;
4774 /* Duplicate points-to info on NAME from DR_INFO. */
4776 static void
4777 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4779 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4780 /* DR_PTR_INFO is for a base SSA name, not including constant or
4781 variable offsets in the ref so its alignment info does not apply. */
4782 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4785 /* Function vect_create_addr_base_for_vector_ref.
4787 Create an expression that computes the address of the first memory location
4788 that will be accessed for a data reference.
4790 Input:
4791 STMT_INFO: The statement containing the data reference.
4792 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4793 OFFSET: Optional. If supplied, it is added to the initial address.
4794 LOOP: Specify relative to which loop-nest should the address be computed.
4795 For example, when the dataref is in an inner-loop nested in an
4796 outer-loop that is now being vectorized, LOOP can be either the
4797 outer-loop, or the inner-loop. The first memory location accessed
4798 by the following dataref ('in' points to short):
4800 for (i=0; i<N; i++)
4801 for (j=0; j<M; j++)
4802 s += in[i+j]
4804 is as follows:
4805 if LOOP=i_loop: &in (relative to i_loop)
4806 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4808 Output:
4809 1. Return an SSA_NAME whose value is the address of the memory location of
4810 the first vector of the data reference.
4811 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4812 these statement(s) which define the returned SSA_NAME.
4814 FORNOW: We are only handling array accesses with step 1. */
4816 tree
4817 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4818 gimple_seq *new_stmt_list,
4819 tree offset)
4821 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4822 struct data_reference *dr = dr_info->dr;
4823 const char *base_name;
4824 tree addr_base;
4825 tree dest;
4826 gimple_seq seq = NULL;
4827 tree vect_ptr_type;
4828 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4829 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4831 tree data_ref_base = unshare_expr (drb->base_address);
4832 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4833 tree init = unshare_expr (drb->init);
4835 if (loop_vinfo)
4836 base_name = get_name (data_ref_base);
4837 else
4839 base_offset = ssize_int (0);
4840 init = ssize_int (0);
4841 base_name = get_name (DR_REF (dr));
4844 /* Create base_offset */
4845 base_offset = size_binop (PLUS_EXPR,
4846 fold_convert (sizetype, base_offset),
4847 fold_convert (sizetype, init));
4849 if (offset)
4851 offset = fold_convert (sizetype, offset);
4852 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4853 base_offset, offset);
4856 /* base + base_offset */
4857 if (loop_vinfo)
4858 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4859 else
4860 addr_base = build1 (ADDR_EXPR,
4861 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4862 /* Strip zero offset components since we don't need
4863 them and they can confuse late diagnostics if
4864 we CSE them wrongly. See PR106904 for example. */
4865 unshare_expr (strip_zero_offset_components
4866 (DR_REF (dr))));
4868 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4869 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4870 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4871 gimple_seq_add_seq (new_stmt_list, seq);
4873 if (DR_PTR_INFO (dr)
4874 && TREE_CODE (addr_base) == SSA_NAME
4875 /* We should only duplicate pointer info to newly created SSA names. */
4876 && SSA_NAME_VAR (addr_base) == dest)
4878 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4879 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4882 if (dump_enabled_p ())
4883 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4885 return addr_base;
4889 /* Function vect_create_data_ref_ptr.
4891 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4892 location accessed in the loop by STMT_INFO, along with the def-use update
4893 chain to appropriately advance the pointer through the loop iterations.
4894 Also set aliasing information for the pointer. This pointer is used by
4895 the callers to this function to create a memory reference expression for
4896 vector load/store access.
4898 Input:
4899 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4900 GIMPLE_ASSIGN <name, data-ref> or
4901 GIMPLE_ASSIGN <data-ref, name>.
4902 2. AGGR_TYPE: the type of the reference, which should be either a vector
4903 or an array.
4904 3. AT_LOOP: the loop where the vector memref is to be created.
4905 4. OFFSET (optional): a byte offset to be added to the initial address
4906 accessed by the data-ref in STMT_INFO.
4907 5. BSI: location where the new stmts are to be placed if there is no loop
4908 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4909 pointing to the initial address.
4910 7. IV_STEP (optional, defaults to NULL): the amount that should be added
4911 to the IV during each iteration of the loop. NULL says to move
4912 by one copy of AGGR_TYPE up or down, depending on the step of the
4913 data reference.
4915 Output:
4916 1. Declare a new ptr to vector_type, and have it point to the base of the
4917 data reference (the initial address accessed by the data reference).
4918 For example, for vector of type V8HI, the following code is generated:
4920 v8hi *ap;
4921 ap = (v8hi *)initial_address;
4923 if OFFSET is not supplied:
4924 initial_address = &a[init];
4925 if OFFSET is supplied:
4926 initial_address = &a[init] + OFFSET;
4930 Return the initial_address in INITIAL_ADDRESS.
4932 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4933 update the pointer in each iteration of the loop.
4935 Return the increment stmt that updates the pointer in PTR_INCR.
4937 3. Return the pointer. */
4939 tree
4940 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4941 tree aggr_type, class loop *at_loop, tree offset,
4942 tree *initial_address, gimple_stmt_iterator *gsi,
4943 gimple **ptr_incr, bool only_init,
4944 tree iv_step)
4946 const char *base_name;
4947 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4948 class loop *loop = NULL;
4949 bool nested_in_vect_loop = false;
4950 class loop *containing_loop = NULL;
4951 tree aggr_ptr_type;
4952 tree aggr_ptr;
4953 tree new_temp;
4954 gimple_seq new_stmt_list = NULL;
4955 edge pe = NULL;
4956 basic_block new_bb;
4957 tree aggr_ptr_init;
4958 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4959 struct data_reference *dr = dr_info->dr;
4960 tree aptr;
4961 gimple_stmt_iterator incr_gsi;
4962 bool insert_after;
4963 tree indx_before_incr, indx_after_incr;
4964 gimple *incr;
4965 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4967 gcc_assert (iv_step != NULL_TREE
4968 || TREE_CODE (aggr_type) == ARRAY_TYPE
4969 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4971 if (loop_vinfo)
4973 loop = LOOP_VINFO_LOOP (loop_vinfo);
4974 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4975 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4976 pe = loop_preheader_edge (loop);
4978 else
4980 gcc_assert (bb_vinfo);
4981 only_init = true;
4982 *ptr_incr = NULL;
4985 /* Create an expression for the first address accessed by this load
4986 in LOOP. */
4987 base_name = get_name (DR_BASE_ADDRESS (dr));
4989 if (dump_enabled_p ())
4991 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4992 dump_printf_loc (MSG_NOTE, vect_location,
4993 "create %s-pointer variable to type: %T",
4994 get_tree_code_name (TREE_CODE (aggr_type)),
4995 aggr_type);
4996 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4997 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4998 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4999 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5000 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5001 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5002 else
5003 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5004 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5007 /* (1) Create the new aggregate-pointer variable.
5008 Vector and array types inherit the alias set of their component
5009 type by default so we need to use a ref-all pointer if the data
5010 reference does not conflict with the created aggregated data
5011 reference because it is not addressable. */
5012 bool need_ref_all = false;
5013 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5014 get_alias_set (DR_REF (dr))))
5015 need_ref_all = true;
5016 /* Likewise for any of the data references in the stmt group. */
5017 else if (DR_GROUP_SIZE (stmt_info) > 1)
5019 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5022 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5023 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5024 get_alias_set (DR_REF (sdr))))
5026 need_ref_all = true;
5027 break;
5029 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5031 while (sinfo);
5033 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5034 need_ref_all);
5035 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5038 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5039 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5040 def-use update cycles for the pointer: one relative to the outer-loop
5041 (LOOP), which is what steps (3) and (4) below do. The other is relative
5042 to the inner-loop (which is the inner-most loop containing the dataref),
5043 and this is done by step (5) below.
5045 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5046 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5047 redundant. Steps (3),(4) create the following:
5049 vp0 = &base_addr;
5050 LOOP: vp1 = phi(vp0,vp2)
5053 vp2 = vp1 + step
5054 goto LOOP
5056 If there is an inner-loop nested in loop, then step (5) will also be
5057 applied, and an additional update in the inner-loop will be created:
5059 vp0 = &base_addr;
5060 LOOP: vp1 = phi(vp0,vp2)
5062 inner: vp3 = phi(vp1,vp4)
5063 vp4 = vp3 + inner_step
5064 if () goto inner
5066 vp2 = vp1 + step
5067 if () goto LOOP */
5069 /* (2) Calculate the initial address of the aggregate-pointer, and set
5070 the aggregate-pointer to point to it before the loop. */
5072 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5074 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5075 stmt_info, &new_stmt_list,
5076 offset);
5077 if (new_stmt_list)
5079 if (pe)
5081 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5082 gcc_assert (!new_bb);
5084 else
5085 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5088 *initial_address = new_temp;
5089 aggr_ptr_init = new_temp;
5091 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5092 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5093 inner-loop nested in LOOP (during outer-loop vectorization). */
5095 /* No update in loop is required. */
5096 if (only_init && (!loop_vinfo || at_loop == loop))
5097 aptr = aggr_ptr_init;
5098 else
5100 /* Accesses to invariant addresses should be handled specially
5101 by the caller. */
5102 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5103 gcc_assert (!integer_zerop (step));
5105 if (iv_step == NULL_TREE)
5107 /* The step of the aggregate pointer is the type size,
5108 negated for downward accesses. */
5109 iv_step = TYPE_SIZE_UNIT (aggr_type);
5110 if (tree_int_cst_sgn (step) == -1)
5111 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5114 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5116 create_iv (aggr_ptr_init, PLUS_EXPR,
5117 fold_convert (aggr_ptr_type, iv_step),
5118 aggr_ptr, loop, &incr_gsi, insert_after,
5119 &indx_before_incr, &indx_after_incr);
5120 incr = gsi_stmt (incr_gsi);
5122 /* Copy the points-to information if it exists. */
5123 if (DR_PTR_INFO (dr))
5125 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5126 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5128 if (ptr_incr)
5129 *ptr_incr = incr;
5131 aptr = indx_before_incr;
5134 if (!nested_in_vect_loop || only_init)
5135 return aptr;
5138 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5139 nested in LOOP, if exists. */
5141 gcc_assert (nested_in_vect_loop);
5142 if (!only_init)
5144 standard_iv_increment_position (containing_loop, &incr_gsi,
5145 &insert_after);
5146 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5147 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5148 &indx_before_incr, &indx_after_incr);
5149 incr = gsi_stmt (incr_gsi);
5151 /* Copy the points-to information if it exists. */
5152 if (DR_PTR_INFO (dr))
5154 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5155 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5157 if (ptr_incr)
5158 *ptr_incr = incr;
5160 return indx_before_incr;
5162 else
5163 gcc_unreachable ();
5167 /* Function bump_vector_ptr
5169 Increment a pointer (to a vector type) by vector-size. If requested,
5170 i.e. if PTR_INCR is given, then also connect the new increment stmt
5171 to the existing def-use update-chain of the pointer, by modifying
5172 the PTR_INCR as illustrated below:
5174 The pointer def-use update-chain before this function:
5175 DATAREF_PTR = phi (p_0, p_2)
5176 ....
5177 PTR_INCR: p_2 = DATAREF_PTR + step
5179 The pointer def-use update-chain after this function:
5180 DATAREF_PTR = phi (p_0, p_2)
5181 ....
5182 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5183 ....
5184 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5186 Input:
5187 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5188 in the loop.
5189 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5190 the loop. The increment amount across iterations is expected
5191 to be vector_size.
5192 BSI - location where the new update stmt is to be placed.
5193 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5194 BUMP - optional. The offset by which to bump the pointer. If not given,
5195 the offset is assumed to be vector_size.
5197 Output: Return NEW_DATAREF_PTR as illustrated above.
5201 tree
5202 bump_vector_ptr (vec_info *vinfo,
5203 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5204 stmt_vec_info stmt_info, tree bump)
5206 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5207 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5208 tree update = TYPE_SIZE_UNIT (vectype);
5209 gimple *incr_stmt;
5210 ssa_op_iter iter;
5211 use_operand_p use_p;
5212 tree new_dataref_ptr;
5214 if (bump)
5215 update = bump;
5217 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5218 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5219 else if (is_gimple_min_invariant (dataref_ptr))
5220 /* When possible avoid emitting a separate increment stmt that would
5221 force the addressed object to become addressable. */
5222 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5223 fold_build2 (MEM_REF,
5224 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5225 dataref_ptr,
5226 fold_convert (ptr_type_node, update)));
5227 else
5228 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5229 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5230 dataref_ptr, update);
5231 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5232 /* Fold the increment, avoiding excessive use-def chains that would
5233 otherwise cause compile-time issues for passes run before the next
5234 forwprop pass, which would do this folding as well. */
5235 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5236 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5238 incr_stmt = gsi_stmt (fold_gsi);
5239 update_stmt (incr_stmt);
5242 /* Copy the points-to information if it exists. */
5243 if (DR_PTR_INFO (dr))
5245 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5246 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5249 if (!ptr_incr)
5250 return new_dataref_ptr;
5252 /* Update the vector-pointer's cross-iteration increment. */
5253 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5255 tree use = USE_FROM_PTR (use_p);
5257 if (use == dataref_ptr)
5258 SET_USE (use_p, new_dataref_ptr);
5259 else
5260 gcc_assert (operand_equal_p (use, update, 0));
5263 return new_dataref_ptr;
5267 /* Copy memory reference info such as base/clique from the SRC reference
5268 to the DEST MEM_REF. */
5270 void
5271 vect_copy_ref_info (tree dest, tree src)
5273 if (TREE_CODE (dest) != MEM_REF)
5274 return;
5276 tree src_base = src;
5277 while (handled_component_p (src_base))
5278 src_base = TREE_OPERAND (src_base, 0);
5279 if (TREE_CODE (src_base) != MEM_REF
5280 && TREE_CODE (src_base) != TARGET_MEM_REF)
5281 return;
5283 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5284 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5288 /* Function vect_create_destination_var.
5290 Create a new temporary of type VECTYPE. */
5292 tree
5293 vect_create_destination_var (tree scalar_dest, tree vectype)
5295 tree vec_dest;
5296 const char *name;
5297 char *new_name;
5298 tree type;
5299 enum vect_var_kind kind;
5301 kind = vectype
5302 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5303 ? vect_mask_var
5304 : vect_simple_var
5305 : vect_scalar_var;
5306 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5308 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5310 name = get_name (scalar_dest);
5311 if (name)
5312 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5313 else
5314 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5315 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5316 free (new_name);
5318 return vec_dest;
5321 /* Function vect_grouped_store_supported.
5323 Returns TRUE if interleave high and interleave low permutations
5324 are supported, and FALSE otherwise. */
5326 bool
5327 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5329 machine_mode mode = TYPE_MODE (vectype);
5331 /* vect_permute_store_chain requires the group size to be equal to 3 or
5332 be a power of two. */
5333 if (count != 3 && exact_log2 (count) == -1)
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5337 "the size of the group of accesses"
5338 " is not a power of 2 or not eqaul to 3\n");
5339 return false;
5342 /* Check that the permutation is supported. */
5343 if (VECTOR_MODE_P (mode))
5345 unsigned int i;
5346 if (count == 3)
5348 unsigned int j0 = 0, j1 = 0, j2 = 0;
5349 unsigned int i, j;
5351 unsigned int nelt;
5352 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5354 if (dump_enabled_p ())
5355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5356 "cannot handle groups of 3 stores for"
5357 " variable-length vectors\n");
5358 return false;
5361 vec_perm_builder sel (nelt, nelt, 1);
5362 sel.quick_grow (nelt);
5363 vec_perm_indices indices;
5364 for (j = 0; j < 3; j++)
5366 int nelt0 = ((3 - j) * nelt) % 3;
5367 int nelt1 = ((3 - j) * nelt + 1) % 3;
5368 int nelt2 = ((3 - j) * nelt + 2) % 3;
5369 for (i = 0; i < nelt; i++)
5371 if (3 * i + nelt0 < nelt)
5372 sel[3 * i + nelt0] = j0++;
5373 if (3 * i + nelt1 < nelt)
5374 sel[3 * i + nelt1] = nelt + j1++;
5375 if (3 * i + nelt2 < nelt)
5376 sel[3 * i + nelt2] = 0;
5378 indices.new_vector (sel, 2, nelt);
5379 if (!can_vec_perm_const_p (mode, mode, indices))
5381 if (dump_enabled_p ())
5382 dump_printf (MSG_MISSED_OPTIMIZATION,
5383 "permutation op not supported by target.\n");
5384 return false;
5387 for (i = 0; i < nelt; i++)
5389 if (3 * i + nelt0 < nelt)
5390 sel[3 * i + nelt0] = 3 * i + nelt0;
5391 if (3 * i + nelt1 < nelt)
5392 sel[3 * i + nelt1] = 3 * i + nelt1;
5393 if (3 * i + nelt2 < nelt)
5394 sel[3 * i + nelt2] = nelt + j2++;
5396 indices.new_vector (sel, 2, nelt);
5397 if (!can_vec_perm_const_p (mode, mode, indices))
5399 if (dump_enabled_p ())
5400 dump_printf (MSG_MISSED_OPTIMIZATION,
5401 "permutation op not supported by target.\n");
5402 return false;
5405 return true;
5407 else
5409 /* If length is not equal to 3 then only power of 2 is supported. */
5410 gcc_assert (pow2p_hwi (count));
5411 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5413 /* The encoding has 2 interleaved stepped patterns. */
5414 if (!multiple_p (nelt, 2))
5415 return false;
5416 vec_perm_builder sel (nelt, 2, 3);
5417 sel.quick_grow (6);
5418 for (i = 0; i < 3; i++)
5420 sel[i * 2] = i;
5421 sel[i * 2 + 1] = i + nelt;
5423 vec_perm_indices indices (sel, 2, nelt);
5424 if (can_vec_perm_const_p (mode, mode, indices))
5426 for (i = 0; i < 6; i++)
5427 sel[i] += exact_div (nelt, 2);
5428 indices.new_vector (sel, 2, nelt);
5429 if (can_vec_perm_const_p (mode, mode, indices))
5430 return true;
5435 if (dump_enabled_p ())
5436 dump_printf (MSG_MISSED_OPTIMIZATION,
5437 "permutation op not supported by target.\n");
5438 return false;
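/* Illustrative sketch, not part of GCC: the power-of-two path above asks the
   target whether it can perform the two "interleave" permutations whose
   selectors are encoded with two interleaved stepped patterns.  The standalone
   program below (plain C++, no GCC APIs; all names are local assumptions of
   the sketch) simply expands those selectors for an 8-element vector so they
   can be compared with the masks described in vect_permute_store_chain.

     #include <cstdio>

     static void
     print_interleave_sels (unsigned nelt)
     {
       printf ("high:");                 // { 0, nelt, 1, nelt + 1, ... }
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", i, nelt + i);
       printf ("\nlow: ");               // { nelt/2, nelt * 3/2, ... }
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", nelt / 2 + i, nelt + nelt / 2 + i);
       printf ("\n");
     }

     int
     main ()
     {
       print_interleave_sels (8);        // high: 0 8 1 9 2 10 3 11
       return 0;                         // low:  4 12 5 13 6 14 7 15
     }

   As with VEC_PERM_EXPR, indices 0..nelt-1 select from the first input vector
   and nelt..2*nelt-1 from the second.  */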
5442 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5443 type VECTYPE. MASKED_P says whether the masked form is needed. */
5445 bool
5446 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5447 bool masked_p)
5449 if (masked_p)
5450 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5451 vec_mask_store_lanes_optab,
5452 vectype, count);
5453 else
5454 return vect_lanes_optab_supported_p ("vec_store_lanes",
5455 vec_store_lanes_optab,
5456 vectype, count);
5460 /* Function vect_permute_store_chain.
5462 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5463 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5464 the data correctly for the stores. Return the final references for stores
5465 in RESULT_CHAIN.
5467 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5468 The input is 4 vectors each containing 8 elements. We assign a number to
5469 each element, the input sequence is:
5471 1st vec: 0 1 2 3 4 5 6 7
5472 2nd vec: 8 9 10 11 12 13 14 15
5473 3rd vec: 16 17 18 19 20 21 22 23
5474 4th vec: 24 25 26 27 28 29 30 31
5476 The output sequence should be:
5478 1st vec: 0 8 16 24 1 9 17 25
5479 2nd vec: 2 10 18 26 3 11 19 27
5480 3rd vec: 4 12 20 28 5 13 21 29
5481 4th vec: 6 14 22 30 7 15 23 31
5483 i.e., we interleave the contents of the four vectors in their order.
5485 We use interleave_high/low instructions to create such output. The input of
5486 each interleave_high/low operation is two vectors:
5487 1st vec 2nd vec
5488 0 1 2 3 4 5 6 7
5489 the even elements of the result vector are obtained left-to-right from the
5490 high/low elements of the first vector. The odd elements of the result are
5491 obtained left-to-right from the high/low elements of the second vector.
5492 The output of interleave_high will be: 0 4 1 5
5493 and of interleave_low: 2 6 3 7
5496 The permutation is done in log LENGTH stages. In each stage interleave_high
5497 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5498 where the first argument is taken from the first half of DR_CHAIN and the
5499 second argument from its second half.
5500 In our example,
5502 I1: interleave_high (1st vec, 3rd vec)
5503 I2: interleave_low (1st vec, 3rd vec)
5504 I3: interleave_high (2nd vec, 4th vec)
5505 I4: interleave_low (2nd vec, 4th vec)
5507 The output for the first stage is:
5509 I1: 0 16 1 17 2 18 3 19
5510 I2: 4 20 5 21 6 22 7 23
5511 I3: 8 24 9 25 10 26 11 27
5512 I4: 12 28 13 29 14 30 15 31
5514 The output of the second stage, i.e. the final result is:
5516 I1: 0 8 16 24 1 9 17 25
5517 I2: 2 10 18 26 3 11 19 27
5518 I3: 4 12 20 28 5 13 21 29
5519 I4: 6 14 22 30 7 15 23 31. */
5521 void
5522 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5523 unsigned int length,
5524 stmt_vec_info stmt_info,
5525 gimple_stmt_iterator *gsi,
5526 vec<tree> *result_chain)
5528 tree vect1, vect2, high, low;
5529 gimple *perm_stmt;
5530 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5531 tree perm_mask_low, perm_mask_high;
5532 tree data_ref;
5533 tree perm3_mask_low, perm3_mask_high;
5534 unsigned int i, j, n, log_length = exact_log2 (length);
5536 result_chain->quick_grow (length);
5537 memcpy (result_chain->address (), dr_chain.address (),
5538 length * sizeof (tree));
5540 if (length == 3)
5542 /* vect_grouped_store_supported ensures that this is constant. */
5543 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5544 unsigned int j0 = 0, j1 = 0, j2 = 0;
5546 vec_perm_builder sel (nelt, nelt, 1);
5547 sel.quick_grow (nelt);
5548 vec_perm_indices indices;
5549 for (j = 0; j < 3; j++)
5551 int nelt0 = ((3 - j) * nelt) % 3;
5552 int nelt1 = ((3 - j) * nelt + 1) % 3;
5553 int nelt2 = ((3 - j) * nelt + 2) % 3;
5555 for (i = 0; i < nelt; i++)
5557 if (3 * i + nelt0 < nelt)
5558 sel[3 * i + nelt0] = j0++;
5559 if (3 * i + nelt1 < nelt)
5560 sel[3 * i + nelt1] = nelt + j1++;
5561 if (3 * i + nelt2 < nelt)
5562 sel[3 * i + nelt2] = 0;
5564 indices.new_vector (sel, 2, nelt);
5565 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5567 for (i = 0; i < nelt; i++)
5569 if (3 * i + nelt0 < nelt)
5570 sel[3 * i + nelt0] = 3 * i + nelt0;
5571 if (3 * i + nelt1 < nelt)
5572 sel[3 * i + nelt1] = 3 * i + nelt1;
5573 if (3 * i + nelt2 < nelt)
5574 sel[3 * i + nelt2] = nelt + j2++;
5576 indices.new_vector (sel, 2, nelt);
5577 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5579 vect1 = dr_chain[0];
5580 vect2 = dr_chain[1];
5582 /* Create interleaving stmt:
5583 low = VEC_PERM_EXPR <vect1, vect2,
5584 {j, nelt, *, j + 1, nelt + j + 1, *,
5585 j + 2, nelt + j + 2, *, ...}> */
5586 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5587 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5588 vect2, perm3_mask_low);
5589 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5591 vect1 = data_ref;
5592 vect2 = dr_chain[2];
5593 /* Create interleaving stmt:
5594 low = VEC_PERM_EXPR <vect1, vect2,
5595 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5596 6, 7, nelt + j + 2, ...}> */
5597 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5598 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5599 vect2, perm3_mask_high);
5600 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5601 (*result_chain)[j] = data_ref;
5604 else
5606 /* If length is not equal to 3 then only power of 2 is supported. */
5607 gcc_assert (pow2p_hwi (length));
5609 /* The encoding has 2 interleaved stepped patterns. */
5610 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5611 vec_perm_builder sel (nelt, 2, 3);
5612 sel.quick_grow (6);
5613 for (i = 0; i < 3; i++)
5615 sel[i * 2] = i;
5616 sel[i * 2 + 1] = i + nelt;
5618 vec_perm_indices indices (sel, 2, nelt);
5619 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5621 for (i = 0; i < 6; i++)
5622 sel[i] += exact_div (nelt, 2);
5623 indices.new_vector (sel, 2, nelt);
5624 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5626 for (i = 0, n = log_length; i < n; i++)
5628 for (j = 0; j < length/2; j++)
5630 vect1 = dr_chain[j];
5631 vect2 = dr_chain[j+length/2];
5633 /* Create interleaving stmt:
5634 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5635 ...}> */
5636 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5637 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5638 vect2, perm_mask_high);
5639 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5640 (*result_chain)[2*j] = high;
5642 /* Create interleaving stmt:
5643 low = VEC_PERM_EXPR <vect1, vect2,
5644 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5645 ...}> */
5646 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5647 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5648 vect2, perm_mask_low);
5649 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5650 (*result_chain)[2*j+1] = low;
5652 memcpy (dr_chain.address (), result_chain->address (),
5653 length * sizeof (tree));
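/* Illustrative sketch, not part of GCC: the worked example in the comment
   above can be reproduced with plain arrays.  The standalone program below
   models each VEC_PERM_EXPR of the power-of-two path as an interleave of the
   low (respectively high) halves of two input "vectors" and runs the
   log2 (LENGTH) stages on the 4 x 8 example, printing the documented result
   chain.  Names and types here are assumptions of the sketch only.

     #include <cstdio>
     #include <vector>

     // high[2k] <- a[k], high[2k+1] <- b[k]; low takes the upper halves.
     static void
     interleave (const std::vector<int> &a, const std::vector<int> &b,
                 std::vector<int> &high, std::vector<int> &low)
     {
       unsigned n = a.size ();
       high.resize (n);
       low.resize (n);
       for (unsigned k = 0; k < n / 2; k++)
         {
           high[2 * k] = a[k];
           high[2 * k + 1] = b[k];
           low[2 * k] = a[n / 2 + k];
           low[2 * k + 1] = b[n / 2 + k];
         }
     }

     int
     main ()
     {
       std::vector<std::vector<int>> chain (4, std::vector<int> (8));
       for (int j = 0; j < 4; j++)
         for (int i = 0; i < 8; i++)
           chain[j][i] = j * 8 + i;      // the element numbering used above

       for (int stage = 0; stage < 2; stage++)   // log2 (4) stages
         {
           std::vector<std::vector<int>> next (4);
           for (int j = 0; j < 2; j++)
             interleave (chain[j], chain[j + 2], next[2 * j], next[2 * j + 1]);
           chain = next;
         }

       for (int j = 0; j < 4; j++)       // prints 0 8 16 24 1 9 17 25, etc.
         {
           for (int i = 0; i < 8; i++)
             printf ("%d ", chain[j][i]);
           printf ("\n");
         }
       return 0;
     }
   */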
5658 /* Function vect_setup_realignment
5660 This function is called when vectorizing an unaligned load using
5661 the dr_explicit_realign[_optimized] scheme.
5662 This function generates the following code at the loop prolog:
5664 p = initial_addr;
5665 x msq_init = *(floor(p)); # prolog load
5666 realignment_token = call target_builtin;
5667 loop:
5668 x msq = phi (msq_init, ---)
5670 The stmts marked with x are generated only for the case of
5671 dr_explicit_realign_optimized.
5673 The code above sets up a new (vector) pointer, pointing to the first
5674 location accessed by STMT_INFO, and a "floor-aligned" load using that
5675 pointer. It also generates code to compute the "realignment-token"
5676 (if the relevant target hook was defined), and creates a phi-node at the
5677 loop-header bb whose arguments are the result of the prolog-load (created
5678 by this function) and the result of a load that takes place in the loop
5679 (to be created by the caller to this function).
5681 For the case of dr_explicit_realign_optimized:
5682 The caller to this function uses the phi-result (msq) to create the
5683 realignment code inside the loop, and sets up the missing phi argument,
5684 as follows:
5685 loop:
5686 msq = phi (msq_init, lsq)
5687 lsq = *(floor(p')); # load in loop
5688 result = realign_load (msq, lsq, realignment_token);
5690 For the case of dr_explicit_realign:
5691 loop:
5692 msq = *(floor(p)); # load in loop
5693 p' = p + (VS-1);
5694 lsq = *(floor(p')); # load in loop
5695 result = realign_load (msq, lsq, realignment_token);
5697 Input:
5698 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5699 a memory location that may be unaligned.
5700 BSI - place where new code is to be inserted.
5701 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5702 is used.
5704 Output:
5705 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5706 target hook, if defined.
5707 Return value - the result of the loop-header phi node. */
5709 tree
5710 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5711 gimple_stmt_iterator *gsi, tree *realignment_token,
5712 enum dr_alignment_support alignment_support_scheme,
5713 tree init_addr,
5714 class loop **at_loop)
5716 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5717 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5718 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5719 struct data_reference *dr = dr_info->dr;
5720 class loop *loop = NULL;
5721 edge pe = NULL;
5722 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5723 tree vec_dest;
5724 gimple *inc;
5725 tree ptr;
5726 tree data_ref;
5727 basic_block new_bb;
5728 tree msq_init = NULL_TREE;
5729 tree new_temp;
5730 gphi *phi_stmt;
5731 tree msq = NULL_TREE;
5732 gimple_seq stmts = NULL;
5733 bool compute_in_loop = false;
5734 bool nested_in_vect_loop = false;
5735 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5736 class loop *loop_for_initial_load = NULL;
5738 if (loop_vinfo)
5740 loop = LOOP_VINFO_LOOP (loop_vinfo);
5741 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5744 gcc_assert (alignment_support_scheme == dr_explicit_realign
5745 || alignment_support_scheme == dr_explicit_realign_optimized);
5747 /* We need to generate three things:
5748 1. the misalignment computation
5749 2. the extra vector load (for the optimized realignment scheme).
5750 3. the phi node for the two vectors from which the realignment is
5751 done (for the optimized realignment scheme). */
5753 /* 1. Determine where to generate the misalignment computation.
5755 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5756 calculation will be generated by this function, outside the loop (in the
5757 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5758 caller, inside the loop.
5760 Background: If the misalignment remains fixed throughout the iterations of
5761 the loop, then both realignment schemes are applicable, and also the
5762 misalignment computation can be done outside LOOP. This is because we are
5763 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5764 are a multiple of VS (the Vector Size), and therefore the misalignment in
5765 different vectorized LOOP iterations is always the same.
5766 The problem arises only if the memory access is in an inner-loop nested
5767 inside LOOP, which is now being vectorized using outer-loop vectorization.
5768 This is the only case when the misalignment of the memory access may not
5769 remain fixed throughout the iterations of the inner-loop (as explained in
5770 detail in vect_supportable_dr_alignment). In this case, not only is the
5771 optimized realignment scheme not applicable, but also the misalignment
5772 computation (and generation of the realignment token that is passed to
5773 REALIGN_LOAD) have to be done inside the loop.
5775 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5776 or not, which in turn determines if the misalignment is computed inside
5777 the inner-loop, or outside LOOP. */
5779 if (init_addr != NULL_TREE || !loop_vinfo)
5781 compute_in_loop = true;
5782 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5786 /* 2. Determine where to generate the extra vector load.
5788 For the optimized realignment scheme, instead of generating two vector
5789 loads in each iteration, we generate a single extra vector load in the
5790 preheader of the loop, and in each iteration reuse the result of the
5791 vector load from the previous iteration. In case the memory access is in
5792 an inner-loop nested inside LOOP, which is now being vectorized using
5793 outer-loop vectorization, we need to determine whether this initial vector
5794 load should be generated at the preheader of the inner-loop, or can be
5795 generated at the preheader of LOOP. If the memory access has no evolution
5796 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5797 to be generated inside LOOP (in the preheader of the inner-loop). */
5799 if (nested_in_vect_loop)
5801 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5802 bool invariant_in_outerloop =
5803 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5804 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5806 else
5807 loop_for_initial_load = loop;
5808 if (at_loop)
5809 *at_loop = loop_for_initial_load;
5811 tree vuse = NULL_TREE;
5812 if (loop_for_initial_load)
5814 pe = loop_preheader_edge (loop_for_initial_load);
5815 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5816 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5818 if (!vuse)
5819 vuse = gimple_vuse (gsi_stmt (*gsi));
5821 /* 3. For the case of the optimized realignment, create the first vector
5822 load at the loop preheader. */
5824 if (alignment_support_scheme == dr_explicit_realign_optimized)
5826 /* Create msq_init = *(floor(p1)) in the loop preheader */
5827 gassign *new_stmt;
5829 gcc_assert (!compute_in_loop);
5830 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5831 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5832 loop_for_initial_load, NULL_TREE,
5833 &init_addr, NULL, &inc, true);
5834 if (TREE_CODE (ptr) == SSA_NAME)
5835 new_temp = copy_ssa_name (ptr);
5836 else
5837 new_temp = make_ssa_name (TREE_TYPE (ptr));
5838 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5839 tree type = TREE_TYPE (ptr);
5840 new_stmt = gimple_build_assign
5841 (new_temp, BIT_AND_EXPR, ptr,
5842 fold_build2 (MINUS_EXPR, type,
5843 build_int_cst (type, 0),
5844 build_int_cst (type, align)));
5845 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5846 gcc_assert (!new_bb);
5847 data_ref
5848 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5849 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5850 vect_copy_ref_info (data_ref, DR_REF (dr));
5851 new_stmt = gimple_build_assign (vec_dest, data_ref);
5852 new_temp = make_ssa_name (vec_dest, new_stmt);
5853 gimple_assign_set_lhs (new_stmt, new_temp);
5854 gimple_set_vuse (new_stmt, vuse);
5855 if (pe)
5857 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5858 gcc_assert (!new_bb);
5860 else
5861 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5863 msq_init = gimple_assign_lhs (new_stmt);
5866 /* 4. Create realignment token using a target builtin, if available.
5867 It is done either inside the containing loop, or before LOOP (as
5868 determined above). */
5870 if (targetm.vectorize.builtin_mask_for_load)
5872 gcall *new_stmt;
5873 tree builtin_decl;
5875 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5876 if (!init_addr)
5878 /* Generate the INIT_ADDR computation outside LOOP. */
5879 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5880 stmt_info, &stmts,
5881 NULL_TREE);
5882 if (loop)
5884 pe = loop_preheader_edge (loop);
5885 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5886 gcc_assert (!new_bb);
5888 else
5889 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5892 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5893 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5894 vec_dest =
5895 vect_create_destination_var (scalar_dest,
5896 gimple_call_return_type (new_stmt));
5897 new_temp = make_ssa_name (vec_dest, new_stmt);
5898 gimple_call_set_lhs (new_stmt, new_temp);
5900 if (compute_in_loop)
5901 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5902 else
5904 /* Generate the misalignment computation outside LOOP. */
5905 pe = loop_preheader_edge (loop);
5906 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5907 gcc_assert (!new_bb);
5910 *realignment_token = gimple_call_lhs (new_stmt);
5912 /* The result of the CALL_EXPR to this builtin is determined from
5913 the value of the parameter and no global variables are touched
5914 which makes the builtin a "const" function. Requiring the
5915 builtin to have the "const" attribute makes it unnecessary
5916 to call mark_call_clobbered. */
5917 gcc_assert (TREE_READONLY (builtin_decl));
5920 if (alignment_support_scheme == dr_explicit_realign)
5921 return msq;
5923 gcc_assert (!compute_in_loop);
5924 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5927 /* 5. Create msq = phi <msq_init, lsq> in loop */
5929 pe = loop_preheader_edge (containing_loop);
5930 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5931 msq = make_ssa_name (vec_dest);
5932 phi_stmt = create_phi_node (msq, containing_loop->header);
5933 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5935 return msq;
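/* Illustrative sketch, not part of GCC: a scalar model of what the
   REALIGN_LOAD used by both explicit realignment schemes computes.  Two
   aligned loads (msq from floor (p), lsq from the next aligned address) are
   combined according to a "realignment token", modelled here simply as the
   element misalignment; real targets encode it as a permute mask produced by
   builtin_mask_for_load.  Standalone C++ under the assumption of 4-element
   vectors of uint32_t.

     #include <cstdint>
     #include <cstring>
     #include <cstdio>

     struct v4si { uint32_t e[4]; };

     static v4si
     realign_load (const v4si &msq, const v4si &lsq, unsigned token)
     {
       // Whole-vector shift of the concatenation <msq, lsq> by TOKEN elements.
       v4si r;
       for (unsigned i = 0; i < 4; i++)
         r.e[i] = token + i < 4 ? msq.e[token + i] : lsq.e[token + i - 4];
       return r;
     }

     int
     main ()
     {
       uint32_t a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
       unsigned token = 1;               // &a[1] is one element past alignment
       v4si msq, lsq;
       memcpy (&msq, &a[0], sizeof msq); // floor-aligned (prolog or loop) load
       memcpy (&lsq, &a[4], sizeof lsq); // next aligned load
       v4si r = realign_load (msq, lsq, token);
       for (unsigned i = 0; i < 4; i++)
         printf ("%u ", (unsigned) r.e[i]);   // prints 1 2 3 4
       return 0;
     }
   */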
5939 /* Function vect_grouped_load_supported.
5941 COUNT is the size of the load group (the number of statements plus the
5942 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5943 only one statement, with a gap of COUNT - 1.
5945 Returns true if a suitable permute exists. */
5947 bool
5948 vect_grouped_load_supported (tree vectype, bool single_element_p,
5949 unsigned HOST_WIDE_INT count)
5951 machine_mode mode = TYPE_MODE (vectype);
5953 /* If this is single-element interleaving with an element distance
5954 that leaves unused vector loads around punt - we at least create
5955 very sub-optimal code in that case (and blow up memory,
5956 see PR65518). */
5957 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5959 if (dump_enabled_p ())
5960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5961 "single-element interleaving not supported "
5962 "for not adjacent vector loads\n");
5963 return false;
5966 /* vect_permute_load_chain requires the group size to be equal to 3 or
5967 be a power of two. */
5968 if (count != 3 && exact_log2 (count) == -1)
5970 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5972 "the size of the group of accesses"
5973 " is not a power of 2 or not equal to 3\n");
5974 return false;
5977 /* Check that the permutation is supported. */
5978 if (VECTOR_MODE_P (mode))
5980 unsigned int i, j;
5981 if (count == 3)
5983 unsigned int nelt;
5984 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5988 "cannot handle groups of 3 loads for"
5989 " variable-length vectors\n");
5990 return false;
5993 vec_perm_builder sel (nelt, nelt, 1);
5994 sel.quick_grow (nelt);
5995 vec_perm_indices indices;
5996 unsigned int k;
5997 for (k = 0; k < 3; k++)
5999 for (i = 0; i < nelt; i++)
6000 if (3 * i + k < 2 * nelt)
6001 sel[i] = 3 * i + k;
6002 else
6003 sel[i] = 0;
6004 indices.new_vector (sel, 2, nelt);
6005 if (!can_vec_perm_const_p (mode, mode, indices))
6007 if (dump_enabled_p ())
6008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6009 "shuffle of 3 loads is not supported by"
6010 " target\n");
6011 return false;
6013 for (i = 0, j = 0; i < nelt; i++)
6014 if (3 * i + k < 2 * nelt)
6015 sel[i] = i;
6016 else
6017 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6018 indices.new_vector (sel, 2, nelt);
6019 if (!can_vec_perm_const_p (mode, mode, indices))
6021 if (dump_enabled_p ())
6022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6023 "shuffle of 3 loads is not supported by"
6024 " target\n");
6025 return false;
6028 return true;
6030 else
6032 /* If length is not equal to 3 then only power of 2 is supported. */
6033 gcc_assert (pow2p_hwi (count));
6034 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6036 /* The encoding has a single stepped pattern. */
6037 vec_perm_builder sel (nelt, 1, 3);
6038 sel.quick_grow (3);
6039 for (i = 0; i < 3; i++)
6040 sel[i] = i * 2;
6041 vec_perm_indices indices (sel, 2, nelt);
6042 if (can_vec_perm_const_p (mode, mode, indices))
6044 for (i = 0; i < 3; i++)
6045 sel[i] = i * 2 + 1;
6046 indices.new_vector (sel, 2, nelt);
6047 if (can_vec_perm_const_p (mode, mode, indices))
6048 return true;
6053 if (dump_enabled_p ())
6054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6055 "extract even/odd not supported by target\n");
6056 return false;
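/* Illustrative sketch, not part of GCC: the "single stepped pattern" encoding
   used above describes a full selector by NPATTERNS interleaved sequences of
   which only 3 leading elements are stored; later elements continue with the
   step between the 2nd and 3rd stored element.  The standalone helper below
   is an assumption of this sketch (it is not the real vec_perm_builder API);
   it expands such an encoding and shows that the seed {0, 2, 4} yields the
   extract-even selector for 8-element vectors.

     #include <cstdio>
     #include <vector>

     static std::vector<long>
     expand_encoding (const std::vector<long> &seed, unsigned npatterns,
                      unsigned full_nelts)
     {
       std::vector<long> sel (full_nelts);
       for (unsigned i = 0; i < full_nelts; i++)
         {
           unsigned pat = i % npatterns;   // which interleaved pattern
           unsigned idx = i / npatterns;   // position within that pattern
           if (idx < 3)
             sel[i] = seed[idx * npatterns + pat];
           else
             {
               long step = seed[2 * npatterns + pat] - seed[npatterns + pat];
               sel[i] = seed[2 * npatterns + pat] + (idx - 2) * step;
             }
         }
       return sel;
     }

     int
     main ()
     {
       std::vector<long> even = expand_encoding ({0, 2, 4}, 1, 8);
       for (long e : even)
         printf ("%ld ", e);              // prints 0 2 4 6 8 10 12 14
       printf ("\n");
       return 0;
     }

   The extract-odd selector is obtained the same way from the seed {1, 3, 5}.  */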
6059 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6060 type VECTYPE. MASKED_P says whether the masked form is needed. */
6062 bool
6063 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6064 bool masked_p)
6066 if (masked_p)
6067 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6068 vec_mask_load_lanes_optab,
6069 vectype, count);
6070 else
6071 return vect_lanes_optab_supported_p ("vec_load_lanes",
6072 vec_load_lanes_optab,
6073 vectype, count);
6076 /* Function vect_permute_load_chain.
6078 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6079 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6080 the input data correctly. Return the final references for loads in
6081 RESULT_CHAIN.
6083 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6084 The input is 4 vectors each containing 8 elements. We assign a number to each
6085 element, the input sequence is:
6087 1st vec: 0 1 2 3 4 5 6 7
6088 2nd vec: 8 9 10 11 12 13 14 15
6089 3rd vec: 16 17 18 19 20 21 22 23
6090 4th vec: 24 25 26 27 28 29 30 31
6092 The output sequence should be:
6094 1st vec: 0 4 8 12 16 20 24 28
6095 2nd vec: 1 5 9 13 17 21 25 29
6096 3rd vec: 2 6 10 14 18 22 26 30
6097 4th vec: 3 7 11 15 19 23 27 31
6099 i.e., the first output vector should contain the first elements of each
6100 interleaving group, etc.
6102 We use extract_even/odd instructions to create such output. The input of
6103 each extract_even/odd operation is two vectors
6104 1st vec 2nd vec
6105 0 1 2 3 4 5 6 7
6107 and the output is the vector of extracted even/odd elements. The output of
6108 extract_even will be: 0 2 4 6
6109 and of extract_odd: 1 3 5 7
6112 The permutation is done in log LENGTH stages. In each stage extract_even
6113 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6114 their order. In our example,
6116 E1: extract_even (1st vec, 2nd vec)
6117 E2: extract_odd (1st vec, 2nd vec)
6118 E3: extract_even (3rd vec, 4th vec)
6119 E4: extract_odd (3rd vec, 4th vec)
6121 The output for the first stage will be:
6123 E1: 0 2 4 6 8 10 12 14
6124 E2: 1 3 5 7 9 11 13 15
6125 E3: 16 18 20 22 24 26 28 30
6126 E4: 17 19 21 23 25 27 29 31
6128 In order to proceed and create the correct sequence for the next stage (or
6129 for the correct output, if the second stage is the last one, as in our
6130 example), we first put the output of extract_even operation and then the
6131 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6132 The input for the second stage is:
6134 1st vec (E1): 0 2 4 6 8 10 12 14
6135 2nd vec (E3): 16 18 20 22 24 26 28 30
6136 3rd vec (E2): 1 3 5 7 9 11 13 15
6137 4th vec (E4): 17 19 21 23 25 27 29 31
6139 The output of the second stage:
6141 E1: 0 4 8 12 16 20 24 28
6142 E2: 2 6 10 14 18 22 26 30
6143 E3: 1 5 9 13 17 21 25 29
6144 E4: 3 7 11 15 19 23 27 31
6146 And RESULT_CHAIN after reordering:
6148 1st vec (E1): 0 4 8 12 16 20 24 28
6149 2nd vec (E3): 1 5 9 13 17 21 25 29
6150 3rd vec (E2): 2 6 10 14 18 22 26 30
6151 4th vec (E4): 3 7 11 15 19 23 27 31. */
6153 static void
6154 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6155 unsigned int length,
6156 stmt_vec_info stmt_info,
6157 gimple_stmt_iterator *gsi,
6158 vec<tree> *result_chain)
6160 tree data_ref, first_vect, second_vect;
6161 tree perm_mask_even, perm_mask_odd;
6162 tree perm3_mask_low, perm3_mask_high;
6163 gimple *perm_stmt;
6164 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6165 unsigned int i, j, log_length = exact_log2 (length);
6167 result_chain->quick_grow (length);
6168 memcpy (result_chain->address (), dr_chain.address (),
6169 length * sizeof (tree));
6171 if (length == 3)
6173 /* vect_grouped_load_supported ensures that this is constant. */
6174 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6175 unsigned int k;
6177 vec_perm_builder sel (nelt, nelt, 1);
6178 sel.quick_grow (nelt);
6179 vec_perm_indices indices;
6180 for (k = 0; k < 3; k++)
6182 for (i = 0; i < nelt; i++)
6183 if (3 * i + k < 2 * nelt)
6184 sel[i] = 3 * i + k;
6185 else
6186 sel[i] = 0;
6187 indices.new_vector (sel, 2, nelt);
6188 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6190 for (i = 0, j = 0; i < nelt; i++)
6191 if (3 * i + k < 2 * nelt)
6192 sel[i] = i;
6193 else
6194 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6195 indices.new_vector (sel, 2, nelt);
6196 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6198 first_vect = dr_chain[0];
6199 second_vect = dr_chain[1];
6201 /* Create interleaving stmt (low part of):
6202 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6203 ...}> */
6204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6205 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6206 second_vect, perm3_mask_low);
6207 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6209 /* Create interleaving stmt (high part of):
6210 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6211 ...}> */
6212 first_vect = data_ref;
6213 second_vect = dr_chain[2];
6214 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6215 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6216 second_vect, perm3_mask_high);
6217 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6218 (*result_chain)[k] = data_ref;
6221 else
6223 /* If length is not equal to 3 then only power of 2 is supported. */
6224 gcc_assert (pow2p_hwi (length));
6226 /* The encoding has a single stepped pattern. */
6227 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6228 vec_perm_builder sel (nelt, 1, 3);
6229 sel.quick_grow (3);
6230 for (i = 0; i < 3; ++i)
6231 sel[i] = i * 2;
6232 vec_perm_indices indices (sel, 2, nelt);
6233 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6235 for (i = 0; i < 3; ++i)
6236 sel[i] = i * 2 + 1;
6237 indices.new_vector (sel, 2, nelt);
6238 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6240 for (i = 0; i < log_length; i++)
6242 for (j = 0; j < length; j += 2)
6244 first_vect = dr_chain[j];
6245 second_vect = dr_chain[j+1];
6247 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6248 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6249 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6250 first_vect, second_vect,
6251 perm_mask_even);
6252 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6253 (*result_chain)[j/2] = data_ref;
6255 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6256 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6257 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6258 first_vect, second_vect,
6259 perm_mask_odd);
6260 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6261 (*result_chain)[j/2+length/2] = data_ref;
6263 memcpy (dr_chain.address (), result_chain->address (),
6264 length * sizeof (tree));
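/* Illustrative sketch, not part of GCC: the extract-even/odd algorithm in the
   comment above, modelled on plain arrays.  Each stage writes the even
   extraction of a pair into the first half of the result chain and the odd
   extraction into the second half, exactly like the loop above, and the 4 x 8
   example ends up fully de-interleaved.  Standalone C++; all names are local
   to the sketch.

     #include <cstdio>
     #include <vector>

     // Elements PARITY, PARITY + 2, PARITY + 4, ... of the concatenation <a, b>.
     static std::vector<int>
     extract (const std::vector<int> &a, const std::vector<int> &b, int parity)
     {
       std::vector<int> r (a.size ());
       for (unsigned i = 0; i < a.size (); i++)
         {
           unsigned src = 2 * i + parity;
           r[i] = src < a.size () ? a[src] : b[src - a.size ()];
         }
       return r;
     }

     int
     main ()
     {
       const unsigned length = 4, nelt = 8;
       std::vector<std::vector<int>> chain (length, std::vector<int> (nelt));
       for (unsigned j = 0; j < length; j++)
         for (unsigned i = 0; i < nelt; i++)
           chain[j][i] = j * nelt + i;

       for (unsigned stage = 0; stage < 2; stage++)   // log2 (length) stages
         {
           std::vector<std::vector<int>> result (length);
           for (unsigned j = 0; j < length; j += 2)
             {
               result[j / 2] = extract (chain[j], chain[j + 1], 0);
               result[j / 2 + length / 2] = extract (chain[j], chain[j + 1], 1);
             }
           chain = result;
         }

       for (unsigned j = 0; j < length; j++)  // 0 4 8 12 ... / 1 5 9 13 ... / ...
         {
           for (unsigned i = 0; i < nelt; i++)
             printf ("%d ", chain[j][i]);
           printf ("\n");
         }
       return 0;
     }
   */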
6269 /* Function vect_shift_permute_load_chain.
6271 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6272 sequence of stmts to reorder the input data accordingly.
6273 Return the final references for loads in RESULT_CHAIN.
6274 Return true if successful, false otherwise.
6276 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6277 The input is 3 vectors each containing 8 elements. We assign a
6278 number to each element, the input sequence is:
6280 1st vec: 0 1 2 3 4 5 6 7
6281 2nd vec: 8 9 10 11 12 13 14 15
6282 3rd vec: 16 17 18 19 20 21 22 23
6284 The output sequence should be:
6286 1st vec: 0 3 6 9 12 15 18 21
6287 2nd vec: 1 4 7 10 13 16 19 22
6288 3rd vec: 2 5 8 11 14 17 20 23
6290 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6292 First we shuffle all 3 vectors to get correct elements order:
6294 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6295 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6296 3rd vec: (16 19 22) (17 20 23) (18 21)
6298 Next we unite and shift vector 3 times:
6300 1st step:
6301 shift right by 6 the concatenation of:
6302 "1st vec" and "2nd vec"
6303 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6304 "2nd vec" and "3rd vec"
6305 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6306 "3rd vec" and "1st vec"
6307 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6308 | New vectors |
6310 So that now new vectors are:
6312 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6313 2nd vec: (10 13) (16 19 22) (17 20 23)
6314 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6316 2nd step:
6317 shift right by 5 the concatenation of:
6318 "1st vec" and "3rd vec"
6319 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6320 "2nd vec" and "1st vec"
6321 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6322 "3rd vec" and "2nd vec"
6323 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6324 | New vectors |
6326 So that now new vectors are:
6328 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6329 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6330 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6332 3rd step:
6333 shift right by 5 the concatenation of:
6334 "1st vec" and "1st vec"
6335 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6336 shift right by 3 the concatenation of:
6337 "2nd vec" and "2nd vec"
6338 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6339 | New vectors |
6341 So that now all vectors are READY:
6342 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6343 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6344 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6346 This algorithm is faster than one in vect_permute_load_chain if:
6347 1. "shift of a concatination" is faster than general permutation.
6348 This is usually so.
6349 2. The TARGET machine can't execute vector instructions in parallel.
6350 This is because each step of the algorithm depends on previous.
6351 The algorithm in vect_permute_load_chain is much more parallel.
6353 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6356 static bool
6357 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6358 unsigned int length,
6359 stmt_vec_info stmt_info,
6360 gimple_stmt_iterator *gsi,
6361 vec<tree> *result_chain)
6363 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6364 tree perm2_mask1, perm2_mask2, perm3_mask;
6365 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6366 gimple *perm_stmt;
6368 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6369 machine_mode vmode = TYPE_MODE (vectype);
6370 unsigned int i;
6371 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6373 unsigned HOST_WIDE_INT nelt, vf;
6374 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6375 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6376 /* Not supported for variable-length vectors. */
6377 return false;
6379 vec_perm_builder sel (nelt, nelt, 1);
6380 sel.quick_grow (nelt);
6382 result_chain->quick_grow (length);
6383 memcpy (result_chain->address (), dr_chain.address (),
6384 length * sizeof (tree));
6386 if (pow2p_hwi (length) && vf > 4)
6388 unsigned int j, log_length = exact_log2 (length);
6389 for (i = 0; i < nelt / 2; ++i)
6390 sel[i] = i * 2;
6391 for (i = 0; i < nelt / 2; ++i)
6392 sel[nelt / 2 + i] = i * 2 + 1;
6393 vec_perm_indices indices (sel, 2, nelt);
6394 if (!can_vec_perm_const_p (vmode, vmode, indices))
6396 if (dump_enabled_p ())
6397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398 "shuffle of 2 fields structure is not \
6399 supported by target\n");
6400 return false;
6402 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6404 for (i = 0; i < nelt / 2; ++i)
6405 sel[i] = i * 2 + 1;
6406 for (i = 0; i < nelt / 2; ++i)
6407 sel[nelt / 2 + i] = i * 2;
6408 indices.new_vector (sel, 2, nelt);
6409 if (!can_vec_perm_const_p (vmode, vmode, indices))
6411 if (dump_enabled_p ())
6412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6413 "shuffle of 2 fields structure is not \
6414 supported by target\n");
6415 return false;
6417 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6419 /* Generating permutation constant to shift all elements.
6420 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6421 for (i = 0; i < nelt; i++)
6422 sel[i] = nelt / 2 + i;
6423 indices.new_vector (sel, 2, nelt);
6424 if (!can_vec_perm_const_p (vmode, vmode, indices))
6426 if (dump_enabled_p ())
6427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 "shift permutation is not supported by target\n");
6429 return false;
6431 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6433 /* Generating permutation constant to select vector from 2.
6434 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6435 for (i = 0; i < nelt / 2; i++)
6436 sel[i] = i;
6437 for (i = nelt / 2; i < nelt; i++)
6438 sel[i] = nelt + i;
6439 indices.new_vector (sel, 2, nelt);
6440 if (!can_vec_perm_const_p (vmode, vmode, indices))
6442 if (dump_enabled_p ())
6443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6444 "select is not supported by target\n");
6445 return false;
6447 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6449 for (i = 0; i < log_length; i++)
6451 for (j = 0; j < length; j += 2)
6453 first_vect = dr_chain[j];
6454 second_vect = dr_chain[j + 1];
6456 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6457 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6458 first_vect, first_vect,
6459 perm2_mask1);
6460 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6461 vect[0] = data_ref;
6463 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6464 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6465 second_vect, second_vect,
6466 perm2_mask2);
6467 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6468 vect[1] = data_ref;
6470 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6471 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6472 vect[0], vect[1], shift1_mask);
6473 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6474 (*result_chain)[j/2 + length/2] = data_ref;
6476 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6477 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6478 vect[0], vect[1], select_mask);
6479 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6480 (*result_chain)[j/2] = data_ref;
6482 memcpy (dr_chain.address (), result_chain->address (),
6483 length * sizeof (tree));
6485 return true;
6487 if (length == 3 && vf > 2)
6489 unsigned int k = 0, l = 0;
6491 /* Generating permutation constant to get all elements in right order.
6492 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6493 for (i = 0; i < nelt; i++)
6495 if (3 * k + (l % 3) >= nelt)
6497 k = 0;
6498 l += (3 - (nelt % 3));
6500 sel[i] = 3 * k + (l % 3);
6501 k++;
6503 vec_perm_indices indices (sel, 2, nelt);
6504 if (!can_vec_perm_const_p (vmode, vmode, indices))
6506 if (dump_enabled_p ())
6507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6508 "shuffle of 3 fields structure is not \
6509 supported by target\n");
6510 return false;
6512 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6514 /* Generating permutation constant to shift all elements.
6515 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6516 for (i = 0; i < nelt; i++)
6517 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6518 indices.new_vector (sel, 2, nelt);
6519 if (!can_vec_perm_const_p (vmode, vmode, indices))
6521 if (dump_enabled_p ())
6522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6523 "shift permutation is not supported by target\n");
6524 return false;
6526 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6528 /* Generating permutation constant to shift all elements.
6529 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6530 for (i = 0; i < nelt; i++)
6531 sel[i] = 2 * (nelt / 3) + 1 + i;
6532 indices.new_vector (sel, 2, nelt);
6533 if (!can_vec_perm_const_p (vmode, vmode, indices))
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6537 "shift permutation is not supported by target\n");
6538 return false;
6540 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6542 /* Generating permutation constant to shift all elements.
6543 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6544 for (i = 0; i < nelt; i++)
6545 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6546 indices.new_vector (sel, 2, nelt);
6547 if (!can_vec_perm_const_p (vmode, vmode, indices))
6549 if (dump_enabled_p ())
6550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551 "shift permutation is not supported by target\n");
6552 return false;
6554 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6556 /* Generating permutation constant to shift all elements.
6557 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6558 for (i = 0; i < nelt; i++)
6559 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6560 indices.new_vector (sel, 2, nelt);
6561 if (!can_vec_perm_const_p (vmode, vmode, indices))
6563 if (dump_enabled_p ())
6564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6565 "shift permutation is not supported by target\n");
6566 return false;
6568 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6570 for (k = 0; k < 3; k++)
6572 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6573 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6574 dr_chain[k], dr_chain[k],
6575 perm3_mask);
6576 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6577 vect[k] = data_ref;
6580 for (k = 0; k < 3; k++)
6582 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6583 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6584 vect[k % 3], vect[(k + 1) % 3],
6585 shift1_mask);
6586 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6587 vect_shift[k] = data_ref;
6590 for (k = 0; k < 3; k++)
6592 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6593 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6594 vect_shift[(4 - k) % 3],
6595 vect_shift[(3 - k) % 3],
6596 shift2_mask);
6597 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6598 vect[k] = data_ref;
6601 (*result_chain)[3 - (nelt % 3)] = vect[2];
6603 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6604 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6605 vect[0], shift3_mask);
6606 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6607 (*result_chain)[nelt % 3] = data_ref;
6609 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6610 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6611 vect[1], shift4_mask);
6612 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6613 (*result_chain)[0] = data_ref;
6614 return true;
6616 return false;
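/* Illustrative sketch, not part of GCC: the LENGTH == 3 path above builds one
   gather mask and four "shift" masks whose concrete values for an 8-element
   vector are quoted in the comments.  The standalone program below recomputes
   them with the same formulas so the quoted constants can be checked; it uses
   no GCC APIs.

     #include <cstdio>

     int
     main ()
     {
       const unsigned nelt = 8;
       unsigned sel[nelt];

       // perm3_mask: every third element, giving {0 3 6 1 4 7 2 5}.
       unsigned k = 0, l = 0;
       for (unsigned i = 0; i < nelt; i++)
         {
           if (3 * k + (l % 3) >= nelt)
             {
               k = 0;
               l += 3 - (nelt % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
       for (unsigned i = 0; i < nelt; i++)
         printf ("%u ", sel[i]);                  // 0 3 6 1 4 7 2 5
       printf ("\n");

       // The shift masks are ramps; only their first element differs.
       printf ("shift1 %u\n", 2 * (nelt / 3) + (nelt % 3));        // 6
       printf ("shift2 %u\n", 2 * (nelt / 3) + 1);                 // 5
       printf ("shift3 %u\n", (nelt / 3) + (nelt % 3) / 2);        // 3
       printf ("shift4 %u\n", 2 * (nelt / 3) + (nelt % 3) / 2);    // 5
       return 0;
     }
   */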
6619 /* Function vect_transform_grouped_load.
6621 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6622 to perform their permutation and ascribe the result vectorized statements to
6623 the scalar statements.
6626 void
6627 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6628 vec<tree> dr_chain,
6629 int size, gimple_stmt_iterator *gsi)
6631 machine_mode mode;
6632 vec<tree> result_chain = vNULL;
6634 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6635 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6636 vectors, that are ready for vector computation. */
6637 result_chain.create (size);
6639 /* If the reassociation width for the vector type is 2 or greater, the target
6640 machine can execute 2 or more vector instructions in parallel. Otherwise
6641 try to get the chain for the load group using vect_shift_permute_load_chain. */
6642 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6643 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6644 || pow2p_hwi (size)
6645 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6646 gsi, &result_chain))
6647 vect_permute_load_chain (vinfo, dr_chain,
6648 size, stmt_info, gsi, &result_chain);
6649 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6650 result_chain.release ();
6653 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6654 generated as part of the vectorization of STMT_INFO. Assign the statement
6655 for each vector to the associated scalar statement. */
6657 void
6658 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6659 vec<tree> result_chain)
6661 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6662 unsigned int i, gap_count;
6663 tree tmp_data_ref;
6665 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6666 Since we scan the chain starting from its first node, their order
6667 corresponds to the order of data-refs in RESULT_CHAIN. */
6668 stmt_vec_info next_stmt_info = first_stmt_info;
6669 gap_count = 1;
6670 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6672 if (!next_stmt_info)
6673 break;
6675 /* Skip the gaps. Loads created for the gaps will be removed by dead
6676 code elimination pass later. No need to check for the first stmt in
6677 the group, since it always exists.
6678 DR_GROUP_GAP is the number of steps in elements from the previous
6679 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6680 correspond to the gaps. */
6681 if (next_stmt_info != first_stmt_info
6682 && gap_count < DR_GROUP_GAP (next_stmt_info))
6684 gap_count++;
6685 continue;
6688 /* ??? The following needs cleanup after the removal of
6689 DR_GROUP_SAME_DR_STMT. */
6690 if (next_stmt_info)
6692 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6693 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6694 copies, and we put the new vector statement last. */
6695 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6697 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6698 gap_count = 1;
6703 /* Function vect_can_force_dr_alignment_p.
6705 Returns whether the alignment of a DECL can be forced to be aligned
6706 on ALIGNMENT bit boundary. */
6708 bool
6709 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6711 if (!VAR_P (decl))
6712 return false;
6714 if (decl_in_symtab_p (decl)
6715 && !symtab_node::get (decl)->can_increase_alignment_p ())
6716 return false;
6718 if (TREE_STATIC (decl))
6719 return (known_le (alignment,
6720 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6721 else
6722 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6725 /* Return whether the data reference DR_INFO is supported with respect to its
6726 alignment.
6727 VECTYPE is the vector type used for the access and MISALIGNMENT its known
6728 misalignment (DR_MISALIGNMENT_UNKNOWN if unknown), i.e. check whether it is
6729 possible to vectorize the access with that misalignment. */
6731 enum dr_alignment_support
6732 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6733 tree vectype, int misalignment)
6735 data_reference *dr = dr_info->dr;
6736 stmt_vec_info stmt_info = dr_info->stmt;
6737 machine_mode mode = TYPE_MODE (vectype);
6738 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6739 class loop *vect_loop = NULL;
6740 bool nested_in_vect_loop = false;
6742 if (misalignment == 0)
6743 return dr_aligned;
6745 /* For now assume all conditional loads/stores support unaligned
6746 access without any special code. */
6747 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6748 if (gimple_call_internal_p (stmt)
6749 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6750 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6751 return dr_unaligned_supported;
6753 if (loop_vinfo)
6755 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6756 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6759 /* Possibly unaligned access. */
6761 /* We can choose between using the implicit realignment scheme (generating
6762 a misaligned_move stmt) and the explicit realignment scheme (generating
6763 aligned loads with a REALIGN_LOAD). There are two variants to the
6764 explicit realignment scheme: optimized, and unoptimized.
6765 We can optimize the realignment only if the step between consecutive
6766 vector loads is equal to the vector size. Since the vector memory
6767 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6768 is guaranteed that the misalignment amount remains the same throughout the
6769 execution of the vectorized loop. Therefore, we can create the
6770 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6771 at the loop preheader.
6773 However, in the case of outer-loop vectorization, when vectorizing a
6774 memory access in the inner-loop nested within the LOOP that is now being
6775 vectorized, while it is guaranteed that the misalignment of the
6776 vectorized memory access will remain the same in different outer-loop
6777 iterations, it is *not* guaranteed that it will remain the same throughout
6778 the execution of the inner-loop. This is because the inner-loop advances
6779 with the original scalar step (and not in steps of VS). If the inner-loop
6780 step happens to be a multiple of VS, then the misalignment remains fixed
6781 and we can use the optimized realignment scheme. For example:
6783 for (i=0; i<N; i++)
6784 for (j=0; j<M; j++)
6785 s += a[i+j];
6787 When vectorizing the i-loop in the above example, the step between
6788 consecutive vector loads is 1, and so the misalignment does not remain
6789 fixed across the execution of the inner-loop, and the realignment cannot
6790 be optimized (as illustrated in the following pseudo vectorized loop):
6792 for (i=0; i<N; i+=4)
6793 for (j=0; j<M; j++){
6794 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6795 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6796 // (assuming that we start from an aligned address).
6799 We therefore have to use the unoptimized realignment scheme:
6801 for (i=0; i<N; i+=4)
6802 for (j=k; j<M; j+=4)
6803 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6804 // that the misalignment of the initial address is
6805 // 0).
6807 The loop can then be vectorized as follows:
6809 for (k=0; k<4; k++){
6810 rt = get_realignment_token (&vp[k]);
6811 for (i=0; i<N; i+=4){
6812 v1 = vp[i+k];
6813 for (j=k; j<M; j+=4){
6814 v2 = vp[i+j+VS-1];
6815 va = REALIGN_LOAD <v1,v2,rt>;
6816 vs += va;
6817 v1 = v2;
6820 } */
6822 if (DR_IS_READ (dr))
6824 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6825 && (!targetm.vectorize.builtin_mask_for_load
6826 || targetm.vectorize.builtin_mask_for_load ()))
6828 /* If we are doing SLP then the accesses need not have the
6829 same alignment, instead it depends on the SLP group size. */
6830 if (loop_vinfo
6831 && STMT_SLP_TYPE (stmt_info)
6832 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
6833 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6834 * (DR_GROUP_SIZE
6835 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6836 TYPE_VECTOR_SUBPARTS (vectype))))
6837 ;
6838 else if (!loop_vinfo
6839 || (nested_in_vect_loop
6840 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6841 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6842 return dr_explicit_realign;
6843 else
6844 return dr_explicit_realign_optimized;
6848 bool is_packed = false;
6849 tree type = TREE_TYPE (DR_REF (dr));
6850 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6851 is_packed = not_size_aligned (DR_REF (dr));
6852 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6853 is_packed))
6854 return dr_unaligned_supported;
6856 /* Unsupported. */
6857 return dr_unaligned_unsupported;
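/* Illustrative sketch, not part of GCC: the comment above argues that for an
   access like a[i+j] in an inner j-loop the misalignment changes every inner
   iteration, while stepping the outer i-loop by a whole vector keeps the
   pattern fixed across outer iterations.  The standalone program below just
   prints the element misalignment (i + j) % VS for an assumed 4-element
   vector, reproducing the {0,1,2,3,0,1,2,3,...} pattern from the pseudo code.

     #include <cstdio>

     int
     main ()
     {
       const int VS = 4;
       for (int i = 0; i < 8; i += VS)       // outer, vectorized loop
         {
           for (int j = 0; j < 8; j++)       // inner loop, scalar step
             printf ("%d ", (i + j) % VS);   // misalignment of &a[i + j]
           printf ("\n");                    // each row: 0 1 2 3 0 1 2 3
         }
       return 0;
     }

   Within each row the misalignment cycles, which is why the optimized scheme
   (realignment token computed once in the preheader) cannot be used there and
   dr_explicit_realign is chosen instead.  */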