gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
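/* For example, a caller such as vect_load_lanes_supported might ask
     vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
                                   vectype, 3)
   for a V4SI vectype on a 128-bit vector target; this checks whether the
   target can load three interleaved V4SI vectors from one 48-byte array
   in a single operation (AArch64's LD3 is one such instruction).  The
   mode and count here are only illustrative.  */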
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn))
155 /* For loads the LHS type does the trick. */
156 i = ~0U;
157 else if (internal_store_fn_p (ifn))
159 /* For stores use the type of the stored value. */
160 i = internal_fn_stored_value_index (ifn);
161 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
162 i = ~0U;
164 else if (internal_fn_mask_index (ifn) == 0)
165 i = 1;
167 if (i < gimple_call_num_args (call))
169 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
170 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
172 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
173 if (rhs < lhs)
174 scalar_type = rhs_type;
179 return scalar_type;
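/* As an illustration of the call handling above: for an internal store
   such as
     .MASK_STORE (ptr, align, mask, short_val);
   the type of the stored value short_val is what matters, whereas for
   .MASK_LOAD the lhs type alone is used; in either case the narrowest
   scalar type accessed is what ends up determining the VF.  */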
183 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
184 tested at run-time. Return TRUE if DDR was successfully inserted.
185 Return false if versioning is not supported. */
187 static opt_result
188 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
190 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
192 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
193 return opt_result::failure_at (vect_location,
194 "will not create alias checks, as"
195 " --param vect-max-version-for-alias-checks"
196 " == 0\n");
198 opt_result res
199 = runtime_alias_check_p (ddr, loop,
200 optimize_loop_nest_for_speed_p (loop));
201 if (!res)
202 return res;
204 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
205 return opt_result::success ();
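/* For example, compiling with
     --param vect-max-version-for-alias-checks=0
   makes this function fail for every candidate DDR, so no alias
   versioning is ever created and the callers have to give up on any
   dependence they cannot prove safe at compile time.  */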
208 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
210 static void
211 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
213 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
214 for (unsigned int i = 0; i < checks.length(); ++i)
215 if (checks[i] == value)
216 return;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location,
220 "need run-time check that %T is nonzero\n",
221 value);
222 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 /* Return true if we know that the order of vectorized DR_INFO_A and
226 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
227 DR_INFO_B. At least one of the accesses is a write. */
229 static bool
230 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
232 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
233 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
235 /* Single statements are always kept in their original order. */
236 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
237 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
238 return true;
240 /* STMT_A and STMT_B belong to overlapping groups. All loads are
241 emitted at the position of the first scalar load.
242 Stores in a group are emitted at the position of the last scalar store.
243 Compute that position and check whether the resulting order matches
244 the current one. */
245 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (il_a)
248 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 il_a = get_later_stmt (il_a, s);
252 else /* DR_IS_READ */
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
258 else
259 il_a = stmtinfo_a;
260 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
261 if (il_b)
263 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
264 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
265 s = DR_GROUP_NEXT_ELEMENT (s))
266 il_b = get_later_stmt (il_b, s);
267 else /* DR_IS_READ */
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 if (get_later_stmt (il_b, s) == il_b)
271 il_b = s;
273 else
274 il_b = stmtinfo_b;
275 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
276 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
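/* For example, for the scalar sequence
     ... = a[i];
     ... = a[i+1];
     a[i] = ...;
     a[i+1] = ...;
   the vector load is emitted at the first scalar load and the vector
   store at the last scalar store, so every load still precedes every
   store just as in the scalar code and the function returns true for
   each load/store pair.  */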
279 /* A subroutine of vect_analyze_data_ref_dependence. Handle
280 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
281 distances. These distances are conservatively correct but they don't
282 reflect a guaranteed dependence.
284 Return true if this function does all the work necessary to avoid
285 an alias or false if the caller should use the dependence distances
286 to limit the vectorization factor in the usual way. LOOP_DEPTH is
287 the depth of the loop described by LOOP_VINFO and the other arguments
288 are as for vect_analyze_data_ref_dependence. */
290 static bool
291 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 int loop_depth, unsigned int *max_vf)
295 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
296 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
298 int dist = dist_v[loop_depth];
299 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
301 /* If the user asserted safelen >= DIST consecutive iterations
302 can be executed concurrently, assume independence.
304 ??? An alternative would be to add the alias check even
305 in this case, and vectorize the fallback loop with the
306 maximum VF set to safelen. However, if the user has
307 explicitly given a length, it's less likely that that
308 would be a win. */
309 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
311 if ((unsigned int) loop->safelen < *max_vf)
312 *max_vf = loop->safelen;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
314 continue;
317 /* For dependence distances of 2 or more, we have the option
318 of limiting VF or checking for an alias at runtime.
319 Prefer to check at runtime if we can, to avoid limiting
320 the VF unnecessarily when the bases are in fact independent.
322 Note that the alias checks will be removed if the VF ends up
323 being small enough. */
324 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
325 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
326 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
327 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
328 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 return true;
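/* For example, if the user wrote
     #pragma omp simd safelen(8)
   and the conservative dependence distance here is 4, the distance is
   ignored: up to 8 consecutive iterations may run concurrently, so
   *max_vf is at most clamped to 8 instead of being limited to 4.  */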
335 /* Function vect_analyze_data_ref_dependence.
337 FIXME: I needed to change the sense of the returned flag.
339 Return FALSE if there (might) exist a dependence between a memory-reference
340 DRA and a memory-reference DRB. When versioning for alias may check a
341 dependence at run-time, return TRUE. Adjust *MAX_VF according to
342 the data dependence. */
344 static opt_result
345 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
346 loop_vec_info loop_vinfo,
347 unsigned int *max_vf)
349 unsigned int i;
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 struct data_reference *dra = DDR_A (ddr);
352 struct data_reference *drb = DDR_B (ddr);
353 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
354 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
355 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
356 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
357 lambda_vector dist_v;
358 unsigned int loop_depth;
360 /* If user asserted safelen consecutive iterations can be
361 executed concurrently, assume independence. */
362 auto apply_safelen = [&]()
364 if (loop->safelen >= 2)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 return true;
371 return false;
374 /* In loop analysis all data references should be vectorizable. */
375 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
376 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
377 gcc_unreachable ();
379 /* Independent data accesses. */
380 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
381 return opt_result::success ();
383 if (dra == drb
384 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
385 return opt_result::success ();
387 /* We do not have to consider dependences between accesses that belong
388 to the same group, unless the stride could be smaller than the
389 group size. */
390 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
391 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
393 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
394 return opt_result::success ();
396 /* Even if we have an anti-dependence then, as the vectorized loop covers at
397 least two scalar iterations, there is always also a true dependence.
398 As the vectorizer does not re-order loads and stores we can ignore
399 the anti-dependence if TBAA can disambiguate both DRs similar to the
400 case with known negative distance anti-dependences (positive
401 distance anti-dependences would violate TBAA constraints). */
402 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
403 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
404 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
405 get_alias_set (DR_REF (drb))))
406 return opt_result::success ();
408 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
409 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 if (apply_safelen ())
412 return opt_result::success ();
414 return opt_result::failure_at
415 (stmtinfo_a->stmt,
416 "possible alias involving gather/scatter between %T and %T\n",
417 DR_REF (dra), DR_REF (drb));
420 /* Unknown data dependence. */
421 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
423 if (apply_safelen ())
424 return opt_result::success ();
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
428 "versioning for alias required: "
429 "can't determine dependence between %T and %T\n",
430 DR_REF (dra), DR_REF (drb));
432 /* Add to list of ddrs that need to be tested at run-time. */
433 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 /* Known data dependence. */
437 if (DDR_NUM_DIST_VECTS (ddr) == 0)
439 if (apply_safelen ())
440 return opt_result::success ();
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
444 "versioning for alias required: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
447 /* Add to list of ddrs that need to be tested at run-time. */
448 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
453 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
454 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
455 loop_depth, max_vf))
456 return opt_result::success ();
458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
460 int dist = dist_v[loop_depth];
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "dependence distance = %d.\n", dist);
466 if (dist == 0)
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance == 0 between %T and %T\n",
471 DR_REF (dra), DR_REF (drb));
473 /* When we perform grouped accesses and perform implicit CSE
474 by detecting equal accesses and doing disambiguation with
475 runtime alias tests like for
476 .. = a[i];
477 .. = a[i+1];
478 a[i] = ..;
479 a[i+1] = ..;
480 *p = ..;
481 .. = a[i];
482 .. = a[i+1];
483 where we will end up loading { a[i], a[i+1] } once, make
484 sure that inserting group loads before the first load and
485 stores after the last store will do the right thing.
486 Similar for groups like
487 a[i] = ...;
488 ... = a[i];
489 a[i+1] = ...;
490 where loads from the group interleave with the store. */
491 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
492 return opt_result::failure_at (stmtinfo_a->stmt,
493 "READ_WRITE dependence"
494 " in interleaving.\n");
496 if (loop->safelen < 2)
498 tree indicator = dr_zero_step_indicator (dra);
499 if (!indicator || integer_zerop (indicator))
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "access also has a zero step\n");
502 else if (TREE_CODE (indicator) != INTEGER_CST)
503 vect_check_nonzero_value (loop_vinfo, indicator);
505 continue;
508 if (dist > 0 && DDR_REVERSED_P (ddr))
510 /* If DDR_REVERSED_P the order of the data-refs in DDR was
511 reversed (to make distance vector positive), and the actual
512 distance is negative. */
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance negative.\n");
516 /* When doing outer loop vectorization, we need to check if there is
517 a backward dependence at the inner loop level if the dependence
518 at the outer loop is reversed. See PR81740. */
519 if (nested_in_vect_loop_p (loop, stmtinfo_a)
520 || nested_in_vect_loop_p (loop, stmtinfo_b))
522 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
523 DDR_LOOP_NEST (ddr));
524 if (dist_v[inner_depth] < 0)
525 return opt_result::failure_at (stmtinfo_a->stmt,
526 "not vectorized, dependence "
527 "between data-refs %T and %T\n",
528 DR_REF (dra), DR_REF (drb));
530 /* Record a negative dependence distance to later limit the
531 amount of stmt copying / unrolling we can perform.
532 Only need to handle read-after-write dependence. */
533 if (DR_IS_READ (drb)
534 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
535 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
536 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
537 continue;
540 unsigned int abs_dist = abs (dist);
541 if (abs_dist >= 2 && abs_dist < *max_vf)
543 /* The dependence distance requires reduction of the maximal
544 vectorization factor. */
545 *max_vf = abs_dist;
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location,
548 "adjusting maximal vectorization factor to %i\n",
549 *max_vf);
552 if (abs_dist >= *max_vf)
554 /* Dependence distance does not create dependence, as far as
555 vectorization is concerned, in this case. */
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "dependence distance >= VF.\n");
559 continue;
562 return opt_result::failure_at (stmtinfo_a->stmt,
563 "not vectorized, possible dependence "
564 "between data-refs %T and %T\n",
565 DR_REF (dra), DR_REF (drb));
568 return opt_result::success ();
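/* To illustrate the distance handling above: for
     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;
   the read-after-write distance is 3, so *max_vf is reduced to 3 when it
   would otherwise be larger and vectorization proceeds with at most three
   iterations per vector, while a distance of 0 (the same element read and
   written in one iteration) only needs the scalar-order check above.  */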
571 /* Function vect_analyze_data_ref_dependences.
573 Examine all the data references in the loop, and make sure there do not
574 exist any data dependences between them. Set *MAX_VF according to
575 the maximum vectorization factor the data dependences allow. */
577 opt_result
578 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
579 unsigned int *max_vf)
581 unsigned int i;
582 struct data_dependence_relation *ddr;
584 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
586 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
588 LOOP_VINFO_DDRS (loop_vinfo)
589 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
590 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
591 /* We do not need read-read dependences. */
592 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
593 &LOOP_VINFO_DDRS (loop_vinfo),
594 LOOP_VINFO_LOOP_NEST (loop_vinfo),
595 false);
596 gcc_assert (res);
599 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
601 /* For epilogues we either have no aliases or alias versioning
602 was applied to original loop. Therefore we may just get max_vf
603 using VF of original loop. */
604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
605 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
606 else
607 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
609 opt_result res
610 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
611 if (!res)
612 return res;
615 return opt_result::success ();
619 /* Function vect_slp_analyze_data_ref_dependence.
621 Return TRUE if there (might) exist a dependence between a memory-reference
622 DRA and a memory-reference DRB for VINFO. When versioning for alias
623 may check a dependence at run-time, return FALSE. */
626 static bool
627 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
628 struct data_dependence_relation *ddr)
630 struct data_reference *dra = DDR_A (ddr);
631 struct data_reference *drb = DDR_B (ddr);
632 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
633 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
635 /* We need to check dependences of statements marked as unvectorizable
636 as well, they still can prohibit vectorization. */
638 /* Independent data accesses. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
640 return false;
642 if (dra == drb)
643 return false;
645 /* Read-read is OK. */
646 if (DR_IS_READ (dra) && DR_IS_READ (drb))
647 return false;
649 /* If dra and drb are part of the same interleaving chain consider
650 them independent. */
651 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
652 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
653 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
654 return false;
656 /* Unknown data dependence. */
657 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
661 "can't determine dependence between %T and %T\n",
662 DR_REF (dra), DR_REF (drb));
664 else if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "determined dependence between %T and %T\n",
667 DR_REF (dra), DR_REF (drb));
669 return true;
673 /* Analyze dependences involved in the transform of SLP NODE. STORES
674 contain the vector of scalar stores of this instance if we are
675 disambiguating the loads. */
677 static bool
678 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
679 vec<stmt_vec_info> stores,
680 stmt_vec_info last_store_info)
682 /* This walks over all stmts involved in the SLP load/store done
683 in NODE verifying we can sink them up to the last stmt in the
684 group. */
685 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
687 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
688 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
690 stmt_vec_info access_info
691 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
692 if (access_info == last_access_info)
693 continue;
694 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
695 ao_ref ref;
696 bool ref_initialized_p = false;
697 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
698 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
700 gimple *stmt = gsi_stmt (gsi);
701 if (! gimple_vuse (stmt))
702 continue;
704 /* If we couldn't record a (single) data reference for this
705 stmt we have to resort to the alias oracle. */
706 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
707 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
708 if (!dr_b)
710 /* We are moving a store - this means
711 we cannot use TBAA for disambiguation. */
712 if (!ref_initialized_p)
713 ao_ref_init (&ref, DR_REF (dr_a));
714 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
715 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
716 return false;
717 continue;
720 bool dependent = false;
721 /* If we run into a store of this same instance (we've just
722 marked those) then delay dependence checking until we run
723 into the last store because this is where it will have
724 been sunk to (and we verify if we can do that as well). */
725 if (gimple_visited_p (stmt))
727 if (stmt_info != last_store_info)
728 continue;
730 for (stmt_vec_info &store_info : stores)
732 data_reference *store_dr
733 = STMT_VINFO_DATA_REF (store_info);
734 ddr_p ddr = initialize_data_dependence_relation
735 (dr_a, store_dr, vNULL);
736 dependent
737 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
738 free_dependence_relation (ddr);
739 if (dependent)
740 break;
743 else
745 ddr_p ddr = initialize_data_dependence_relation (dr_a,
746 dr_b, vNULL);
747 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
748 free_dependence_relation (ddr);
750 if (dependent)
751 return false;
755 else /* DR_IS_READ */
757 stmt_vec_info first_access_info
758 = vect_find_first_scalar_stmt_in_slp (node);
759 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
761 stmt_vec_info access_info
762 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
763 if (access_info == first_access_info)
764 continue;
765 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
766 ao_ref ref;
767 bool ref_initialized_p = false;
768 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
769 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
771 gimple *stmt = gsi_stmt (gsi);
772 if (! gimple_vdef (stmt))
773 continue;
775 /* If we couldn't record a (single) data reference for this
776 stmt we have to resort to the alias oracle. */
777 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
778 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
780 /* We are hoisting a load - this means we can use
781 TBAA for disambiguation. */
782 if (!ref_initialized_p)
783 ao_ref_init (&ref, DR_REF (dr_a));
784 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
786 if (!dr_b)
787 return false;
788 /* Resort to dependence checking below. */
790 else
791 /* No dependence. */
792 continue;
794 bool dependent = false;
795 /* If we run into a store of this same instance (we've just
796 marked those) then delay dependence checking until we run
797 into the last store because this is where it will have
798 been sunk to (and we verify if we can do that as well). */
799 if (gimple_visited_p (stmt))
801 if (stmt_info != last_store_info)
802 continue;
804 for (stmt_vec_info &store_info : stores)
806 data_reference *store_dr
807 = STMT_VINFO_DATA_REF (store_info);
808 ddr_p ddr = initialize_data_dependence_relation
809 (dr_a, store_dr, vNULL);
810 dependent
811 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
812 free_dependence_relation (ddr);
813 if (dependent)
814 break;
817 else
819 ddr_p ddr = initialize_data_dependence_relation (dr_a,
820 dr_b, vNULL);
821 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
822 free_dependence_relation (ddr);
824 if (dependent)
825 return false;
829 return true;
833 /* Function vect_slp_analyze_instance_dependence.
835 Examine all the data references in the SLP instance, and make sure there
836 do not exist any data dependences between them that would prevent the
837 instance from being vectorized. */
839 bool
840 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
842 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
844 /* The stores of this instance are at the root of the SLP tree. */
845 slp_tree store = NULL;
846 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
847 store = SLP_INSTANCE_TREE (instance);
849 /* Verify we can sink stores to the vectorized stmt insert location. */
850 stmt_vec_info last_store_info = NULL;
851 if (store)
853 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
854 return false;
856 /* Mark stores in this instance and remember the last one. */
857 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
858 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
859 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
862 bool res = true;
864 /* Verify we can sink loads to the vectorized stmt insert location,
865 special-casing stores of this instance. */
866 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
867 if (! vect_slp_analyze_node_dependences (vinfo, load,
868 store
869 ? SLP_TREE_SCALAR_STMTS (store)
870 : vNULL, last_store_info))
872 res = false;
873 break;
876 /* Unset the visited flag. */
877 if (store)
878 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
879 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
881 return res;
884 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
885 applied. */
888 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
890 HOST_WIDE_INT diff = 0;
891 /* Alignment is only analyzed for the first element of a DR group,
892 use that but adjust misalignment by the offset of the access. */
893 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
895 dr_vec_info *first_dr
896 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
897 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
898 INTEGER_CSTs and the first element in the group has the lowest
899 address. */
900 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
901 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
902 gcc_assert (diff >= 0);
903 dr_info = first_dr;
906 int misalign = dr_info->misalignment;
907 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
908 if (misalign == DR_MISALIGNMENT_UNKNOWN)
909 return misalign;
911 /* If the access is only aligned for a vector type with smaller alignment
912 requirement the access has unknown misalignment. */
913 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
914 targetm.vectorize.preferred_vector_alignment (vectype)))
915 return DR_MISALIGNMENT_UNKNOWN;
917 /* Apply the offset from the DR group start and the externally supplied
918 offset which can for example result from a negative stride access. */
919 poly_int64 misalignment = misalign + diff + offset;
921 /* vect_compute_data_ref_alignment will have ensured that target_alignment
922 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
923 unsigned HOST_WIDE_INT target_alignment_c
924 = dr_info->target_alignment.to_constant ();
925 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
926 return DR_MISALIGNMENT_UNKNOWN;
927 return misalign;
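/* For example, with a 16-byte target alignment, if the first element of
   the group has misalignment 4 and DR_INFO accesses the element 8 bytes
   further on, the value returned is (4 + 8) % 16 == 12; an OFFSET of
   -12, as supplied for a negative-step access, would bring this back to
   (4 + 8 - 12) % 16 == 0.  */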
930 /* Record the base alignment guarantee given by DRB, which occurs
931 in STMT_INFO. */
933 static void
934 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
935 innermost_loop_behavior *drb)
937 bool existed;
938 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
939 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
940 if (!existed || entry.second->base_alignment < drb->base_alignment)
942 entry = std::make_pair (stmt_info, drb);
943 if (dump_enabled_p ())
944 dump_printf_loc (MSG_NOTE, vect_location,
945 "recording new base alignment for %T\n"
946 " alignment: %d\n"
947 " misalignment: %d\n"
948 " based on: %G",
949 drb->base_address,
950 drb->base_alignment,
951 drb->base_misalignment,
952 stmt_info->stmt);
956 /* If the region we're going to vectorize is reached, all unconditional
957 data references occur at least once. We can therefore pool the base
958 alignment guarantees from each unconditional reference. Do this by
959 going through all the data references in VINFO and checking whether
960 the containing statement makes the reference unconditionally. If so,
961 record the alignment of the base address in VINFO so that it can be
962 used for all other references with the same base. */
964 void
965 vect_record_base_alignments (vec_info *vinfo)
967 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
968 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
969 for (data_reference *dr : vinfo->shared->datarefs)
971 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
972 stmt_vec_info stmt_info = dr_info->stmt;
973 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
974 && STMT_VINFO_VECTORIZABLE (stmt_info)
975 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
977 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
979 /* If DR is nested in the loop that is being vectorized, we can also
980 record the alignment of the base wrt the outer loop. */
981 if (loop && nested_in_vect_loop_p (loop, stmt_info))
982 vect_record_base_alignment
983 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
988 /* Function vect_compute_data_ref_alignment
990 Compute the misalignment of the data reference DR_INFO when vectorizing
991 with VECTYPE.
993 Output:
994 1. initialized misalignment info for DR_INFO
996 FOR NOW: No analysis is actually performed. Misalignment is calculated
997 only for trivial cases. TODO. */
999 static void
1000 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1001 tree vectype)
1003 stmt_vec_info stmt_info = dr_info->stmt;
1004 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1005 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1006 class loop *loop = NULL;
1007 tree ref = DR_REF (dr_info->dr);
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_NOTE, vect_location,
1011 "vect_compute_data_ref_alignment:\n");
1013 if (loop_vinfo)
1014 loop = LOOP_VINFO_LOOP (loop_vinfo);
1016 /* Initialize misalignment to unknown. */
1017 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1019 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1020 return;
1022 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1023 bool step_preserves_misalignment_p;
1025 poly_uint64 vector_alignment
1026 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1027 BITS_PER_UNIT);
1028 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1030 /* If the main loop has peeled for alignment we have no way of knowing
1031 whether the data accesses in the epilogues are aligned. We can't at
1032 compile time answer the question whether we have entered the main loop or
1033 not. Fixes PR 92351. */
1034 if (loop_vinfo)
1036 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1037 if (orig_loop_vinfo
1038 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1039 return;
1042 unsigned HOST_WIDE_INT vect_align_c;
1043 if (!vector_alignment.is_constant (&vect_align_c))
1044 return;
1046 /* No step for BB vectorization. */
1047 if (!loop)
1049 gcc_assert (integer_zerop (drb->step));
1050 step_preserves_misalignment_p = true;
1053 /* In case the dataref is in an inner-loop of the loop that is being
1054 vectorized (LOOP), we use the base and misalignment information
1055 relative to the outer-loop (LOOP). This is ok only if the misalignment
1056 stays the same throughout the execution of the inner-loop, which is why
1057 we have to check that the stride of the dataref in the inner-loop evenly
1058 divides by the vector alignment. */
1059 else if (nested_in_vect_loop_p (loop, stmt_info))
1061 step_preserves_misalignment_p
1062 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1064 if (dump_enabled_p ())
1066 if (step_preserves_misalignment_p)
1067 dump_printf_loc (MSG_NOTE, vect_location,
1068 "inner step divides the vector alignment.\n");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "inner step doesn't divide the vector"
1072 " alignment.\n");
1076 /* Similarly we can only use base and misalignment information relative to
1077 an innermost loop if the misalignment stays the same throughout the
1078 execution of the loop. As above, this is the case if the stride of
1079 the dataref evenly divides by the alignment. */
1080 else
1082 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1083 step_preserves_misalignment_p
1084 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1086 if (!step_preserves_misalignment_p && dump_enabled_p ())
1087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1088 "step doesn't divide the vector alignment.\n");
1091 unsigned int base_alignment = drb->base_alignment;
1092 unsigned int base_misalignment = drb->base_misalignment;
1094 /* Calculate the maximum of the pooled base address alignment and the
1095 alignment that we can compute for DR itself. */
1096 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1097 = base_alignments->get (drb->base_address);
1098 if (entry
1099 && base_alignment < (*entry).second->base_alignment
1100 && (loop_vinfo
1101 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1102 gimple_bb (entry->first->stmt))
1103 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1104 || (entry->first->dr_aux.group <= dr_info->group)))))
1106 base_alignment = entry->second->base_alignment;
1107 base_misalignment = entry->second->base_misalignment;
1110 if (drb->offset_alignment < vect_align_c
1111 || !step_preserves_misalignment_p
1112 /* We need to know whether the step wrt the vectorized loop is
1113 negative when computing the starting misalignment below. */
1114 || TREE_CODE (drb->step) != INTEGER_CST)
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Unknown alignment for access: %T\n", ref);
1119 return;
1122 if (base_alignment < vect_align_c)
1124 unsigned int max_alignment;
1125 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1126 if (max_alignment < vect_align_c
1127 || !vect_can_force_dr_alignment_p (base,
1128 vect_align_c * BITS_PER_UNIT))
1130 if (dump_enabled_p ())
1131 dump_printf_loc (MSG_NOTE, vect_location,
1132 "can't force alignment of ref: %T\n", ref);
1133 return;
1136 /* Force the alignment of the decl.
1137 NOTE: This is the only change to the code we make during
1138 the analysis phase, before deciding to vectorize the loop. */
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_NOTE, vect_location,
1141 "force alignment of %T\n", ref);
1143 dr_info->base_decl = base;
1144 dr_info->base_misaligned = true;
1145 base_misalignment = 0;
1147 poly_int64 misalignment
1148 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1150 unsigned int const_misalignment;
1151 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "Non-constant misalignment for access: %T\n", ref);
1156 return;
1159 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163 "misalign = %d bytes of ref %T\n",
1164 const_misalignment, ref);
1166 return;
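/* For example, if the base address is known to be 16-byte aligned with
   base_misalignment 0 and DR_INIT is the constant 12, the recorded
   misalignment against a 16-byte vector alignment is (0 + 12) % 16 == 12.
   If the base were only 8-byte aligned and its alignment could not be
   forced, the misalignment would simply stay unknown.  */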
1169 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1170 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1171 is made aligned via peeling. */
1173 static bool
1174 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1175 dr_vec_info *dr_peel_info)
1177 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1178 DR_TARGET_ALIGNMENT (dr_info)))
1180 poly_offset_int diff
1181 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1182 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1183 if (known_eq (diff, 0)
1184 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1185 return true;
1187 return false;
1190 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1191 aligned via peeling. */
1193 static bool
1194 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1195 dr_vec_info *dr_peel_info)
1197 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1198 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1199 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1200 DR_OFFSET (dr_peel_info->dr), 0)
1201 || !operand_equal_p (DR_STEP (dr_info->dr),
1202 DR_STEP (dr_peel_info->dr), 0))
1203 return false;
1205 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 /* Compute the value for dr_info->misalign so that the access appears
1209 aligned. This is used by peeling to compensate for dr_misalignment
1210 applying the offset for negative step. */
1213 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1215 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1216 return 0;
1218 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1219 poly_int64 misalignment
1220 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1221 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1223 unsigned HOST_WIDE_INT target_alignment_c;
1224 int misalign;
1225 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1226 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1227 return DR_MISALIGNMENT_UNKNOWN;
1228 return misalign;
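/* For example, for a negative-step access with a V4SI vector type and a
   16-byte target alignment this returns (4 - 1) * 4 == 12; combined with
   the -12 offset that dr_misalignment applies for such accesses, the
   peeled reference then reports a misalignment of 0.  */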
1231 /* Function vect_update_misalignment_for_peel.
1232 Sets DR_INFO's misalignment
1233 - to 0 if it has the same alignment as DR_PEEL_INFO,
1234 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1235 - to -1 (unknown) otherwise.
1237 DR_INFO - the data reference whose misalignment is to be adjusted.
1238 DR_PEEL_INFO - the data reference whose misalignment is being made
1239 zero in the vector loop by the peel.
1240 NPEEL - the number of iterations in the peel loop if the misalignment
1241 of DR_PEEL_INFO is known at compile time. */
1243 static void
1244 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1245 dr_vec_info *dr_peel_info, int npeel)
1247 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1248 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1250 SET_DR_MISALIGNMENT (dr_info,
1251 vect_dr_misalign_for_aligned_access (dr_peel_info));
1252 return;
1255 unsigned HOST_WIDE_INT alignment;
1256 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1257 && known_alignment_for_access_p (dr_info,
1258 STMT_VINFO_VECTYPE (dr_info->stmt))
1259 && known_alignment_for_access_p (dr_peel_info,
1260 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1262 int misal = dr_info->misalignment;
1263 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1264 misal &= alignment - 1;
1265 set_dr_misalignment (dr_info, misal);
1266 return;
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1271 "to unknown (-1).\n");
1272 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
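/* For example, a DR with known misalignment 8, a 4-byte DR_STEP and a
   constant 16-byte target alignment that is peeled by NPEEL == 2
   iterations ends up with misalignment (8 + 2 * 4) & 15 == 0; when any
   of those quantities is unknown the misalignment is reset to -1 as
   above.  */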
1275 /* Return true if alignment is relevant for DR_INFO. */
1277 static bool
1278 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1280 stmt_vec_info stmt_info = dr_info->stmt;
1282 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1283 return false;
1285 /* For interleaving, only the alignment of the first access matters. */
1286 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1287 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1288 return false;
1290 /* Scatter-gather and invariant accesses continue to address individual
1291 scalars, so vector-level alignment is irrelevant. */
1292 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1293 || integer_zerop (DR_STEP (dr_info->dr)))
1294 return false;
1296 /* Strided accesses perform only component accesses, alignment is
1297 irrelevant for them. */
1298 if (STMT_VINFO_STRIDED_P (stmt_info)
1299 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1300 return false;
1302 return true;
1305 /* Given a memory reference EXP return whether its alignment is less
1306 than its size. */
1308 static bool
1309 not_size_aligned (tree exp)
1311 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1312 return true;
1314 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1315 > get_object_alignment (exp));
1318 /* Function vector_alignment_reachable_p
1320 Return true if vector alignment for DR_INFO is reachable by peeling
1321 a few loop iterations. Return false otherwise. */
1323 static bool
1324 vector_alignment_reachable_p (dr_vec_info *dr_info)
1326 stmt_vec_info stmt_info = dr_info->stmt;
1327 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1329 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1331 /* For interleaved access we peel only if number of iterations in
1332 the prolog loop ({VF - misalignment}), is a multiple of the
1333 number of the interleaved accesses. */
1334 int elem_size, mis_in_elements;
1336 /* FORNOW: handle only known alignment. */
1337 if (!known_alignment_for_access_p (dr_info, vectype))
1338 return false;
1340 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1341 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1342 elem_size = vector_element_size (vector_size, nelements);
1343 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1345 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1346 return false;
1349 /* If misalignment is known at the compile time then allow peeling
1350 only if natural alignment is reachable through peeling. */
1351 if (known_alignment_for_access_p (dr_info, vectype)
1352 && !aligned_access_p (dr_info, vectype))
1354 HOST_WIDE_INT elmsize =
1355 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1356 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_NOTE, vect_location,
1359 "data size = %wd. misalignment = %d.\n", elmsize,
1360 dr_misalignment (dr_info, vectype));
1362 if (dr_misalignment (dr_info, vectype) % elmsize)
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 "data size does not divide the misalignment.\n");
1367 return false;
1371 if (!known_alignment_for_access_p (dr_info, vectype))
1373 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1374 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1375 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "Unknown misalignment, %snaturally aligned\n",
1378 is_packed ? "not " : "");
1379 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382 return true;
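/* For example, for an interleaved group of size 2 accessed through V8HI
   vectors (8 elements) with a known misalignment of 3 elements, the
   prolog would have to peel 8 - 3 == 5 iterations, which is not a
   multiple of the group size, so alignment is not considered reachable
   by peeling in that case.  */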
1386 /* Calculate the cost of the memory access represented by DR_INFO. */
1388 static void
1389 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1390 dr_alignment_support alignment_support_scheme,
1391 int misalignment,
1392 unsigned int *inside_cost,
1393 unsigned int *outside_cost,
1394 stmt_vector_for_cost *body_cost_vec,
1395 stmt_vector_for_cost *prologue_cost_vec)
1397 stmt_vec_info stmt_info = dr_info->stmt;
1398 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1399 int ncopies;
1401 if (PURE_SLP_STMT (stmt_info))
1402 ncopies = 1;
1403 else
1404 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1406 if (DR_IS_READ (dr_info->dr))
1407 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1408 misalignment, true, inside_cost,
1409 outside_cost, prologue_cost_vec, body_cost_vec, false);
1410 else
1411 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1412 misalignment, inside_cost, body_cost_vec);
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "vect_get_data_access_cost: inside_cost = %d, "
1417 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1421 typedef struct _vect_peel_info
1423 dr_vec_info *dr_info;
1424 int npeel;
1425 unsigned int count;
1426 } *vect_peel_info;
1428 typedef struct _vect_peel_extended_info
1430 vec_info *vinfo;
1431 struct _vect_peel_info peel_info;
1432 unsigned int inside_cost;
1433 unsigned int outside_cost;
1434 } *vect_peel_extended_info;
1437 /* Peeling hashtable helpers. */
1439 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1441 static inline hashval_t hash (const _vect_peel_info *);
1442 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 inline hashval_t
1446 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1448 return (hashval_t) peel_info->npeel;
1451 inline bool
1452 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1454 return (a->npeel == b->npeel);
1458 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1460 static void
1461 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1462 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1463 int npeel, bool supportable_if_not_aligned)
1465 struct _vect_peel_info elem, *slot;
1466 _vect_peel_info **new_slot;
1468 elem.npeel = npeel;
1469 slot = peeling_htab->find (&elem);
1470 if (slot)
1471 slot->count++;
1472 else
1474 slot = XNEW (struct _vect_peel_info);
1475 slot->npeel = npeel;
1476 slot->dr_info = dr_info;
1477 slot->count = 1;
1478 new_slot = peeling_htab->find_slot (slot, INSERT);
1479 *new_slot = slot;
1482 /* If this DR is not supported with unknown misalignment then bias
1483 this slot when the cost model is disabled. */
1484 if (!supportable_if_not_aligned
1485 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1486 slot->count += VECT_MAX_COST;
1490 /* Traverse peeling hash table to find peeling option that aligns maximum
1491 number of data accesses. */
1494 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1495 _vect_peel_extended_info *max)
1497 vect_peel_info elem = *slot;
1499 if (elem->count > max->peel_info.count
1500 || (elem->count == max->peel_info.count
1501 && max->peel_info.npeel > elem->npeel))
1503 max->peel_info.npeel = elem->npeel;
1504 max->peel_info.count = elem->count;
1505 max->peel_info.dr_info = elem->dr_info;
1508 return 1;
1511 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1512 data access costs for all data refs. DR0_INFO, if non-null, is the
1513 data reference that is assumed to have zero misalignment after
1514 peeling NPEEL iterations.
1516 static void
1517 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1518 dr_vec_info *dr0_info,
1519 unsigned int *inside_cost,
1520 unsigned int *outside_cost,
1521 stmt_vector_for_cost *body_cost_vec,
1522 stmt_vector_for_cost *prologue_cost_vec,
1523 unsigned int npeel)
1525 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1527 bool dr0_alignment_known_p
1528 = (dr0_info
1529 && known_alignment_for_access_p (dr0_info,
1530 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1532 for (data_reference *dr : datarefs)
1534 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1535 if (!vect_relevant_for_alignment_p (dr_info))
1536 continue;
1538 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1539 dr_alignment_support alignment_support_scheme;
1540 int misalignment;
1541 unsigned HOST_WIDE_INT alignment;
1543 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1544 size_zero_node) < 0;
1545 poly_int64 off = 0;
1546 if (negative)
1547 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1548 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1550 if (npeel == 0)
1551 misalignment = dr_misalignment (dr_info, vectype, off);
1552 else if (dr_info == dr0_info
1553 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1554 misalignment = 0;
1555 else if (!dr0_alignment_known_p
1556 || !known_alignment_for_access_p (dr_info, vectype)
1557 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1558 misalignment = DR_MISALIGNMENT_UNKNOWN;
1559 else
1561 misalignment = dr_misalignment (dr_info, vectype, off);
1562 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1563 misalignment &= alignment - 1;
1565 alignment_support_scheme
1566 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1567 misalignment);
1569 vect_get_data_access_cost (loop_vinfo, dr_info,
1570 alignment_support_scheme, misalignment,
1571 inside_cost, outside_cost,
1572 body_cost_vec, prologue_cost_vec);
1576 /* Traverse peeling hash table and calculate cost for each peeling option.
1577 Find the one with the lowest cost. */
1580 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1581 _vect_peel_extended_info *min)
1583 vect_peel_info elem = *slot;
1584 int dummy;
1585 unsigned int inside_cost = 0, outside_cost = 0;
1586 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1587 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1588 epilogue_cost_vec;
1590 prologue_cost_vec.create (2);
1591 body_cost_vec.create (2);
1592 epilogue_cost_vec.create (2);
1594 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1595 &outside_cost, &body_cost_vec,
1596 &prologue_cost_vec, elem->npeel);
1598 body_cost_vec.release ();
1600 outside_cost += vect_get_known_peeling_cost
1601 (loop_vinfo, elem->npeel, &dummy,
1602 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1603 &prologue_cost_vec, &epilogue_cost_vec);
1605 /* Prologue and epilogue costs are added to the target model later.
1606 These costs depend only on the scalar iteration cost, the
1607 number of peeling iterations finally chosen, and the number of
1608 misaligned statements. So discard the information found here. */
1609 prologue_cost_vec.release ();
1610 epilogue_cost_vec.release ();
1612 if (inside_cost < min->inside_cost
1613 || (inside_cost == min->inside_cost
1614 && outside_cost < min->outside_cost))
1616 min->inside_cost = inside_cost;
1617 min->outside_cost = outside_cost;
1618 min->peel_info.dr_info = elem->dr_info;
1619 min->peel_info.npeel = elem->npeel;
1620 min->peel_info.count = elem->count;
1623 return 1;
1627 /* Choose best peeling option by traversing peeling hash table and either
1628 choosing an option with the lowest cost (if cost model is enabled) or the
1629 option that aligns as many accesses as possible. */
1631 static struct _vect_peel_extended_info
1632 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1633 loop_vec_info loop_vinfo)
1635 struct _vect_peel_extended_info res;
1637 res.peel_info.dr_info = NULL;
1638 res.vinfo = loop_vinfo;
1640 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1642 res.inside_cost = INT_MAX;
1643 res.outside_cost = INT_MAX;
1644 peeling_htab->traverse <_vect_peel_extended_info *,
1645 vect_peeling_hash_get_lowest_cost> (&res);
1647 else
1649 res.peel_info.count = 0;
1650 peeling_htab->traverse <_vect_peel_extended_info *,
1651 vect_peeling_hash_get_most_frequent> (&res);
1652 res.inside_cost = 0;
1653 res.outside_cost = 0;
1656 return res;
1659 /* Return true if the new peeling NPEEL is supported. */
1661 static bool
1662 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1663 unsigned npeel)
1665 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1666 enum dr_alignment_support supportable_dr_alignment;
1668 bool dr0_alignment_known_p
1669 = known_alignment_for_access_p (dr0_info,
1670 STMT_VINFO_VECTYPE (dr0_info->stmt));
1672 /* Ensure that all data refs can be vectorized after the peel. */
1673 for (data_reference *dr : datarefs)
1675 if (dr == dr0_info->dr)
1676 continue;
1678 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1679 if (!vect_relevant_for_alignment_p (dr_info)
1680 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1681 continue;
1683 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1684 int misalignment;
1685 unsigned HOST_WIDE_INT alignment;
1686 if (!dr0_alignment_known_p
1687 || !known_alignment_for_access_p (dr_info, vectype)
1688 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1689 misalignment = DR_MISALIGNMENT_UNKNOWN;
1690 else
1692 misalignment = dr_misalignment (dr_info, vectype);
1693 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1694 misalignment &= alignment - 1;
1696 supportable_dr_alignment
1697 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1698 misalignment);
1699 if (supportable_dr_alignment == dr_unaligned_unsupported)
1700 return false;
1703 return true;
1706 /* Compare two data-references DRA and DRB to group them into chunks
1707 with related alignment. */
1709 static int
1710 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1712 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1713 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1714 int cmp;
1716 /* Stabilize sort. */
1717 if (dra == drb)
1718 return 0;
1720 /* Ordering of DRs according to base. */
1721 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1722 DR_BASE_ADDRESS (drb));
1723 if (cmp != 0)
1724 return cmp;
1726 /* And according to DR_OFFSET. */
1727 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1728 if (cmp != 0)
1729 return cmp;
1731 /* And after step. */
1732 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1733 if (cmp != 0)
1734 return cmp;
1736 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1737 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1738 if (cmp == 0)
1739 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1740 return cmp;
1743 /* Function vect_enhance_data_refs_alignment
1745 This pass will use loop versioning and loop peeling in order to enhance
1746 the alignment of data references in the loop.
1748 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1749 original loop is to be vectorized. Any other loops that are created by
1750 the transformations performed in this pass are not supposed to be
1751 vectorized. This restriction will be relaxed.
1753 This pass will require a cost model to guide it whether to apply peeling
1754 or versioning or a combination of the two. For example, the scheme that
1755 Intel uses when given a loop with several memory accesses is as follows:
1756 choose one memory access ('p') whose alignment you want to force by doing
1757 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1758 other accesses are not necessarily aligned, or (2) use loop versioning to
1759 generate one loop in which all accesses are aligned, and another loop in
1760 which only 'p' is necessarily aligned.
1762 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1763 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1764 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1766 Devising a cost model is the most critical aspect of this work. It will
1767 guide us on which access to peel for, whether to use loop versioning, how
1768 many versions to create, etc. The cost model will probably consist of
1769 generic considerations as well as target specific considerations (on
1770 powerpc for example, misaligned stores are more painful than misaligned
1771 loads).
1773 Here are the general steps involved in alignment enhancements:
1775 -- original loop, before alignment analysis:
1776 for (i=0; i<N; i++){
1777 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1778 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1781 -- After vect_compute_data_refs_alignment:
1782 for (i=0; i<N; i++){
1783 x = q[i]; # DR_MISALIGNMENT(q) = 3
1784 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1787 -- Possibility 1: we do loop versioning:
1788 if (p is aligned) {
1789 for (i=0; i<N; i++){ # loop 1A
1790 x = q[i]; # DR_MISALIGNMENT(q) = 3
1791 p[i] = y; # DR_MISALIGNMENT(p) = 0
1794 else {
1795 for (i=0; i<N; i++){ # loop 1B
1796 x = q[i]; # DR_MISALIGNMENT(q) = 3
1797 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1801 -- Possibility 2: we do loop peeling:
1802 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1803 x = q[i];
1804 p[i] = y;
1806 for (i = 3; i < N; i++){ # loop 2A
1807 x = q[i]; # DR_MISALIGNMENT(q) = 0
1808 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1811 -- Possibility 3: combination of loop peeling and versioning:
1812 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1813 x = q[i];
1814 p[i] = y;
1816 if (p is aligned) {
1817 for (i = 3; i<N; i++){ # loop 3A
1818 x = q[i]; # DR_MISALIGNMENT(q) = 0
1819 p[i] = y; # DR_MISALIGNMENT(p) = 0
1822 else {
1823 for (i = 3; i<N; i++){ # loop 3B
1824 x = q[i]; # DR_MISALIGNMENT(q) = 0
1825 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1829 These loops are later passed to loop_transform to be vectorized. The
1830 vectorizer will use the alignment information to guide the transformation
1831 (whether to generate regular loads/stores, or with special handling for
1832 misalignment). */
1834 opt_result
1835 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838 dr_vec_info *first_store = NULL;
1839 dr_vec_info *dr0_info = NULL;
1840 struct data_reference *dr;
1841 unsigned int i;
1842 bool do_peeling = false;
1843 bool do_versioning = false;
1844 unsigned int npeel = 0;
1845 bool one_misalignment_known = false;
1846 bool one_misalignment_unknown = false;
1847 bool one_dr_unsupportable = false;
1848 dr_vec_info *unsupportable_dr_info = NULL;
1849 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1850 hash_table<peel_info_hasher> peeling_htab (1);
1852 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1854 /* Reset data so we can safely be called multiple times. */
1855 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1856 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1858 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1859 return opt_result::success ();
1861 /* Sort the vector of datarefs so DRs that have the same or dependent
1862 alignment are next to each other. */
1863 auto_vec<data_reference_p> datarefs
1864 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1865 datarefs.qsort (dr_align_group_sort_cmp);
1867 /* For each dataref, compute the number of other datarefs that become
1868 aligned when we peel so that this dataref becomes aligned.  */
1869 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1870 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1871 unsigned i0;
1872 for (i0 = 0; i0 < datarefs.length (); ++i0)
1873 if (DR_BASE_ADDRESS (datarefs[i0]))
1874 break;
1875 for (i = i0 + 1; i <= datarefs.length (); ++i)
1877 if (i == datarefs.length ()
1878 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1879 DR_BASE_ADDRESS (datarefs[i]), 0)
1880 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1881 DR_OFFSET (datarefs[i]), 0)
1882 || !operand_equal_p (DR_STEP (datarefs[i0]),
1883 DR_STEP (datarefs[i]), 0))
1885 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1886 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1887 will get known misalignment if we align one of the refs
1888 with the largest DR_TARGET_ALIGNMENT. */
1889 for (unsigned j = i0; j < i; ++j)
1891 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1892 for (unsigned k = i0; k < i; ++k)
1894 if (k == j)
1895 continue;
1896 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1897 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1898 dr_infoj))
1899 n_same_align_refs[j]++;
1902 i0 = i;
1906 /* While cost model enhancements are expected in the future, the high level
1907 view of the code at this time is as follows:
1909 A) If there is a misaligned access then see if peeling to align
1910 this access can make all data references satisfy
1911 vect_supportable_dr_alignment. If so, update data structures
1912 as needed and return true.
1914 B) If peeling wasn't possible and there is a data reference with an
1915 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1916 then see if loop versioning checks can be used to make all data
1917 references satisfy vect_supportable_dr_alignment. If so, update
1918 data structures as needed and return true.
1920 C) If neither peeling nor versioning were successful then return false if
1921 any data reference does not satisfy vect_supportable_dr_alignment.
1923 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1925 Note, Possibility 3 above (which is peeling and versioning together) is not
1926 being done at this time. */
1928 /* (1) Peeling to force alignment. */
1930 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1931 Considerations:
1932 + How many accesses will become aligned due to the peeling
1933 - How many accesses will become unaligned due to the peeling,
1934 and the cost of misaligned accesses.
1935 - The cost of peeling (the extra runtime checks, the increase
1936 in code size). */
1938 FOR_EACH_VEC_ELT (datarefs, i, dr)
1940 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1941 if (!vect_relevant_for_alignment_p (dr_info))
1942 continue;
1944 stmt_vec_info stmt_info = dr_info->stmt;
1945 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1946 do_peeling = vector_alignment_reachable_p (dr_info);
1947 if (do_peeling)
1949 if (known_alignment_for_access_p (dr_info, vectype))
1951 unsigned int npeel_tmp = 0;
1952 bool negative = tree_int_cst_compare (DR_STEP (dr),
1953 size_zero_node) < 0;
1955 /* If known_alignment_for_access_p then we have set
1956 DR_MISALIGNMENT which is only done if we know it at compile
1957 time, so it is safe to assume target alignment is constant.  */
1959 unsigned int target_align =
1960 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1961 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1962 poly_int64 off = 0;
1963 if (negative)
1964 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
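/* For example (illustration only): for a V4SI-like vector of four
   4-byte elements accessed with a negative step, OFF becomes
   3 * -4 = -12, because the vector access covers the 12 bytes
   below the scalar address.  */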
1965 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1966 mis = negative ? mis : -mis;
1967 if (mis != 0)
1968 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1970 /* For multiple types, it is possible that the bigger type access
1971 will have more than one peeling option. E.g., a loop with two
1972 types: one of size (vector size / 4), and the other one of
1973 size (vector size / 8). Vectorization factor will be 8. If both
1974 accesses are misaligned by 3, the first one needs one scalar
1975 iteration to be aligned, and the second one needs 5. But the
1976 first one will also be aligned by peeling 5 scalar
1977 iterations, and in that case both accesses will be aligned.
1978 Hence, except for the immediate peeling amount, we also want
1979 to try to add full vector size, while we don't exceed
1980 vectorization factor.
1981 We do this automatically for cost model, since we calculate
1982 cost for every peeling option. */
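/* A hypothetical illustration of the insertion loop below: with a byte
   misalignment of 4, a 16-byte target alignment and 4-byte elements,
   NPEEL_TMP starts at (-4 & 15) / 4 = 3 and is then stepped by
   16 / 4 = 4, so with NSCALARS == 8 the peeling amounts 3 and 7 are
   recorded in the hash table.  */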
1983 poly_uint64 nscalars = npeel_tmp;
1984 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1986 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1987 nscalars = (STMT_SLP_TYPE (stmt_info)
1988 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991 /* Save info about DR in the hash table. Also include peeling
1992 amounts according to the explanation above. Indicate
1993 the alignment status when the ref is not aligned.
1994 ??? Rather than using unknown alignment here we should
1995 prune all entries from the peeling hashtable which cause
1996 DRs to be not supported. */
1997 bool supportable_if_not_aligned
1998 = vect_supportable_dr_alignment
1999 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2000 while (known_le (npeel_tmp, nscalars))
2002 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2003 dr_info, npeel_tmp,
2004 supportable_if_not_aligned);
2005 npeel_tmp += MAX (1, target_align / dr_size);
2008 one_misalignment_known = true;
2010 else
2012 /* If we don't know any misalignment values, we prefer
2013 peeling for the data-ref that has the maximum number of data-refs
2014 with the same alignment, unless the target prefers to align
2015 stores over loads.  */
2016 unsigned same_align_drs = n_same_align_refs[i];
2017 if (!dr0_info
2018 || dr0_same_align_drs < same_align_drs)
2020 dr0_same_align_drs = same_align_drs;
2021 dr0_info = dr_info;
2023 /* For data-refs with the same number of related
2024 accesses prefer the one where the misalign
2025 computation will be invariant in the outermost loop. */
2026 else if (dr0_same_align_drs == same_align_drs)
2028 class loop *ivloop0, *ivloop;
2029 ivloop0 = outermost_invariant_loop_for_expr
2030 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2031 ivloop = outermost_invariant_loop_for_expr
2032 (loop, DR_BASE_ADDRESS (dr));
2033 if ((ivloop && !ivloop0)
2034 || (ivloop && ivloop0
2035 && flow_loop_nested_p (ivloop, ivloop0)))
2036 dr0_info = dr_info;
2039 one_misalignment_unknown = true;
2041 /* Check for data refs with unsupportable alignment that
2042 can be peeled. */
2043 enum dr_alignment_support supportable_dr_alignment
2044 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2045 DR_MISALIGNMENT_UNKNOWN);
2046 if (supportable_dr_alignment == dr_unaligned_unsupported)
2048 one_dr_unsupportable = true;
2049 unsupportable_dr_info = dr_info;
2052 if (!first_store && DR_IS_WRITE (dr))
2054 first_store = dr_info;
2055 first_store_same_align_drs = same_align_drs;
2059 else
2061 if (!aligned_access_p (dr_info, vectype))
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2065 "vector alignment may not be reachable\n");
2066 break;
2071 /* Check if we can possibly peel the loop. */
2072 if (!vect_can_advance_ivs_p (loop_vinfo)
2073 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2074 || loop->inner)
2075 do_peeling = false;
2077 struct _vect_peel_extended_info peel_for_known_alignment;
2078 struct _vect_peel_extended_info peel_for_unknown_alignment;
2079 struct _vect_peel_extended_info best_peel;
2081 peel_for_unknown_alignment.inside_cost = INT_MAX;
2082 peel_for_unknown_alignment.outside_cost = INT_MAX;
2083 peel_for_unknown_alignment.peel_info.count = 0;
2085 if (do_peeling
2086 && one_misalignment_unknown)
2088 /* Check if the target requires us to prefer stores over loads, i.e., if
2089 misaligned stores are more expensive than misaligned loads (taking
2090 drs with same alignment into account). */
2091 unsigned int load_inside_cost = 0;
2092 unsigned int load_outside_cost = 0;
2093 unsigned int store_inside_cost = 0;
2094 unsigned int store_outside_cost = 0;
2095 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2097 stmt_vector_for_cost dummy;
2098 dummy.create (2);
2099 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2100 &load_inside_cost,
2101 &load_outside_cost,
2102 &dummy, &dummy, estimated_npeels);
2103 dummy.release ();
2105 if (first_store)
2107 dummy.create (2);
2108 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2109 &store_inside_cost,
2110 &store_outside_cost,
2111 &dummy, &dummy,
2112 estimated_npeels);
2113 dummy.release ();
2115 else
2117 store_inside_cost = INT_MAX;
2118 store_outside_cost = INT_MAX;
2121 if (load_inside_cost > store_inside_cost
2122 || (load_inside_cost == store_inside_cost
2123 && load_outside_cost > store_outside_cost))
2125 dr0_info = first_store;
2126 dr0_same_align_drs = first_store_same_align_drs;
2127 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2128 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2130 else
2132 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2133 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2137 prologue_cost_vec.create (2);
2138 epilogue_cost_vec.create (2);
2140 int dummy2;
2141 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2142 (loop_vinfo, estimated_npeels, &dummy2,
2143 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2144 &prologue_cost_vec, &epilogue_cost_vec);
2146 prologue_cost_vec.release ();
2147 epilogue_cost_vec.release ();
2149 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152 peel_for_unknown_alignment.peel_info.npeel = 0;
2153 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2155 best_peel = peel_for_unknown_alignment;
2157 peel_for_known_alignment.inside_cost = INT_MAX;
2158 peel_for_known_alignment.outside_cost = INT_MAX;
2159 peel_for_known_alignment.peel_info.count = 0;
2160 peel_for_known_alignment.peel_info.dr_info = NULL;
2162 if (do_peeling && one_misalignment_known)
2164 /* Peeling is possible, and no data access requires alignment in order
2165 to be supported.  So we try to choose the best possible peeling from
2166 the hash table.  */
2167 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2168 (&peeling_htab, loop_vinfo);
2171 /* Compare costs of peeling for known and unknown alignment. */
2172 if (peel_for_known_alignment.peel_info.dr_info != NULL
2173 && peel_for_unknown_alignment.inside_cost
2174 >= peel_for_known_alignment.inside_cost)
2176 best_peel = peel_for_known_alignment;
2178 /* If the best peeling for known alignment has NPEEL == 0, perform no
2179 peeling at all except if there is an unsupportable dr that we can
2180 align. */
2181 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2182 do_peeling = false;
2185 /* If there is an unsupportable data ref, prefer it over all choices so far
2186 since we'd have to discard any chosen peeling except when it accidentally
2187 aligned the unsupportable data ref.  */
2188 if (one_dr_unsupportable)
2189 dr0_info = unsupportable_dr_info;
2190 else if (do_peeling)
2192 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2193 TODO: Use nopeel_outside_cost or get rid of it? */
2194 unsigned nopeel_inside_cost = 0;
2195 unsigned nopeel_outside_cost = 0;
2197 stmt_vector_for_cost dummy;
2198 dummy.create (2);
2199 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2200 &nopeel_outside_cost, &dummy, &dummy, 0);
2201 dummy.release ();
2203 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2204 costs will be recorded. */
2205 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2206 prologue_cost_vec.create (2);
2207 epilogue_cost_vec.create (2);
2209 int dummy2;
2210 nopeel_outside_cost += vect_get_known_peeling_cost
2211 (loop_vinfo, 0, &dummy2,
2212 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2213 &prologue_cost_vec, &epilogue_cost_vec);
2215 prologue_cost_vec.release ();
2216 epilogue_cost_vec.release ();
2218 npeel = best_peel.peel_info.npeel;
2219 dr0_info = best_peel.peel_info.dr_info;
2221 /* If not peeling is no more expensive than the best peeling we
2222 have so far, don't perform any peeling.  */
2223 if (nopeel_inside_cost <= best_peel.inside_cost)
2224 do_peeling = false;
2227 if (do_peeling)
2229 stmt_vec_info stmt_info = dr0_info->stmt;
2230 if (known_alignment_for_access_p (dr0_info,
2231 STMT_VINFO_VECTYPE (stmt_info)))
2233 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2234 size_zero_node) < 0;
2235 if (!npeel)
2237 /* Since it's known at compile time, compute the number of
2238 iterations in the peeled loop (the peeling factor) for use in
2239 updating DR_MISALIGNMENT values. The peeling factor is the
2240 vectorization factor minus the misalignment as an element
2241 count. */
2242 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2243 poly_int64 off = 0;
2244 if (negative)
2245 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2246 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2247 unsigned int mis
2248 = dr_misalignment (dr0_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 /* If known_alignment_for_access_p then we have set
2251 DR_MISALIGNMENT which is only done if we know it at compile
2252 time, so it is safe to assume target alignment is constant.  */
2254 unsigned int target_align =
2255 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2256 npeel = ((mis & (target_align - 1))
2257 / vect_get_scalar_dr_size (dr0_info));
2260 /* For interleaved data access every iteration accesses all the
2261 members of the group, therefore we divide the number of iterations
2262 by the group size. */
2263 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2264 npeel /= DR_GROUP_SIZE (stmt_info);
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Try peeling by %d\n", npeel);
2271 /* Ensure that all datarefs can be vectorized after the peel. */
2272 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2273 do_peeling = false;
2275 /* Check if all datarefs are supportable and log. */
2276 if (do_peeling
2277 && npeel == 0
2278 && known_alignment_for_access_p (dr0_info,
2279 STMT_VINFO_VECTYPE (stmt_info)))
2280 return opt_result::success ();
2282 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2283 if (do_peeling)
2285 unsigned max_allowed_peel
2286 = param_vect_max_peeling_for_alignment;
2287 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2288 max_allowed_peel = 0;
2289 if (max_allowed_peel != (unsigned)-1)
2291 unsigned max_peel = npeel;
2292 if (max_peel == 0)
2294 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2295 unsigned HOST_WIDE_INT target_align_c;
2296 if (target_align.is_constant (&target_align_c))
2297 max_peel =
2298 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2299 else
2301 do_peeling = false;
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "Disable peeling, max peels set and vector"
2305 " alignment unknown\n");
2308 if (max_peel > max_allowed_peel)
2310 do_peeling = false;
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "Disable peeling, max peels reached: %d\n", max_peel);
2318 /* Cost model #2 - if peeling may result in a remaining loop not
2319 iterating enough to be vectorized then do not peel. Since this
2320 is a cost heuristic rather than a correctness decision, use the
2321 most likely runtime value for variable vectorization factors. */
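/* For instance (numbers purely illustrative): with an assumed VF of 4
   and NPEEL == 0, MAX_PEEL is 3, so a loop known to run only 6
   iterations (6 < 4 + 3) is not peeled.  */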
2322 if (do_peeling
2323 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2325 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2326 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2327 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2328 < assumed_vf + max_peel)
2329 do_peeling = false;
2332 if (do_peeling)
2334 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2335 If the misalignment of DR_i is identical to that of dr0 then set
2336 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2337 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2338 by the peeling factor times the element size of DR_i (MOD the
2339 vectorization factor times the size). Otherwise, the
2340 misalignment of DR_i must be set to unknown. */
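/* A sketch of the update with hypothetical numbers: peeling NPEEL == 3
   iterations for a 4-byte DR_i with known misalignment 4 and a 16-byte
   target alignment gives (4 + 3 * 4) & 15 = 0, while a DR_i whose
   misalignment is unknown simply stays unknown.  */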
2341 FOR_EACH_VEC_ELT (datarefs, i, dr)
2342 if (dr != dr0_info->dr)
2344 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2345 if (!vect_relevant_for_alignment_p (dr_info))
2346 continue;
2348 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2352 if (npeel)
2353 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2354 else
2355 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2356 SET_DR_MISALIGNMENT (dr0_info,
2357 vect_dr_misalign_for_aligned_access (dr0_info));
2358 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_NOTE, vect_location,
2361 "Alignment of access forced using peeling.\n");
2362 dump_printf_loc (MSG_NOTE, vect_location,
2363 "Peeling for alignment will be applied.\n");
2366 /* The inside-loop cost will be accounted for in vectorizable_load
2367 and vectorizable_store correctly with adjusted alignments.
2368 Drop the body_cst_vec on the floor here. */
2369 return opt_result::success ();
2373 /* (2) Versioning to force alignment. */
2375 /* Try versioning if:
2376 1) optimize loop for speed and the cost-model is not cheap
2377 2) there is at least one unsupported misaligned data ref with an unknown
2378 misalignment, and
2379 3) all misaligned data refs with a known misalignment are supported, and
2380 4) the number of runtime alignment checks is within reason. */
2382 do_versioning
2383 = (optimize_loop_nest_for_speed_p (loop)
2384 && !loop->inner /* FORNOW */
2385 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2387 if (do_versioning)
2389 FOR_EACH_VEC_ELT (datarefs, i, dr)
2391 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2392 if (!vect_relevant_for_alignment_p (dr_info))
2393 continue;
2395 stmt_vec_info stmt_info = dr_info->stmt;
2396 if (STMT_VINFO_STRIDED_P (stmt_info))
2398 do_versioning = false;
2399 break;
2402 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2403 bool negative = tree_int_cst_compare (DR_STEP (dr),
2404 size_zero_node) < 0;
2405 poly_int64 off = 0;
2406 if (negative)
2407 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2408 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2409 int misalignment;
2410 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2411 continue;
2413 enum dr_alignment_support supportable_dr_alignment
2414 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2415 misalignment);
2416 if (supportable_dr_alignment == dr_unaligned_unsupported)
2418 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2419 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2420 >= (unsigned) param_vect_max_version_for_alignment_checks))
2422 do_versioning = false;
2423 break;
2426 /* At present we don't support versioning for alignment
2427 with variable VF, since there's no guarantee that the
2428 VF is a power of two. We could relax this if we added
2429 a way of enforcing a power-of-two size. */
2430 unsigned HOST_WIDE_INT size;
2431 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2433 do_versioning = false;
2434 break;
2437 /* Forcing alignment in the first iteration is no good if
2438 we don't keep it across iterations. For now, just disable
2439 versioning in this case.
2440 ??? We could actually unroll the loop to achieve the required
2441 overall step alignment, and forcing the alignment could be
2442 done by doing some iterations of the non-vectorized loop. */
2443 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2444 * DR_STEP_ALIGNMENT (dr),
2445 DR_TARGET_ALIGNMENT (dr_info)))
2447 do_versioning = false;
2448 break;
2451 /* The rightmost bits of an aligned address must be zeros.
2452 Construct the mask needed for this test. For example,
2453 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2454 mask must be 15 = 0xf. */
2455 int mask = size - 1;
2457 /* FORNOW: use the same mask to test all potentially unaligned
2458 references in the loop. */
2459 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2460 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2462 do_versioning = false;
2463 break;
2466 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2467 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2471 /* Versioning requires at least one misaligned data reference. */
2472 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2473 do_versioning = false;
2474 else if (!do_versioning)
2475 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478 if (do_versioning)
2480 const vec<stmt_vec_info> &may_misalign_stmts
2481 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2482 stmt_vec_info stmt_info;
2484 /* It can now be assumed that the data references in the statements
2485 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2486 of the loop being vectorized. */
2487 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2489 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2490 SET_DR_MISALIGNMENT (dr_info,
2491 vect_dr_misalign_for_aligned_access (dr_info));
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_NOTE, vect_location,
2494 "Alignment of access forced using versioning.\n");
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "Versioning for alignment will be applied.\n");
2501 /* Peeling and versioning can't be done together at this time. */
2502 gcc_assert (! (do_peeling && do_versioning));
2504 return opt_result::success ();
2507 /* This point is reached if neither peeling nor versioning is being done. */
2508 gcc_assert (! (do_peeling || do_versioning));
2510 return opt_result::success ();
2514 /* Function vect_analyze_data_refs_alignment
2516 Analyze the alignment of the data-references in the loop.
2517 Return FALSE if a data reference is found that cannot be vectorized. */
2519 opt_result
2520 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2522 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2524 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2525 struct data_reference *dr;
2526 unsigned int i;
2528 vect_record_base_alignments (loop_vinfo);
2529 FOR_EACH_VEC_ELT (datarefs, i, dr)
2531 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2532 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2534 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2535 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2536 continue;
2537 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2538 STMT_VINFO_VECTYPE (dr_info->stmt));
2542 return opt_result::success ();
2546 /* Analyze alignment of DRs of stmts in NODE. */
2548 static bool
2549 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2551 /* Alignment is maintained in the first element of the group. */
2552 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2553 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2554 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2555 tree vectype = SLP_TREE_VECTYPE (node);
2556 poly_uint64 vector_alignment
2557 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2558 BITS_PER_UNIT);
2559 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2560 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2561 /* Re-analyze alignment when we're facing a vectorization with a bigger
2562 alignment requirement. */
2563 else if (known_lt (dr_info->target_alignment, vector_alignment))
2565 poly_uint64 old_target_alignment = dr_info->target_alignment;
2566 int old_misalignment = dr_info->misalignment;
2567 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2568 /* But keep knowledge about a smaller alignment. */
2569 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2570 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2572 dr_info->target_alignment = old_target_alignment;
2573 dr_info->misalignment = old_misalignment;
2576 /* If we ever face unordered target alignments, the first one analyzed wins
2577 and the other will become unknown in dr_misalignment.  */
2578 return true;
2581 /* Function vect_slp_analyze_instance_alignment
2583 Analyze the alignment of the data-references in the SLP instance.
2584 Return FALSE if a data reference is found that cannot be vectorized. */
2586 bool
2587 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2588 slp_instance instance)
2590 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2592 slp_tree node;
2593 unsigned i;
2594 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2595 if (! vect_slp_analyze_node_alignment (vinfo, node))
2596 return false;
2598 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2599 && ! vect_slp_analyze_node_alignment
2600 (vinfo, SLP_INSTANCE_TREE (instance)))
2601 return false;
2603 return true;
2607 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2608 accesses of legal size, step, etc. Detect gaps, single element
2609 interleaving, and other special cases. Set grouped access info.
2610 Collect groups of strided stores for further use in SLP analysis.
2611 Worker for vect_analyze_group_access. */
2613 static bool
2614 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2616 data_reference *dr = dr_info->dr;
2617 tree step = DR_STEP (dr);
2618 tree scalar_type = TREE_TYPE (DR_REF (dr));
2619 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2620 stmt_vec_info stmt_info = dr_info->stmt;
2621 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2622 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2623 HOST_WIDE_INT dr_step = -1;
2624 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2625 bool slp_impossible = false;
2627 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2628 size of the interleaving group (including gaps). */
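/* A small worked example (not from any particular testcase): a DR with a
   DR_STEP of 32 bytes over 4-byte elements gives GROUPSIZE == 8; if only
   six members of that group are actually accessed, the remaining two
   element slots are gaps.  */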
2629 if (tree_fits_shwi_p (step))
2631 dr_step = tree_to_shwi (step);
2632 /* Check that STEP is a multiple of type size. Otherwise there is
2633 a non-element-sized gap at the end of the group which we
2634 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2635 ??? As we can handle non-constant step fine here we should
2636 simply remove uses of DR_GROUP_GAP between the last and first
2637 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2638 simply not include that gap. */
2639 if ((dr_step % type_size) != 0)
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_NOTE, vect_location,
2643 "Step %T is not a multiple of the element size"
2644 " for %T\n",
2645 step, DR_REF (dr));
2646 return false;
2648 groupsize = absu_hwi (dr_step) / type_size;
2650 else
2651 groupsize = 0;
2653 /* A non-consecutive access is possible only if it is part of interleaving.  */
2654 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2656 /* Check if this DR is a part of interleaving, and is a single
2657 element of the group that is accessed in the loop. */
2659 /* Gaps are supported only for loads. STEP must be a multiple of the type
2660 size. */
2661 if (DR_IS_READ (dr)
2662 && (dr_step % type_size) == 0
2663 && groupsize > 0
2664 /* This could be UINT_MAX but as we are generating code in a very
2665 inefficient way we have to cap earlier.
2666 See PR91403 for example. */
2667 && groupsize <= 4096)
2669 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2670 DR_GROUP_SIZE (stmt_info) = groupsize;
2671 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_NOTE, vect_location,
2674 "Detected single element interleaving %T"
2675 " step %T\n",
2676 DR_REF (dr), step);
2678 return true;
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2683 "not consecutive access %G", stmt_info->stmt);
2685 if (bb_vinfo)
2687 /* Mark the statement as unvectorizable. */
2688 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2689 return true;
2692 if (dump_enabled_p ())
2693 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2694 STMT_VINFO_STRIDED_P (stmt_info) = true;
2695 return true;
2698 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2700 /* First stmt in the interleaving chain. Check the chain. */
2701 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2702 struct data_reference *data_ref = dr;
2703 unsigned int count = 1;
2704 tree prev_init = DR_INIT (data_ref);
2705 HOST_WIDE_INT diff, gaps = 0;
2707 /* By construction, all group members have INTEGER_CST DR_INITs. */
2708 while (next)
2710 /* We never have the same DR multiple times. */
2711 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2712 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2714 data_ref = STMT_VINFO_DATA_REF (next);
2716 /* All group members have the same STEP by construction. */
2717 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2719 /* Check that the distance between two accesses is equal to the type
2720 size. Otherwise, we have gaps. */
2721 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2722 - TREE_INT_CST_LOW (prev_init)) / type_size;
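/* Hypothetical illustration: with 4-byte elements, PREV_INIT == 0 and
   DR_INIT == 8 give DIFF == 2, i.e. a gap of one element between the
   two group members, recorded as DR_GROUP_GAP == 2 below.  */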
2723 if (diff < 1 || diff > UINT_MAX)
2725 /* For artificial testcases with array accesses with large
2726 constant indices we can run into overflow issues which
2727 can end up fooling the groupsize constraint below so
2728 check the individual gaps (which are represented as
2729 unsigned int) as well. */
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2732 "interleaved access with gap larger "
2733 "than representable\n");
2734 return false;
2736 if (diff != 1)
2738 /* FORNOW: SLP of accesses with gaps is not supported. */
2739 slp_impossible = true;
2740 if (DR_IS_WRITE (data_ref))
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744 "interleaved store with gaps\n");
2745 return false;
2748 gaps += diff - 1;
2751 last_accessed_element += diff;
2753 /* Store the gap from the previous member of the group. If there is no
2754 gap in the access, DR_GROUP_GAP is always 1. */
2755 DR_GROUP_GAP (next) = diff;
2757 prev_init = DR_INIT (data_ref);
2758 next = DR_GROUP_NEXT_ELEMENT (next);
2759 /* Count the number of data-refs in the chain. */
2760 count++;
2763 if (groupsize == 0)
2764 groupsize = count + gaps;
2766 /* This could be UINT_MAX but as we are generating code in a very
2767 inefficient way we have to cap earlier. See PR78699 for example. */
2768 if (groupsize > 4096)
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2772 "group is too large\n");
2773 return false;
2776 /* Check that the size of the interleaving is equal to count for stores,
2777 i.e., that there are no gaps. */
2778 if (groupsize != count
2779 && !DR_IS_READ (dr))
2781 groupsize = count;
2782 STMT_VINFO_STRIDED_P (stmt_info) = true;
2785 /* If there is a gap after the last load in the group it is the
2786 difference between the groupsize and the last accessed
2787 element.
2788 When there is no gap, this difference should be 0. */
2789 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2791 DR_GROUP_SIZE (stmt_info) = groupsize;
2792 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE, vect_location,
2795 "Detected interleaving ");
2796 if (DR_IS_READ (dr))
2797 dump_printf (MSG_NOTE, "load ");
2798 else if (STMT_VINFO_STRIDED_P (stmt_info))
2799 dump_printf (MSG_NOTE, "strided store ");
2800 else
2801 dump_printf (MSG_NOTE, "store ");
2802 dump_printf (MSG_NOTE, "of size %u\n",
2803 (unsigned)groupsize);
2804 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2805 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2806 while (next)
2808 if (DR_GROUP_GAP (next) != 1)
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "\t<gap of %d elements>\n",
2811 DR_GROUP_GAP (next) - 1);
2812 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2813 next = DR_GROUP_NEXT_ELEMENT (next);
2815 if (DR_GROUP_GAP (stmt_info) != 0)
2816 dump_printf_loc (MSG_NOTE, vect_location,
2817 "\t<gap of %d elements>\n",
2818 DR_GROUP_GAP (stmt_info));
2821 /* SLP: create an SLP data structure for every interleaving group of
2822 stores for further analysis in vect_analyze_slp.  */
2823 if (DR_IS_WRITE (dr) && !slp_impossible)
2825 if (loop_vinfo)
2826 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2827 if (bb_vinfo)
2828 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2832 return true;
2835 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2836 accesses of legal size, step, etc. Detect gaps, single element
2837 interleaving, and other special cases. Set grouped access info.
2838 Collect groups of strided stores for further use in SLP analysis. */
2840 static bool
2841 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2843 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2845 /* Dissolve the group if present. */
2846 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2847 while (stmt_info)
2849 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2850 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2851 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2852 stmt_info = next;
2854 return false;
2856 return true;
2859 /* Analyze the access pattern of the data-reference DR_INFO.
2860 In case of non-consecutive accesses call vect_analyze_group_access() to
2861 analyze groups of accesses. */
2863 static bool
2864 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2866 data_reference *dr = dr_info->dr;
2867 tree step = DR_STEP (dr);
2868 tree scalar_type = TREE_TYPE (DR_REF (dr));
2869 stmt_vec_info stmt_info = dr_info->stmt;
2870 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2871 class loop *loop = NULL;
2873 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2874 return true;
2876 if (loop_vinfo)
2877 loop = LOOP_VINFO_LOOP (loop_vinfo);
2879 if (loop_vinfo && !step)
2881 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2883 "bad data-ref access in loop\n");
2884 return false;
2887 /* Allow loads with zero step in inner-loop vectorization. */
2888 if (loop_vinfo && integer_zerop (step))
2890 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2891 if (!nested_in_vect_loop_p (loop, stmt_info))
2892 return DR_IS_READ (dr);
2893 /* Allow references with zero step for outer loops marked
2894 with pragma omp simd only - it guarantees absence of
2895 loop-carried dependencies between inner loop iterations. */
2896 if (loop->safelen < 2)
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location,
2900 "zero step in inner loop of nest\n");
2901 return false;
2905 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2907 /* Interleaved accesses are not yet supported within outer-loop
2908 vectorization for references in the inner-loop. */
2909 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2911 /* For the rest of the analysis we use the outer-loop step. */
2912 step = STMT_VINFO_DR_STEP (stmt_info);
2913 if (integer_zerop (step))
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "zero step in outer loop.\n");
2918 return DR_IS_READ (dr);
2922 /* Consecutive? */
2923 if (TREE_CODE (step) == INTEGER_CST)
2925 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2926 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2927 || (dr_step < 0
2928 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2930 /* Mark that it is not interleaving. */
2931 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2932 return true;
2936 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 "grouped access in outer loop.\n");
2941 return false;
2945 /* Assume this is a DR handled by non-constant strided load case. */
2946 if (TREE_CODE (step) != INTEGER_CST)
2947 return (STMT_VINFO_STRIDED_P (stmt_info)
2948 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2949 || vect_analyze_group_access (vinfo, dr_info)));
2951 /* Not consecutive access - check if it's a part of interleaving group. */
2952 return vect_analyze_group_access (vinfo, dr_info);
2955 /* Compare two data-references DRA and DRB so that data-references that
2956 can form a group end up next to each other.  */
2958 static int
2959 dr_group_sort_cmp (const void *dra_, const void *drb_)
2961 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2962 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2963 data_reference_p dra = dra_info->dr;
2964 data_reference_p drb = drb_info->dr;
2965 int cmp;
2967 /* Stabilize sort. */
2968 if (dra == drb)
2969 return 0;
2971 /* DRs with different group IDs never belong to the same group.  */
2972 if (dra_info->group != drb_info->group)
2973 return dra_info->group < drb_info->group ? -1 : 1;
2975 /* Ordering of DRs according to base. */
2976 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2977 DR_BASE_ADDRESS (drb));
2978 if (cmp != 0)
2979 return cmp;
2981 /* And according to DR_OFFSET. */
2982 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2983 if (cmp != 0)
2984 return cmp;
2986 /* Put reads before writes. */
2987 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2988 return DR_IS_READ (dra) ? -1 : 1;
2990 /* Then sort after access size. */
2991 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2992 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2993 if (cmp != 0)
2994 return cmp;
2996 /* And after step. */
2997 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2998 if (cmp != 0)
2999 return cmp;
3001 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3002 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3003 if (cmp == 0)
3004 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3005 return cmp;
3008 /* If OP is the result of a conversion, return the unconverted value,
3009 otherwise return null. */
3011 static tree
3012 strip_conversion (tree op)
3014 if (TREE_CODE (op) != SSA_NAME)
3015 return NULL_TREE;
3016 gimple *stmt = SSA_NAME_DEF_STMT (op);
3017 if (!is_gimple_assign (stmt)
3018 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3019 return NULL_TREE;
3020 return gimple_assign_rhs1 (stmt);
3023 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3024 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3025 be grouped in SLP mode. */
3027 static bool
3028 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3029 bool allow_slp_p)
3031 if (gimple_assign_single_p (stmt1_info->stmt))
3032 return gimple_assign_single_p (stmt2_info->stmt);
3034 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3035 if (call1 && gimple_call_internal_p (call1))
3037 /* Check for two masked loads or two masked stores. */
3038 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3039 if (!call2 || !gimple_call_internal_p (call2))
3040 return false;
3041 internal_fn ifn = gimple_call_internal_fn (call1);
3042 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3043 return false;
3044 if (ifn != gimple_call_internal_fn (call2))
3045 return false;
3047 /* Check that the masks are the same. Cope with casts of masks,
3048 like those created by build_mask_conversion. */
3049 tree mask1 = gimple_call_arg (call1, 2);
3050 tree mask2 = gimple_call_arg (call2, 2);
3051 if (!operand_equal_p (mask1, mask2, 0)
3052 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3054 mask1 = strip_conversion (mask1);
3055 if (!mask1)
3056 return false;
3057 mask2 = strip_conversion (mask2);
3058 if (!mask2)
3059 return false;
3060 if (!operand_equal_p (mask1, mask2, 0))
3061 return false;
3063 return true;
3066 return false;
3069 /* Function vect_analyze_data_ref_accesses.
3071 Analyze the access pattern of all the data references in the loop.
3073 FORNOW: the only access pattern that is considered vectorizable is a
3074 simple step 1 (consecutive) access.
3076 FORNOW: handle only arrays and pointer accesses. */
3078 opt_result
3079 vect_analyze_data_ref_accesses (vec_info *vinfo,
3080 vec<int> *dataref_groups)
3082 unsigned int i;
3083 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3085 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3087 if (datarefs.is_empty ())
3088 return opt_result::success ();
3090 /* Sort the array of datarefs to make building the interleaving chains
3091 linear.  Don't modify the original vector's order; it is needed for
3092 determining what dependencies are reversed. */
3093 vec<dr_vec_info *> datarefs_copy;
3094 datarefs_copy.create (datarefs.length ());
3095 for (unsigned i = 0; i < datarefs.length (); i++)
3097 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3098 /* If the caller computed DR grouping use that, otherwise group by
3099 basic blocks. */
3100 if (dataref_groups)
3101 dr_info->group = (*dataref_groups)[i];
3102 else
3103 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3104 datarefs_copy.quick_push (dr_info);
3106 datarefs_copy.qsort (dr_group_sort_cmp);
3107 hash_set<stmt_vec_info> to_fixup;
3109 /* Build the interleaving chains. */
3110 for (i = 0; i < datarefs_copy.length () - 1;)
3112 dr_vec_info *dr_info_a = datarefs_copy[i];
3113 data_reference_p dra = dr_info_a->dr;
3114 int dra_group_id = dr_info_a->group;
3115 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3116 stmt_vec_info lastinfo = NULL;
3117 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3118 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3120 ++i;
3121 continue;
3123 for (i = i + 1; i < datarefs_copy.length (); ++i)
3125 dr_vec_info *dr_info_b = datarefs_copy[i];
3126 data_reference_p drb = dr_info_b->dr;
3127 int drb_group_id = dr_info_b->group;
3128 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3129 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3130 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3131 break;
3133 /* ??? Imperfect sorting (non-compatible types, non-modulo
3134 accesses, same accesses) can lead to a group being artificially
3135 split here as we don't just skip over those. If it really
3136 matters we can push those to a worklist and re-iterate
3137 over them.  Then we can just skip ahead to the next DR here.  */
3139 /* DRs in a different DR group should not be put into the same
3140 interleaving group. */
3141 if (dra_group_id != drb_group_id)
3142 break;
3144 /* Check that the data-refs have the same first location (except init)
3145 and they are both either store or load (not load and store,
3146 not masked loads or stores). */
3147 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3148 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3149 DR_BASE_ADDRESS (drb)) != 0
3150 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3151 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3152 break;
3154 /* Check that the data-refs have the same constant size. */
3155 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3156 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3157 if (!tree_fits_uhwi_p (sza)
3158 || !tree_fits_uhwi_p (szb)
3159 || !tree_int_cst_equal (sza, szb))
3160 break;
3162 /* Check that the data-refs have the same step. */
3163 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3164 break;
3166 /* Check the types are compatible.
3167 ??? We don't distinguish this during sorting. */
3168 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3169 TREE_TYPE (DR_REF (drb))))
3170 break;
3172 /* Check that the DR_INITs are compile-time constants. */
3173 if (!tree_fits_shwi_p (DR_INIT (dra))
3174 || !tree_fits_shwi_p (DR_INIT (drb)))
3175 break;
3177 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3178 just hold extra information. */
3179 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3180 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3181 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3182 break;
3184 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3185 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3186 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3187 HOST_WIDE_INT init_prev
3188 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3189 gcc_assert (init_a <= init_b
3190 && init_a <= init_prev
3191 && init_prev <= init_b);
3193 /* Do not place the same access in the interleaving chain twice. */
3194 if (init_b == init_prev)
3196 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3197 < gimple_uid (DR_STMT (drb)));
3198 /* Simply link in duplicates and fix up the chain below. */
3200 else
3202 /* If init_b == init_a + the size of the type * k, we have an
3203 interleaving, and DRA is accessed before DRB. */
3204 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3205 if (type_size_a == 0
3206 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3207 % type_size_a != 0))
3208 break;
3210 /* If we have a store, the accesses are adjacent. This splits
3211 groups into chunks we support (we don't support vectorization
3212 of stores with gaps). */
3213 if (!DR_IS_READ (dra)
3214 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3215 != type_size_a))
3216 break;
3218 /* If the step (if constant and nonzero) is smaller than the
3219 difference between data-refs' inits this splits groups into
3220 suitable sizes. */
3221 if (tree_fits_shwi_p (DR_STEP (dra)))
3223 unsigned HOST_WIDE_INT step
3224 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3225 if (step != 0
3226 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3227 break;
3231 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 DR_IS_READ (dra)
3234 ? "Detected interleaving load %T and %T\n"
3235 : "Detected interleaving store %T and %T\n",
3236 DR_REF (dra), DR_REF (drb));
3238 /* Link the found element into the group list. */
3239 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3241 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3242 lastinfo = stmtinfo_a;
3244 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3245 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3246 lastinfo = stmtinfo_b;
3248 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3249 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3251 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3252 dump_printf_loc (MSG_NOTE, vect_location,
3253 "Load suitable for SLP vectorization only.\n");
3255 if (init_b == init_prev
3256 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3257 && dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "Queuing group with duplicate access for fixup\n");
3263 /* Fix up groups with duplicate entries by splitting them.  */
3264 while (1)
3266 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3267 if (!(it != to_fixup.end ()))
3268 break;
3269 stmt_vec_info grp = *it;
3270 to_fixup.remove (grp);
3272 /* Find the earliest duplicate group member. */
3273 unsigned first_duplicate = -1u;
3274 stmt_vec_info next, g = grp;
3275 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3277 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3278 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3279 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3280 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3281 g = next;
3283 if (first_duplicate == -1U)
3284 continue;
3286 /* Then move all stmts after the first duplicate to a new group.
3287 Note this is a heuristic but one with the property that *it
3288 is fixed up completely. */
3289 g = grp;
3290 stmt_vec_info newgroup = NULL, ng = grp;
3291 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3293 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3295 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3296 if (!newgroup)
3297 newgroup = next;
3298 else
3299 DR_GROUP_NEXT_ELEMENT (ng) = next;
3300 ng = next;
3301 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3303 else
3304 g = DR_GROUP_NEXT_ELEMENT (g);
3306 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3308 /* Fixup the new group which still may contain duplicates. */
3309 to_fixup.add (newgroup);
3312 dr_vec_info *dr_info;
3313 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3315 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3316 && !vect_analyze_data_ref_access (vinfo, dr_info))
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "not vectorized: complicated access pattern.\n");
3322 if (is_a <bb_vec_info> (vinfo))
3324 /* Mark the statement as not vectorizable. */
3325 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3326 continue;
3328 else
3330 datarefs_copy.release ();
3331 return opt_result::failure_at (dr_info->stmt->stmt,
3332 "not vectorized:"
3333 " complicated access pattern.\n");
3338 datarefs_copy.release ();
3339 return opt_result::success ();
3342 /* Function vect_vfa_segment_size.
3344 Input:
3345 DR_INFO: The data reference.
3346 LENGTH_FACTOR: segment length to consider.
3348 Return a value suitable for the dr_with_seg_len::seg_len field.
3349 This is the "distance travelled" by the pointer from the first
3350 iteration in the segment to the last. Note that it does not include
3351 the size of the access; in effect it only describes the first byte. */
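/* An illustrative sketch (assumed numbers): for a 4-byte step and a
   LENGTH_FACTOR of 8 iterations the returned segment length is
   (8 - 1) * 4 = 28 bytes, to which vect_vfa_access_size later adds
   the size of the access itself.  */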
3353 static tree
3354 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3356 length_factor = size_binop (MINUS_EXPR,
3357 fold_convert (sizetype, length_factor),
3358 size_one_node);
3359 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3360 length_factor);
3363 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3364 gives the worst-case number of bytes covered by the segment. */
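/* For illustration only: a group of four 4-byte loads with a
   DR_GROUP_GAP of 1 yields an access size of 4 * (4 - 1) = 12 bytes,
   and for dr_explicit_realign_optimized a further vector's worth
   (minus the scalar size) is added on top of that.  */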
3366 static unsigned HOST_WIDE_INT
3367 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3369 stmt_vec_info stmt_vinfo = dr_info->stmt;
3370 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3371 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3372 unsigned HOST_WIDE_INT access_size = ref_size;
3373 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3375 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3376 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3378 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3379 int misalignment;
3380 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3381 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3382 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3383 == dr_explicit_realign_optimized))
3385 /* We might access a full vector's worth. */
3386 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3388 return access_size;
3391 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3392 describes. */
3394 static unsigned int
3395 vect_vfa_align (dr_vec_info *dr_info)
3397 return dr_alignment (dr_info->dr);
3400 /* Function vect_no_alias_p.
3402 Given data references A and B with equal base and offset, see whether
3403 the alias relation can be decided at compilation time. Return 1 if
3404 it can and the references alias, 0 if it can and the references do
3405 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3406 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3407 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3409 static int
3410 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3411 tree segment_length_a, tree segment_length_b,
3412 unsigned HOST_WIDE_INT access_size_a,
3413 unsigned HOST_WIDE_INT access_size_b)
3415 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3416 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3417 poly_uint64 const_length_a;
3418 poly_uint64 const_length_b;
3420 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3421 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3422 [a, a+12) */
3423 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3425 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3426 offset_a -= const_length_a;
3428 else
3429 const_length_a = tree_to_poly_uint64 (segment_length_a);
3430 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3432 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3433 offset_b -= const_length_b;
3435 else
3436 const_length_b = tree_to_poly_uint64 (segment_length_b);
3438 const_length_a += access_size_a;
3439 const_length_b += access_size_b;
3441 if (ranges_known_overlap_p (offset_a, const_length_a,
3442 offset_b, const_length_b))
3443 return 1;
3445 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3446 offset_b, const_length_b))
3447 return 0;
3449 return -1;
3452 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3453 in DDR is >= VF. */
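/* A worked sketch: if the only distance vector entry at LOOP_DEPTH is 8
   and the (possibly variable) VF is known to be at most 8, then every
   nonzero distance is >= VF, so the iterations combined into one vector
   iteration cannot conflict and the runtime alias check for this DDR can
   be dropped.  */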
3455 static bool
3456 dependence_distance_ge_vf (data_dependence_relation *ddr,
3457 unsigned int loop_depth, poly_uint64 vf)
3459 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3460 || DDR_NUM_DIST_VECTS (ddr) == 0)
3461 return false;
3463 /* If the dependence is exact, we should have limited the VF instead. */
3464 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3466 unsigned int i;
3467 lambda_vector dist_v;
3468 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3470 HOST_WIDE_INT dist = dist_v[loop_depth];
3471 if (dist != 0
3472 && !(dist > 0 && DDR_REVERSED_P (ddr))
3473 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3474 return false;
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "dependence distance between %T and %T is >= VF\n",
3480 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3482 return true;
3485 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3487 static void
3488 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3490 dump_printf (dump_kind, "%s (%T) >= ",
3491 lower_bound.unsigned_p ? "unsigned" : "abs",
3492 lower_bound.expr);
3493 dump_dec (dump_kind, lower_bound.min_value);
3496 /* Record that the vectorized loop requires the vec_lower_bound described
3497 by EXPR, UNSIGNED_P and MIN_VALUE. */
3499 static void
3500 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3501 poly_uint64 min_value)
3503 vec<vec_lower_bound> &lower_bounds
3504 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3505 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3506 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3508 unsigned_p &= lower_bounds[i].unsigned_p;
3509 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3510 if (lower_bounds[i].unsigned_p != unsigned_p
3511 || maybe_lt (lower_bounds[i].min_value, min_value))
3513 lower_bounds[i].unsigned_p = unsigned_p;
3514 lower_bounds[i].min_value = min_value;
3515 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_NOTE, vect_location,
3518 "updating run-time check to ");
3519 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3520 dump_printf (MSG_NOTE, "\n");
3523 return;
3526 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3527 if (dump_enabled_p ())
3529 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3530 dump_lower_bound (MSG_NOTE, lower_bound);
3531 dump_printf (MSG_NOTE, "\n");
3533 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3536 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3537 will span fewer than GAP bytes. */
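/* A worked sketch with illustrative numbers: with an estimated VF of 4, a
   group size of 2 and 4-byte scalar accesses, one vector iteration is
   expected to advance the access by about 4 * 2 * 4 == 32 bytes, so a GAP
   of 16 bytes counts as small and a runtime lower bound on the step is
   likely to be a good way of ruling out aliasing.  */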
3539 static bool
3540 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3541 poly_int64 gap)
3543 stmt_vec_info stmt_info = dr_info->stmt;
3544 HOST_WIDE_INT count
3545 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3546 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3547 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3548 return (estimated_poly_value (gap)
3549 <= count * vect_get_scalar_dr_size (dr_info));
3552 /* Return true if we know that there is no alias between DR_INFO_A and
3553 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3554 When returning true, set *LOWER_BOUND_OUT to this N. */
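/* A worked sketch: for two 4-byte accesses to the same base and offset
   with DR_INITs 0 and 4, the pair spans bytes [0, 8), so *LOWER_BOUND_OUT
   is set to 4 + 4 - 0 == 8 and the accesses cannot alias across
   iterations whenever abs (DR_STEP) >= 8.  */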
3556 static bool
3557 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3558 poly_uint64 *lower_bound_out)
3560 /* Check that there is a constant gap of known sign between DR_A
3561 and DR_B. */
3562 data_reference *dr_a = dr_info_a->dr;
3563 data_reference *dr_b = dr_info_b->dr;
3564 poly_int64 init_a, init_b;
3565 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3566 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3567 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3568 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3569 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3570 || !ordered_p (init_a, init_b))
3571 return false;
3573 /* Sort DR_A and DR_B by the address they access. */
3574 if (maybe_lt (init_b, init_a))
3576 std::swap (init_a, init_b);
3577 std::swap (dr_info_a, dr_info_b);
3578 std::swap (dr_a, dr_b);
3581 /* If the two accesses could be dependent within a scalar iteration,
3582 make sure that we'd retain their order. */
3583 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3584 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3585 return false;
3587 /* There is no alias if abs (DR_STEP) is greater than or equal to
3588 the bytes spanned by the combination of the two accesses. */
3589 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3590 return true;
3593 /* Function vect_prune_runtime_alias_test_list.
3595 Prune a list of ddrs to be tested at run-time by versioning for alias.
3596 Merge several alias checks into one if possible.
3597    Return FALSE if the resulting list of ddrs is longer than allowed by
3598 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
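/* An illustrative sketch of what the surviving checks guard (pointer names
   are hypothetical): for

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   with possibly overlapping A and B, loop versioning emits roughly

     if (a + segment_a <= b || b + segment_b <= a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   If the number of such checks exceeds the limit derived from
   --param vect-max-version-for-alias-checks, vectorization is abandoned
   instead.  */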
3600 opt_result
3601 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3603 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3604 hash_set <tree_pair_hash> compared_objects;
3606 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3607 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3608 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3609 const vec<vec_object_pair> &check_unequal_addrs
3610 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3611 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3612 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3614 ddr_p ddr;
3615 unsigned int i;
3616 tree length_factor;
3618 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3620 /* Step values are irrelevant for aliasing if the number of vector
3621 iterations is equal to the number of scalar iterations (which can
3622 happen for fully-SLP loops). */
3623 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3625 if (!vf_one_p)
3627 /* Convert the checks for nonzero steps into bound tests. */
3628 tree value;
3629 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3630 vect_check_lower_bound (loop_vinfo, value, true, 1);
3633 if (may_alias_ddrs.is_empty ())
3634 return opt_result::success ();
3636 comp_alias_ddrs.create (may_alias_ddrs.length ());
3638 unsigned int loop_depth
3639 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3640 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3642 /* First, we collect all data ref pairs for aliasing checks. */
3643 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3645 poly_uint64 lower_bound;
3646 tree segment_length_a, segment_length_b;
3647 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3648 unsigned int align_a, align_b;
3650 /* Ignore the alias if the VF we chose ended up being no greater
3651 than the dependence distance. */
3652 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3653 continue;
3655 if (DDR_OBJECT_A (ddr))
3657 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3658 if (!compared_objects.add (new_pair))
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "checking that %T and %T"
3663 " have different addresses\n",
3664 new_pair.first, new_pair.second);
3665 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3667 continue;
3670 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3671 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3673 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3674 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3676 bool preserves_scalar_order_p
3677 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3678 bool ignore_step_p
3679 = (vf_one_p
3680 && (preserves_scalar_order_p
3681 || operand_equal_p (DR_STEP (dr_info_a->dr),
3682 DR_STEP (dr_info_b->dr))));
3684 /* Skip the pair if inter-iteration dependencies are irrelevant
3685 and intra-iteration dependencies are guaranteed to be honored. */
3686 if (ignore_step_p
3687 && (preserves_scalar_order_p
3688 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3689 &lower_bound)))
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_NOTE, vect_location,
3693 "no need for alias check between "
3694 "%T and %T when VF is 1\n",
3695 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3696 continue;
3699 /* See whether we can handle the alias using a bounds check on
3700 the step, and whether that's likely to be the best approach.
3701 (It might not be, for example, if the minimum step is much larger
3702 than the number of bytes handled by one vector iteration.) */
3703 if (!ignore_step_p
3704 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3705 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3706 &lower_bound)
3707 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3708 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3710 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3711 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3714 "%T and %T when the step %T is outside ",
3715 DR_REF (dr_info_a->dr),
3716 DR_REF (dr_info_b->dr),
3717 DR_STEP (dr_info_a->dr));
3718 if (unsigned_p)
3719 dump_printf (MSG_NOTE, "[0");
3720 else
3722 dump_printf (MSG_NOTE, "(");
3723 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3725 dump_printf (MSG_NOTE, ", ");
3726 dump_dec (MSG_NOTE, lower_bound);
3727 dump_printf (MSG_NOTE, ")\n");
3729 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3730 unsigned_p, lower_bound);
3731 continue;
3734 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3735 if (dr_group_first_a)
3737 stmt_info_a = dr_group_first_a;
3738 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3742 if (dr_group_first_b)
3744 stmt_info_b = dr_group_first_b;
3745 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748 if (ignore_step_p)
3750 segment_length_a = size_zero_node;
3751 segment_length_b = size_zero_node;
3753 else
3755 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3756 DR_STEP (dr_info_b->dr), 0))
3757 length_factor = scalar_loop_iters;
3758 else
3759 length_factor = size_int (vect_factor);
3760 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3761 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3763 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3764 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3765 align_a = vect_vfa_align (dr_info_a);
3766 align_b = vect_vfa_align (dr_info_b);
3768 /* See whether the alias is known at compilation time. */
3769 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3770 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3771 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3772 DR_OFFSET (dr_info_b->dr), 0)
3773 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3774 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3775 && poly_int_tree_p (segment_length_a)
3776 && poly_int_tree_p (segment_length_b))
3778 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3779 segment_length_a,
3780 segment_length_b,
3781 access_size_a,
3782 access_size_b);
3783 if (res >= 0 && dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "can tell at compile time that %T and %T",
3787 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3788 if (res == 0)
3789 dump_printf (MSG_NOTE, " do not alias\n");
3790 else
3791 dump_printf (MSG_NOTE, " alias\n");
3794 if (res == 0)
3795 continue;
3797 if (res == 1)
3798 return opt_result::failure_at (stmt_info_b->stmt,
3799 "not vectorized:"
3800 " compilation time alias: %G%G",
3801 stmt_info_a->stmt,
3802 stmt_info_b->stmt);
3805 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3806 access_size_a, align_a);
3807 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3808 access_size_b, align_b);
3809 /* Canonicalize the order to be the one that's needed for accurate
3810 RAW, WAR and WAW flags, in cases where the data references are
3811 well-ordered. The order doesn't really matter otherwise,
3812 but we might as well be consistent. */
3813 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3814 std::swap (dr_a, dr_b);
3816 dr_with_seg_len_pair_t dr_with_seg_len_pair
3817 (dr_a, dr_b, (preserves_scalar_order_p
3818 ? dr_with_seg_len_pair_t::WELL_ORDERED
3819 : dr_with_seg_len_pair_t::REORDERED));
3821 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3826 unsigned int count = (comp_alias_ddrs.length ()
3827 + check_unequal_addrs.length ());
3829 if (count
3830 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3831 == VECT_COST_MODEL_VERY_CHEAP))
3832 return opt_result::failure_at
3833 (vect_location, "would need a runtime alias check\n");
3835 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_NOTE, vect_location,
3837 "improved number of alias checks from %d to %d\n",
3838 may_alias_ddrs.length (), count);
3839 unsigned limit = param_vect_max_version_for_alias_checks;
3840 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3841 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3842 if (count > limit)
3843 return opt_result::failure_at
3844 (vect_location,
3845 "number of versioning for alias run-time tests exceeds %d "
3846 "(--param vect-max-version-for-alias-checks)\n", limit);
3848 return opt_result::success ();
3851 /* Check whether we can use an internal function for a gather load
3852 or scatter store. READ_P is true for loads and false for stores.
3853 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3854 the type of the memory elements being loaded or stored. OFFSET_TYPE
3855 is the type of the offset that is being applied to the invariant
3856 base address. SCALE is the amount by which the offset should
3857 be multiplied *after* it has been converted to address width.
3859 Return true if the function is supported, storing the function id in
3860 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
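/* A sketch of the retry behaviour below (target support is hypothetical):
   asked for a gather of 32-bit elements with a 16-bit OFFSET_TYPE on a
   target that only supports 32-bit offset vectors, the first
   internal_gather_scatter_fn_supported_p query fails, the offset type is
   widened to 32 bits and the second query succeeds, so *OFFSET_VECTYPE_OUT
   becomes a vector of 32-bit offsets.  Widening stops once the offset is
   at least as wide as both a pointer and the vector elements.  */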
3862 bool
3863 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3864 tree vectype, tree memory_type, tree offset_type,
3865 int scale, internal_fn *ifn_out,
3866 tree *offset_vectype_out)
3868 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3869 unsigned int element_bits = vector_element_bits (vectype);
3870 if (element_bits != memory_bits)
3871 /* For now the vector elements must be the same width as the
3872 memory elements. */
3873 return false;
3875 /* Work out which function we need. */
3876 internal_fn ifn, alt_ifn;
3877 if (read_p)
3879 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3880 alt_ifn = IFN_MASK_GATHER_LOAD;
3882 else
3884 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3885 alt_ifn = IFN_MASK_SCATTER_STORE;
3888 for (;;)
3890 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3891 if (!offset_vectype)
3892 return false;
3894 /* Test whether the target supports this combination. */
3895 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3896 offset_vectype, scale))
3898 *ifn_out = ifn;
3899 *offset_vectype_out = offset_vectype;
3900 return true;
3902 else if (!masked_p
3903 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3904 memory_type,
3905 offset_vectype,
3906 scale))
3908 *ifn_out = alt_ifn;
3909 *offset_vectype_out = offset_vectype;
3910 return true;
3913 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3914 && TYPE_PRECISION (offset_type) >= element_bits)
3915 return false;
3917 offset_type = build_nonstandard_integer_type
3918 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3922 /* STMT_INFO is a call to an internal gather load or scatter store function.
3923 Describe the operation in INFO. */
3925 static void
3926 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3927 gather_scatter_info *info)
3929 gcall *call = as_a <gcall *> (stmt_info->stmt);
3930 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3931 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3933 info->ifn = gimple_call_internal_fn (call);
3934 info->decl = NULL_TREE;
3935 info->base = gimple_call_arg (call, 0);
3936 info->offset = gimple_call_arg (call, 1);
3937 info->offset_dt = vect_unknown_def_type;
3938 info->offset_vectype = NULL_TREE;
3939 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3940 info->element_type = TREE_TYPE (vectype);
3941 info->memory_type = TREE_TYPE (DR_REF (dr));
3944 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3945 gather load or scatter store. Describe the operation in *INFO if so. */
3947 bool
3948 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3949 gather_scatter_info *info)
3951 HOST_WIDE_INT scale = 1;
3952 poly_int64 pbitpos, pbitsize;
3953 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3954 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3955 tree offtype = NULL_TREE;
3956 tree decl = NULL_TREE, base, off;
3957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3958 tree memory_type = TREE_TYPE (DR_REF (dr));
3959 machine_mode pmode;
3960 int punsignedp, reversep, pvolatilep = 0;
3961 internal_fn ifn;
3962 tree offset_vectype;
3963 bool masked_p = false;
3965 /* See whether this is already a call to a gather/scatter internal function.
3966 If not, see whether it's a masked load or store. */
3967 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3968 if (call && gimple_call_internal_p (call))
3970 ifn = gimple_call_internal_fn (call);
3971 if (internal_gather_scatter_fn_p (ifn))
3973 vect_describe_gather_scatter_call (stmt_info, info);
3974 return true;
3976 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3979 /* True if we should aim to use internal functions rather than
3980 built-in functions. */
3981 bool use_ifn_p = (DR_IS_READ (dr)
3982 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3983 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3985 base = DR_REF (dr);
3986 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3987 see if we can use the def stmt of the address. */
3988 if (masked_p
3989 && TREE_CODE (base) == MEM_REF
3990 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3991 && integer_zerop (TREE_OPERAND (base, 1))
3992 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3994 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3995 if (is_gimple_assign (def_stmt)
3996 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3997 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4000 /* The gather and scatter builtins need address of the form
4001 loop_invariant + vector * {1, 2, 4, 8}
4003 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4004 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4005 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4006 multiplications and additions in it. To get a vector, we need
4007 a single SSA_NAME that will be defined in the loop and will
4008 contain everything that is not loop invariant and that can be
4009    vectorized. The following code attempts to find such a preexisting
4010 SSA_NAME OFF and put the loop invariants into a tree BASE
4011 that can be gimplified before the loop. */
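/* A minimal sketch of the kind of access being decomposed (a[] and idx[]
   are purely illustrative):

     for (i = 0; i < n; i++)
       sum += a[idx[i]];

   Here DR_REF is a[idx[i]]; the code below peels the loop-invariant
   address of a into BASE and leaves an SSA_NAME computing idx[i]
   (suitably extended) as OFF, with SCALE ending up as sizeof (a[0]).  */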
4012 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4013 &punsignedp, &reversep, &pvolatilep);
4014 if (reversep)
4015 return false;
4017 /* PR 107346. Packed structs can have fields at offsets that are not
4018 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4019 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4020 return false;
4022 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4024 if (TREE_CODE (base) == MEM_REF)
4026 if (!integer_zerop (TREE_OPERAND (base, 1)))
4028 if (off == NULL_TREE)
4029 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4030 else
4031 off = size_binop (PLUS_EXPR, off,
4032 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4034 base = TREE_OPERAND (base, 0);
4036 else
4037 base = build_fold_addr_expr (base);
4039 if (off == NULL_TREE)
4040 off = size_zero_node;
4042    /* If base is not loop invariant, then either off is 0 and we start with
4043       just the constant offset in the loop-invariant BASE, continuing with
4044       base as OFF; otherwise give up.
4045 We could handle that case by gimplifying the addition of base + off
4046 into some SSA_NAME and use that as off, but for now punt. */
4047 if (!expr_invariant_in_loop_p (loop, base))
4049 if (!integer_zerop (off))
4050 return false;
4051 off = base;
4052 base = size_int (pbytepos);
4054 /* Otherwise put base + constant offset into the loop invariant BASE
4055 and continue with OFF. */
4056 else
4058 base = fold_convert (sizetype, base);
4059 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4062 /* OFF at this point may be either a SSA_NAME or some tree expression
4063 from get_inner_reference. Try to peel off loop invariants from it
4064 into BASE as long as possible. */
4065 STRIP_NOPS (off);
4066 while (offtype == NULL_TREE)
4068 enum tree_code code;
4069 tree op0, op1, add = NULL_TREE;
4071 if (TREE_CODE (off) == SSA_NAME)
4073 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4075 if (expr_invariant_in_loop_p (loop, off))
4076 return false;
4078 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4079 break;
4081 op0 = gimple_assign_rhs1 (def_stmt);
4082 code = gimple_assign_rhs_code (def_stmt);
4083 op1 = gimple_assign_rhs2 (def_stmt);
4085 else
4087 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4088 return false;
4089 code = TREE_CODE (off);
4090 extract_ops_from_tree (off, &code, &op0, &op1);
4092 switch (code)
4094 case POINTER_PLUS_EXPR:
4095 case PLUS_EXPR:
4096 if (expr_invariant_in_loop_p (loop, op0))
4098 add = op0;
4099 off = op1;
4100 do_add:
4101 add = fold_convert (sizetype, add);
4102 if (scale != 1)
4103 add = size_binop (MULT_EXPR, add, size_int (scale));
4104 base = size_binop (PLUS_EXPR, base, add);
4105 continue;
4107 if (expr_invariant_in_loop_p (loop, op1))
4109 add = op1;
4110 off = op0;
4111 goto do_add;
4113 break;
4114 case MINUS_EXPR:
4115 if (expr_invariant_in_loop_p (loop, op1))
4117 add = fold_convert (sizetype, op1);
4118 add = size_binop (MINUS_EXPR, size_zero_node, add);
4119 off = op0;
4120 goto do_add;
4122 break;
4123 case MULT_EXPR:
4124 if (scale == 1 && tree_fits_shwi_p (op1))
4126 int new_scale = tree_to_shwi (op1);
4127 /* Only treat this as a scaling operation if the target
4128 supports it for at least some offset type. */
4129 if (use_ifn_p
4130 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4131 masked_p, vectype, memory_type,
4132 signed_char_type_node,
4133 new_scale, &ifn,
4134 &offset_vectype)
4135 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4136 masked_p, vectype, memory_type,
4137 unsigned_char_type_node,
4138 new_scale, &ifn,
4139 &offset_vectype))
4140 break;
4141 scale = new_scale;
4142 off = op0;
4143 continue;
4145 break;
4146 case SSA_NAME:
4147 off = op0;
4148 continue;
4149 CASE_CONVERT:
4150 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4151 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4152 break;
4154 /* Don't include the conversion if the target is happy with
4155 the current offset type. */
4156 if (use_ifn_p
4157 && TREE_CODE (off) == SSA_NAME
4158 && !POINTER_TYPE_P (TREE_TYPE (off))
4159 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4160 masked_p, vectype, memory_type,
4161 TREE_TYPE (off), scale, &ifn,
4162 &offset_vectype))
4163 break;
4165 if (TYPE_PRECISION (TREE_TYPE (op0))
4166 == TYPE_PRECISION (TREE_TYPE (off)))
4168 off = op0;
4169 continue;
4172    /* Include the conversion if it is widening and either we're using
4173       the IFN path, or the target can handle the converted-from
4174       offset, or the current offset size is not already the same as
4175       the data vector element size. */
4176 if ((TYPE_PRECISION (TREE_TYPE (op0))
4177 < TYPE_PRECISION (TREE_TYPE (off)))
4178 && (use_ifn_p
4179 || (DR_IS_READ (dr)
4180 ? (targetm.vectorize.builtin_gather
4181 && targetm.vectorize.builtin_gather (vectype,
4182 TREE_TYPE (op0),
4183 scale))
4184 : (targetm.vectorize.builtin_scatter
4185 && targetm.vectorize.builtin_scatter (vectype,
4186 TREE_TYPE (op0),
4187 scale)))
4188 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4189 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4191 off = op0;
4192 offtype = TREE_TYPE (off);
4193 STRIP_NOPS (off);
4194 continue;
4196 break;
4197 default:
4198 break;
4200 break;
4203 /* If at the end OFF still isn't a SSA_NAME or isn't
4204 defined in the loop, punt. */
4205 if (TREE_CODE (off) != SSA_NAME
4206 || expr_invariant_in_loop_p (loop, off))
4207 return false;
4209 if (offtype == NULL_TREE)
4210 offtype = TREE_TYPE (off);
4212 if (use_ifn_p)
4214 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4215 vectype, memory_type, offtype, scale,
4216 &ifn, &offset_vectype))
4217 ifn = IFN_LAST;
4218 decl = NULL_TREE;
4220 else
4222 if (DR_IS_READ (dr))
4224 if (targetm.vectorize.builtin_gather)
4225 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4227 else
4229 if (targetm.vectorize.builtin_scatter)
4230 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4232 ifn = IFN_LAST;
4233 /* The offset vector type will be read from DECL when needed. */
4234 offset_vectype = NULL_TREE;
4237 info->ifn = ifn;
4238 info->decl = decl;
4239 info->base = base;
4240 info->offset = off;
4241 info->offset_dt = vect_unknown_def_type;
4242 info->offset_vectype = offset_vectype;
4243 info->scale = scale;
4244 info->element_type = TREE_TYPE (vectype);
4245 info->memory_type = memory_type;
4246 return true;
4249 /* Find the data references in STMT, analyze them with respect to LOOP and
4250 append them to DATAREFS. Return false if datarefs in this stmt cannot
4251 be handled. */
4253 opt_result
4254 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4255 vec<data_reference_p> *datarefs,
4256 vec<int> *dataref_groups, int group_id)
4258 /* We can ignore clobbers for dataref analysis - they are removed during
4259 loop vectorization and BB vectorization checks dependences with a
4260 stmt walk. */
4261 if (gimple_clobber_p (stmt))
4262 return opt_result::success ();
4264 if (gimple_has_volatile_ops (stmt))
4265 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4266 stmt);
4268 if (stmt_can_throw_internal (cfun, stmt))
4269 return opt_result::failure_at (stmt,
4270 "not vectorized:"
4271 " statement can throw an exception: %G",
4272 stmt);
4274 auto_vec<data_reference_p, 2> refs;
4275 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4276 if (!res)
4277 return res;
4279 if (refs.is_empty ())
4280 return opt_result::success ();
4282 if (refs.length () > 1)
4284 while (!refs.is_empty ())
4285 free_data_ref (refs.pop ());
4286 return opt_result::failure_at (stmt,
4287 "not vectorized: more than one "
4288 "data ref in stmt: %G", stmt);
4291 data_reference_p dr = refs.pop ();
4292 if (gcall *call = dyn_cast <gcall *> (stmt))
4293 if (!gimple_call_internal_p (call)
4294 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4295 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4297 free_data_ref (dr);
4298 return opt_result::failure_at (stmt,
4299 "not vectorized: dr in a call %G", stmt);
4302 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4303 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4305 free_data_ref (dr);
4306 return opt_result::failure_at (stmt,
4307 "not vectorized:"
4308 " statement is an unsupported"
4309 " bitfield access %G", stmt);
4312 if (DR_BASE_ADDRESS (dr)
4313 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4315 free_data_ref (dr);
4316 return opt_result::failure_at (stmt,
4317 "not vectorized:"
4318 " base addr of dr is a constant\n");
4321 /* Check whether this may be a SIMD lane access and adjust the
4322 DR to make it easier for us to handle it. */
4323 if (loop
4324 && loop->simduid
4325 && (!DR_BASE_ADDRESS (dr)
4326 || !DR_OFFSET (dr)
4327 || !DR_INIT (dr)
4328 || !DR_STEP (dr)))
4330 struct data_reference *newdr
4331 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4332 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4333 if (DR_BASE_ADDRESS (newdr)
4334 && DR_OFFSET (newdr)
4335 && DR_INIT (newdr)
4336 && DR_STEP (newdr)
4337 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4338 && integer_zerop (DR_STEP (newdr)))
4340 tree base_address = DR_BASE_ADDRESS (newdr);
4341 tree off = DR_OFFSET (newdr);
4342 tree step = ssize_int (1);
4343 if (integer_zerop (off)
4344 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4346 off = TREE_OPERAND (base_address, 1);
4347 base_address = TREE_OPERAND (base_address, 0);
4349 STRIP_NOPS (off);
4350 if (TREE_CODE (off) == MULT_EXPR
4351 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4353 step = TREE_OPERAND (off, 1);
4354 off = TREE_OPERAND (off, 0);
4355 STRIP_NOPS (off);
4357 if (CONVERT_EXPR_P (off)
4358 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4359 < TYPE_PRECISION (TREE_TYPE (off))))
4360 off = TREE_OPERAND (off, 0);
4361 if (TREE_CODE (off) == SSA_NAME)
4363 gimple *def = SSA_NAME_DEF_STMT (off);
4364 /* Look through widening conversion. */
4365 if (is_gimple_assign (def)
4366 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4368 tree rhs1 = gimple_assign_rhs1 (def);
4369 if (TREE_CODE (rhs1) == SSA_NAME
4370 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4371 && (TYPE_PRECISION (TREE_TYPE (off))
4372 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4373 def = SSA_NAME_DEF_STMT (rhs1);
4375 if (is_gimple_call (def)
4376 && gimple_call_internal_p (def)
4377 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4379 tree arg = gimple_call_arg (def, 0);
4380 tree reft = TREE_TYPE (DR_REF (newdr));
4381 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4382 arg = SSA_NAME_VAR (arg);
4383 if (arg == loop->simduid
4384 /* For now. */
4385 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4387 DR_BASE_ADDRESS (newdr) = base_address;
4388 DR_OFFSET (newdr) = ssize_int (0);
4389 DR_STEP (newdr) = step;
4390 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4391 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4392 /* Mark as simd-lane access. */
4393 tree arg2 = gimple_call_arg (def, 1);
4394 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4395 free_data_ref (dr);
4396 datarefs->safe_push (newdr);
4397 if (dataref_groups)
4398 dataref_groups->safe_push (group_id);
4399 return opt_result::success ();
4404 free_data_ref (newdr);
4407 datarefs->safe_push (dr);
4408 if (dataref_groups)
4409 dataref_groups->safe_push (group_id);
4410 return opt_result::success ();
4413 /* Function vect_analyze_data_refs.
4415 Find all the data references in the loop or basic block.
4417 The general structure of the analysis of data refs in the vectorizer is as
4418 follows:
4419 1- vect_analyze_data_refs(loop/bb): call
4420 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4421 in the loop/bb and their dependences.
4422 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4423 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4424 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4428 opt_result
4429 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4431 class loop *loop = NULL;
4432 unsigned int i;
4433 struct data_reference *dr;
4434 tree scalar_type;
4436 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4438 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4439 loop = LOOP_VINFO_LOOP (loop_vinfo);
4441 /* Go through the data-refs, check that the analysis succeeded. Update
4442 pointer from stmt_vec_info struct to DR and vectype. */
4444 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4445 FOR_EACH_VEC_ELT (datarefs, i, dr)
4447 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4448 poly_uint64 vf;
4450 gcc_assert (DR_REF (dr));
4451 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4452 gcc_assert (!stmt_info->dr_aux.dr);
4453 stmt_info->dr_aux.dr = dr;
4454 stmt_info->dr_aux.stmt = stmt_info;
4456 /* Check that analysis of the data-ref succeeded. */
4457 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4458 || !DR_STEP (dr))
4460 bool maybe_gather
4461 = DR_IS_READ (dr)
4462 && !TREE_THIS_VOLATILE (DR_REF (dr));
4463 bool maybe_scatter
4464 = DR_IS_WRITE (dr)
4465 && !TREE_THIS_VOLATILE (DR_REF (dr));
4467 /* If target supports vector gather loads or scatter stores,
4468    see if they can be used. */
4469 if (is_a <loop_vec_info> (vinfo)
4470 && !nested_in_vect_loop_p (loop, stmt_info))
4472 if (maybe_gather || maybe_scatter)
4474 if (maybe_gather)
4475 gatherscatter = GATHER;
4476 else
4477 gatherscatter = SCATTER;
4481 if (gatherscatter == SG_NONE)
4483 if (dump_enabled_p ())
4484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4485 "not vectorized: data ref analysis "
4486 "failed %G", stmt_info->stmt);
4487 if (is_a <bb_vec_info> (vinfo))
4489 /* In BB vectorization the ref can still participate
4490 in dependence analysis, we just can't vectorize it. */
4491 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4492 continue;
4494 return opt_result::failure_at (stmt_info->stmt,
4495 "not vectorized:"
4496 " data ref analysis failed: %G",
4497 stmt_info->stmt);
4501 /* See if this was detected as SIMD lane access. */
4502 if (dr->aux == (void *)-1
4503 || dr->aux == (void *)-2
4504 || dr->aux == (void *)-3
4505 || dr->aux == (void *)-4)
4507 if (nested_in_vect_loop_p (loop, stmt_info))
4508 return opt_result::failure_at (stmt_info->stmt,
4509 "not vectorized:"
4510 " data ref analysis failed: %G",
4511 stmt_info->stmt);
4512 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4513 = -(uintptr_t) dr->aux;
4516 tree base = get_base_address (DR_REF (dr));
4517 if (base && VAR_P (base) && DECL_NONALIASED (base))
4519 if (dump_enabled_p ())
4520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4521 "not vectorized: base object not addressable "
4522 "for stmt: %G", stmt_info->stmt);
4523 if (is_a <bb_vec_info> (vinfo))
4525 /* In BB vectorization the ref can still participate
4526 in dependence analysis, we just can't vectorize it. */
4527 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4528 continue;
4530 return opt_result::failure_at (stmt_info->stmt,
4531 "not vectorized: base object not"
4532 " addressable for stmt: %G",
4533 stmt_info->stmt);
4536 if (is_a <loop_vec_info> (vinfo)
4537 && DR_STEP (dr)
4538 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4540 if (nested_in_vect_loop_p (loop, stmt_info))
4541 return opt_result::failure_at (stmt_info->stmt,
4542 "not vectorized: "
4543 "not suitable for strided load %G",
4544 stmt_info->stmt);
4545 STMT_VINFO_STRIDED_P (stmt_info) = true;
4548 /* Update DR field in stmt_vec_info struct. */
4550 /* If the dataref is in an inner-loop of the loop that is considered for
4551    vectorization, we also want to analyze the access relative to
4552 the outer-loop (DR contains information only relative to the
4553 inner-most enclosing loop). We do that by building a reference to the
4554 first location accessed by the inner-loop, and analyze it relative to
4555 the outer-loop. */
4556 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4558 /* Build a reference to the first location accessed by the
4559 inner loop: *(BASE + INIT + OFFSET). By construction,
4560 this address must be invariant in the inner loop, so we
4561 can consider it as being used in the outer loop. */
4562 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4563 tree offset = unshare_expr (DR_OFFSET (dr));
4564 tree init = unshare_expr (DR_INIT (dr));
4565 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4566 init, offset);
4567 tree init_addr = fold_build_pointer_plus (base, init_offset);
4568 tree init_ref = build_fold_indirect_ref (init_addr);
4570 if (dump_enabled_p ())
4571 dump_printf_loc (MSG_NOTE, vect_location,
4572 "analyze in outer loop: %T\n", init_ref);
4574 opt_result res
4575 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4576 init_ref, loop, stmt_info->stmt);
4577 if (!res)
4578 /* dr_analyze_innermost already explained the failure. */
4579 return res;
4581 if (dump_enabled_p ())
4582 dump_printf_loc (MSG_NOTE, vect_location,
4583 "\touter base_address: %T\n"
4584 "\touter offset from base address: %T\n"
4585 "\touter constant offset from base address: %T\n"
4586 "\touter step: %T\n"
4587 "\touter base alignment: %d\n\n"
4588 "\touter base misalignment: %d\n"
4589 "\touter offset alignment: %d\n"
4590 "\touter step alignment: %d\n",
4591 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4592 STMT_VINFO_DR_OFFSET (stmt_info),
4593 STMT_VINFO_DR_INIT (stmt_info),
4594 STMT_VINFO_DR_STEP (stmt_info),
4595 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4596 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4597 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4598 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4601 /* Set vectype for STMT. */
4602 scalar_type = TREE_TYPE (DR_REF (dr));
4603 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4604 if (!vectype)
4606 if (dump_enabled_p ())
4608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4609 "not vectorized: no vectype for stmt: %G",
4610 stmt_info->stmt);
4611 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4612 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4613 scalar_type);
4614 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4617 if (is_a <bb_vec_info> (vinfo))
4619 /* No vector type is fine, the ref can still participate
4620 in dependence analysis, we just can't vectorize it. */
4621 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4622 continue;
4624 if (fatal)
4625 *fatal = false;
4626 return opt_result::failure_at (stmt_info->stmt,
4627 "not vectorized:"
4628 " no vectype for stmt: %G"
4629 " scalar_type: %T\n",
4630 stmt_info->stmt, scalar_type);
4632 else
4634 if (dump_enabled_p ())
4635 dump_printf_loc (MSG_NOTE, vect_location,
4636 "got vectype for stmt: %G%T\n",
4637 stmt_info->stmt, vectype);
4640 /* Adjust the minimal vectorization factor according to the
4641 vector type. */
4642 vf = TYPE_VECTOR_SUBPARTS (vectype);
4643 *min_vf = upper_bound (*min_vf, vf);
4645 /* Leave the BB vectorizer to pick the vector type later, based on
4646 the final dataref group size and SLP node size. */
4647 if (is_a <loop_vec_info> (vinfo))
4648 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4650 if (gatherscatter != SG_NONE)
4652 gather_scatter_info gs_info;
4653 if (!vect_check_gather_scatter (stmt_info,
4654 as_a <loop_vec_info> (vinfo),
4655 &gs_info)
4656 || !get_vectype_for_scalar_type (vinfo,
4657 TREE_TYPE (gs_info.offset)))
4659 if (fatal)
4660 *fatal = false;
4661 return opt_result::failure_at
4662 (stmt_info->stmt,
4663 (gatherscatter == GATHER)
4664 ? "not vectorized: not suitable for gather load %G"
4665 : "not vectorized: not suitable for scatter store %G",
4666 stmt_info->stmt);
4668 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4672 /* We used to stop processing and prune the list here. Verify we no
4673 longer need to. */
4674 gcc_assert (i == datarefs.length ());
4676 return opt_result::success ();
4680 /* Function vect_get_new_vect_var.
4682 Returns a name for a new variable. The current naming scheme appends the
4683 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4684 the name of vectorizer generated variables, and appends that to NAME if
4685 provided. */
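/* For example (the name "x" is purely illustrative), a call such as

     vect_get_new_vect_var (ptr_type, vect_pointer_var, "x")

   creates a temporary named along the lines of "vectp_x", while passing a
   NULL name yields just the "vectp" prefix.  */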
4687 tree
4688 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4690 const char *prefix;
4691 tree new_vect_var;
4693 switch (var_kind)
4695 case vect_simple_var:
4696 prefix = "vect";
4697 break;
4698 case vect_scalar_var:
4699 prefix = "stmp";
4700 break;
4701 case vect_mask_var:
4702 prefix = "mask";
4703 break;
4704 case vect_pointer_var:
4705 prefix = "vectp";
4706 break;
4707 default:
4708 gcc_unreachable ();
4711 if (name)
4713 char* tmp = concat (prefix, "_", name, NULL);
4714 new_vect_var = create_tmp_reg (type, tmp);
4715 free (tmp);
4717 else
4718 new_vect_var = create_tmp_reg (type, prefix);
4720 return new_vect_var;
4723 /* Like vect_get_new_vect_var but return an SSA name. */
4725 tree
4726 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4728 const char *prefix;
4729 tree new_vect_var;
4731 switch (var_kind)
4733 case vect_simple_var:
4734 prefix = "vect";
4735 break;
4736 case vect_scalar_var:
4737 prefix = "stmp";
4738 break;
4739 case vect_pointer_var:
4740 prefix = "vectp";
4741 break;
4742 default:
4743 gcc_unreachable ();
4746 if (name)
4748 char* tmp = concat (prefix, "_", name, NULL);
4749 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4750 free (tmp);
4752 else
4753 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4755 return new_vect_var;
4758 /* Duplicate points-to info on NAME from DR_INFO. */
4760 static void
4761 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4763 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4764 /* DR_PTR_INFO is for a base SSA name, not including constant or
4765 variable offsets in the ref so its alignment info does not apply. */
4766 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4769 /* Function vect_create_addr_base_for_vector_ref.
4771 Create an expression that computes the address of the first memory location
4772 that will be accessed for a data reference.
4774 Input:
4775 STMT_INFO: The statement containing the data reference.
4776 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4777    OFFSET: Optional. If supplied, it is added to the initial address.
4778 LOOP: Specify relative to which loop-nest should the address be computed.
4779 For example, when the dataref is in an inner-loop nested in an
4780 outer-loop that is now being vectorized, LOOP can be either the
4781 outer-loop, or the inner-loop. The first memory location accessed
4782 by the following dataref ('in' points to short):
4784 for (i=0; i<N; i++)
4785 for (j=0; j<M; j++)
4786 s += in[i+j]
4788 is as follows:
4789 if LOOP=i_loop: &in (relative to i_loop)
4790 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4792 Output:
4793 1. Return an SSA_NAME whose value is the address of the memory location of
4794 the first vector of the data reference.
4795 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4796 these statement(s) which define the returned SSA_NAME.
4798 FORNOW: We are only handling array accesses with step 1. */
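/* A sketch of the statements this may emit for the example above with
   LOOP=j_loop (temporary names are illustrative and the exact sequence is
   up to force_gimple_operand):

     base_off_1 = (sizetype) i * 2 + 0;   <- offset from base plus init
     vectp_in_2 = &in + base_off_1;       <- returned SSA_NAME

   The caller is responsible for inserting NEW_STMT_LIST, typically on the
   loop preheader edge.  */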
4800 tree
4801 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4802 gimple_seq *new_stmt_list,
4803 tree offset)
4805 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4806 struct data_reference *dr = dr_info->dr;
4807 const char *base_name;
4808 tree addr_base;
4809 tree dest;
4810 gimple_seq seq = NULL;
4811 tree vect_ptr_type;
4812 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4813 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4815 tree data_ref_base = unshare_expr (drb->base_address);
4816 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4817 tree init = unshare_expr (drb->init);
4819 if (loop_vinfo)
4820 base_name = get_name (data_ref_base);
4821 else
4823 base_offset = ssize_int (0);
4824 init = ssize_int (0);
4825 base_name = get_name (DR_REF (dr));
4828 /* Create base_offset */
4829 base_offset = size_binop (PLUS_EXPR,
4830 fold_convert (sizetype, base_offset),
4831 fold_convert (sizetype, init));
4833 if (offset)
4835 offset = fold_convert (sizetype, offset);
4836 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4837 base_offset, offset);
4840 /* base + base_offset */
4841 if (loop_vinfo)
4842 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4843 else
4844 addr_base = build1 (ADDR_EXPR,
4845 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4846 /* Strip zero offset components since we don't need
4847 them and they can confuse late diagnostics if
4848 we CSE them wrongly. See PR106904 for example. */
4849 unshare_expr (strip_zero_offset_components
4850 (DR_REF (dr))));
4852 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4853 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4854 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4855 gimple_seq_add_seq (new_stmt_list, seq);
4857 if (DR_PTR_INFO (dr)
4858 && TREE_CODE (addr_base) == SSA_NAME
4859 /* We should only duplicate pointer info to newly created SSA names. */
4860 && SSA_NAME_VAR (addr_base) == dest)
4862 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4863 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4866 if (dump_enabled_p ())
4867 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4869 return addr_base;
4873 /* Function vect_create_data_ref_ptr.
4875 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4876 location accessed in the loop by STMT_INFO, along with the def-use update
4877 chain to appropriately advance the pointer through the loop iterations.
4878 Also set aliasing information for the pointer. This pointer is used by
4879 the callers to this function to create a memory reference expression for
4880 vector load/store access.
4882 Input:
4883 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4884 GIMPLE_ASSIGN <name, data-ref> or
4885 GIMPLE_ASSIGN <data-ref, name>.
4886 2. AGGR_TYPE: the type of the reference, which should be either a vector
4887 or an array.
4888 3. AT_LOOP: the loop where the vector memref is to be created.
4889 4. OFFSET (optional): a byte offset to be added to the initial address
4890 accessed by the data-ref in STMT_INFO.
4891 5. BSI: location where the new stmts are to be placed if there is no loop
4892 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4893 pointing to the initial address.
4894    7. IV_STEP (optional, defaults to NULL): the amount that should be added
4895 to the IV during each iteration of the loop. NULL says to move
4896 by one copy of AGGR_TYPE up or down, depending on the step of the
4897 data reference.
4899 Output:
4900 1. Declare a new ptr to vector_type, and have it point to the base of the
4901       data reference (initial address accessed by the data reference).
4902 For example, for vector of type V8HI, the following code is generated:
4904 v8hi *ap;
4905 ap = (v8hi *)initial_address;
4907 if OFFSET is not supplied:
4908 initial_address = &a[init];
4909 if OFFSET is supplied:
4910 initial_address = &a[init] + OFFSET;
4914 Return the initial_address in INITIAL_ADDRESS.
4916 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4917 update the pointer in each iteration of the loop.
4919 Return the increment stmt that updates the pointer in PTR_INCR.
4921 3. Return the pointer. */
4923 tree
4924 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4925 tree aggr_type, class loop *at_loop, tree offset,
4926 tree *initial_address, gimple_stmt_iterator *gsi,
4927 gimple **ptr_incr, bool only_init,
4928 tree iv_step)
4930 const char *base_name;
4931 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4932 class loop *loop = NULL;
4933 bool nested_in_vect_loop = false;
4934 class loop *containing_loop = NULL;
4935 tree aggr_ptr_type;
4936 tree aggr_ptr;
4937 tree new_temp;
4938 gimple_seq new_stmt_list = NULL;
4939 edge pe = NULL;
4940 basic_block new_bb;
4941 tree aggr_ptr_init;
4942 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4943 struct data_reference *dr = dr_info->dr;
4944 tree aptr;
4945 gimple_stmt_iterator incr_gsi;
4946 bool insert_after;
4947 tree indx_before_incr, indx_after_incr;
4948 gimple *incr;
4949 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4951 gcc_assert (iv_step != NULL_TREE
4952 || TREE_CODE (aggr_type) == ARRAY_TYPE
4953 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4955 if (loop_vinfo)
4957 loop = LOOP_VINFO_LOOP (loop_vinfo);
4958 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4959 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4960 pe = loop_preheader_edge (loop);
4962 else
4964 gcc_assert (bb_vinfo);
4965 only_init = true;
4966 *ptr_incr = NULL;
4969 /* Create an expression for the first address accessed by this load
4970 in LOOP. */
4971 base_name = get_name (DR_BASE_ADDRESS (dr));
4973 if (dump_enabled_p ())
4975 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4976 dump_printf_loc (MSG_NOTE, vect_location,
4977 "create %s-pointer variable to type: %T",
4978 get_tree_code_name (TREE_CODE (aggr_type)),
4979 aggr_type);
4980 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4981 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4982 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4983 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4984 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4985 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4986 else
4987 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4988 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4991 /* (1) Create the new aggregate-pointer variable.
4992 Vector and array types inherit the alias set of their component
4993 type by default so we need to use a ref-all pointer if the data
4994 reference does not conflict with the created aggregated data
4995 reference because it is not addressable. */
4996 bool need_ref_all = false;
4997 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4998 get_alias_set (DR_REF (dr))))
4999 need_ref_all = true;
5000 /* Likewise for any of the data references in the stmt group. */
5001 else if (DR_GROUP_SIZE (stmt_info) > 1)
5003 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5006 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5007 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5008 get_alias_set (DR_REF (sdr))))
5010 need_ref_all = true;
5011 break;
5013 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5015 while (sinfo);
5017 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5018 need_ref_all);
5019 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5022 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5023 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5024 def-use update cycles for the pointer: one relative to the outer-loop
5025 (LOOP), which is what steps (3) and (4) below do. The other is relative
5026 to the inner-loop (which is the inner-most loop containing the dataref),
5027    and this is done by step (5) below.
5029 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5030 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5031 redundant. Steps (3),(4) create the following:
5033 vp0 = &base_addr;
5034 LOOP: vp1 = phi(vp0,vp2)
5037 vp2 = vp1 + step
5038 goto LOOP
5040 If there is an inner-loop nested in loop, then step (5) will also be
5041 applied, and an additional update in the inner-loop will be created:
5043 vp0 = &base_addr;
5044 LOOP: vp1 = phi(vp0,vp2)
5046 inner: vp3 = phi(vp1,vp4)
5047 vp4 = vp3 + inner_step
5048 if () goto inner
5050 vp2 = vp1 + step
5051 if () goto LOOP */
5053 /* (2) Calculate the initial address of the aggregate-pointer, and set
5054 the aggregate-pointer to point to it before the loop. */
5056    /* Create: &(base[init_val] + offset) in the loop preheader. */
5058 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5059 stmt_info, &new_stmt_list,
5060 offset);
5061 if (new_stmt_list)
5063 if (pe)
5065 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5066 gcc_assert (!new_bb);
5068 else
5069 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5072 *initial_address = new_temp;
5073 aggr_ptr_init = new_temp;
5075 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5076 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5077 inner-loop nested in LOOP (during outer-loop vectorization). */
5079 /* No update in loop is required. */
5080 if (only_init && (!loop_vinfo || at_loop == loop))
5081 aptr = aggr_ptr_init;
5082 else
5084 /* Accesses to invariant addresses should be handled specially
5085 by the caller. */
5086 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5087 gcc_assert (!integer_zerop (step));
5089 if (iv_step == NULL_TREE)
5091 /* The step of the aggregate pointer is the type size,
5092 negated for downward accesses. */
5093 iv_step = TYPE_SIZE_UNIT (aggr_type);
5094 if (tree_int_cst_sgn (step) == -1)
5095 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5098 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5100 create_iv (aggr_ptr_init, PLUS_EXPR,
5101 fold_convert (aggr_ptr_type, iv_step),
5102 aggr_ptr, loop, &incr_gsi, insert_after,
5103 &indx_before_incr, &indx_after_incr);
5104 incr = gsi_stmt (incr_gsi);
5106 /* Copy the points-to information if it exists. */
5107 if (DR_PTR_INFO (dr))
5109 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5110 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5112 if (ptr_incr)
5113 *ptr_incr = incr;
5115 aptr = indx_before_incr;
5118 if (!nested_in_vect_loop || only_init)
5119 return aptr;
5122 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5123 nested in LOOP, if exists. */
5125 gcc_assert (nested_in_vect_loop);
5126 if (!only_init)
5128 standard_iv_increment_position (containing_loop, &incr_gsi,
5129 &insert_after);
5130 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5131 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5132 &indx_before_incr, &indx_after_incr);
5133 incr = gsi_stmt (incr_gsi);
5135 /* Copy the points-to information if it exists. */
5136 if (DR_PTR_INFO (dr))
5138 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5139 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5141 if (ptr_incr)
5142 *ptr_incr = incr;
5144 return indx_before_incr;
5146 else
5147 gcc_unreachable ();
5151 /* Function bump_vector_ptr
5153 Increment a pointer (to a vector type) by vector-size. If requested,
5154 i.e. if PTR-INCR is given, then also connect the new increment stmt
5155 to the existing def-use update-chain of the pointer, by modifying
5156 the PTR_INCR as illustrated below:
5158 The pointer def-use update-chain before this function:
5159 DATAREF_PTR = phi (p_0, p_2)
5160 ....
5161 PTR_INCR: p_2 = DATAREF_PTR + step
5163 The pointer def-use update-chain after this function:
5164 DATAREF_PTR = phi (p_0, p_2)
5165 ....
5166 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5167 ....
5168 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5170 Input:
5171 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5172 in the loop.
5173 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5174 the loop. The increment amount across iterations is expected
5175 to be vector_size.
5176 BSI - location where the new update stmt is to be placed.
5177 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5178 BUMP - optional. The offset by which to bump the pointer. If not given,
5179 the offset is assumed to be vector_size.
5181 Output: Return NEW_DATAREF_PTR as illustrated above.
5185 tree
5186 bump_vector_ptr (vec_info *vinfo,
5187 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5188 stmt_vec_info stmt_info, tree bump)
5190 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5192 tree update = TYPE_SIZE_UNIT (vectype);
5193 gimple *incr_stmt;
5194 ssa_op_iter iter;
5195 use_operand_p use_p;
5196 tree new_dataref_ptr;
5198 if (bump)
5199 update = bump;
5201 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5202 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5203 else if (is_gimple_min_invariant (dataref_ptr))
5204 /* When possible avoid emitting a separate increment stmt that would
5205 force the addressed object to become addressable.  */
5206 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5207 fold_build2 (MEM_REF,
5208 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5209 dataref_ptr,
5210 fold_convert (ptr_type_node, update)));
5211 else
5212 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5213 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5214 dataref_ptr, update);
5215 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5216 /* Fold the increment, avoiding excessive use-def chains that would
5217 otherwise cause compile-time issues for passes run before the next
5218 forwprop pass, which would do this folding as well.  */
5219 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5220 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5222 incr_stmt = gsi_stmt (fold_gsi);
5223 update_stmt (incr_stmt);
5226 /* Copy the points-to information if it exists. */
5227 if (DR_PTR_INFO (dr))
5229 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5230 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5233 if (!ptr_incr)
5234 return new_dataref_ptr;
5236 /* Update the vector-pointer's cross-iteration increment. */
5237 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5239 tree use = USE_FROM_PTR (use_p);
5241 if (use == dataref_ptr)
5242 SET_USE (use_p, new_dataref_ptr);
5243 else
5244 gcc_assert (operand_equal_p (use, update, 0));
5247 return new_dataref_ptr;
5251 /* Copy memory reference info such as base/clique from the SRC reference
5252 to the DEST MEM_REF. */
5254 void
5255 vect_copy_ref_info (tree dest, tree src)
5257 if (TREE_CODE (dest) != MEM_REF)
5258 return;
5260 tree src_base = src;
5261 while (handled_component_p (src_base))
5262 src_base = TREE_OPERAND (src_base, 0);
5263 if (TREE_CODE (src_base) != MEM_REF
5264 && TREE_CODE (src_base) != TARGET_MEM_REF)
5265 return;
5267 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5268 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5272 /* Function vect_create_destination_var.
5274 Create a new temporary of type VECTYPE. */
5276 tree
5277 vect_create_destination_var (tree scalar_dest, tree vectype)
5279 tree vec_dest;
5280 const char *name;
5281 char *new_name;
5282 tree type;
5283 enum vect_var_kind kind;
5285 kind = vectype
5286 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5287 ? vect_mask_var
5288 : vect_simple_var
5289 : vect_scalar_var;
5290 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5292 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5294 name = get_name (scalar_dest);
5295 if (name)
5296 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5297 else
5298 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5299 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5300 free (new_name);
5302 return vec_dest;
5305 /* Function vect_grouped_store_supported.
5307 Returns TRUE if interleave high and interleave low permutations
5308 are supported, and FALSE otherwise. */
5310 bool
5311 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5313 machine_mode mode = TYPE_MODE (vectype);
5315 /* vect_permute_store_chain requires the group size to be equal to 3 or
5316 be a power of two. */
5317 if (count != 3 && exact_log2 (count) == -1)
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5321 "the size of the group of accesses"
5322 " is not a power of 2 or not eqaul to 3\n");
5323 return false;
5326 /* Check that the permutation is supported. */
5327 if (VECTOR_MODE_P (mode))
5329 unsigned int i;
5330 if (count == 3)
5332 unsigned int j0 = 0, j1 = 0, j2 = 0;
5333 unsigned int i, j;
5335 unsigned int nelt;
5336 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5338 if (dump_enabled_p ())
5339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5340 "cannot handle groups of 3 stores for"
5341 " variable-length vectors\n");
5342 return false;
5345 vec_perm_builder sel (nelt, nelt, 1);
5346 sel.quick_grow (nelt);
5347 vec_perm_indices indices;
5348 for (j = 0; j < 3; j++)
5350 int nelt0 = ((3 - j) * nelt) % 3;
5351 int nelt1 = ((3 - j) * nelt + 1) % 3;
5352 int nelt2 = ((3 - j) * nelt + 2) % 3;
5353 for (i = 0; i < nelt; i++)
5355 if (3 * i + nelt0 < nelt)
5356 sel[3 * i + nelt0] = j0++;
5357 if (3 * i + nelt1 < nelt)
5358 sel[3 * i + nelt1] = nelt + j1++;
5359 if (3 * i + nelt2 < nelt)
5360 sel[3 * i + nelt2] = 0;
5362 indices.new_vector (sel, 2, nelt);
5363 if (!can_vec_perm_const_p (mode, mode, indices))
5365 if (dump_enabled_p ())
5366 dump_printf (MSG_MISSED_OPTIMIZATION,
5367 "permutation op not supported by target.\n");
5368 return false;
5371 for (i = 0; i < nelt; i++)
5373 if (3 * i + nelt0 < nelt)
5374 sel[3 * i + nelt0] = 3 * i + nelt0;
5375 if (3 * i + nelt1 < nelt)
5376 sel[3 * i + nelt1] = 3 * i + nelt1;
5377 if (3 * i + nelt2 < nelt)
5378 sel[3 * i + nelt2] = nelt + j2++;
5380 indices.new_vector (sel, 2, nelt);
5381 if (!can_vec_perm_const_p (mode, mode, indices))
5383 if (dump_enabled_p ())
5384 dump_printf (MSG_MISSED_OPTIMIZATION,
5385 "permutation op not supported by target.\n");
5386 return false;
5389 return true;
5391 else
5393 /* If length is not equal to 3 then only a power of 2 is supported. */
5394 gcc_assert (pow2p_hwi (count));
5395 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5397 /* The encoding has 2 interleaved stepped patterns. */
5398 if (!multiple_p (nelt, 2))
5399 return false;
5400 vec_perm_builder sel (nelt, 2, 3);
5401 sel.quick_grow (6);
5402 for (i = 0; i < 3; i++)
5404 sel[i * 2] = i;
5405 sel[i * 2 + 1] = i + nelt;
5407 vec_perm_indices indices (sel, 2, nelt);
5408 if (can_vec_perm_const_p (mode, mode, indices))
5410 for (i = 0; i < 6; i++)
5411 sel[i] += exact_div (nelt, 2);
5412 indices.new_vector (sel, 2, nelt);
5413 if (can_vec_perm_const_p (mode, mode, indices))
5414 return true;
5419 if (dump_enabled_p ())
5420 dump_printf (MSG_MISSED_OPTIMIZATION,
5421 "permutation op not supported by target.\n");
5422 return false;
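/* Illustration only, not used by the vectorizer: a scalar model of the
   two-input constant permutation that the can_vec_perm_const_p queries
   above ask the target about.  Element I of the result is element SEL[I]
   of the 2*NELT-element concatenation of V1 and V2.  This is merely a
   reference sketch of VEC_PERM_EXPR semantics for constant selectors,
   not how any target actually expands it.  */

static void
toy_two_input_perm (const unsigned int *v1, const unsigned int *v2,
		    const unsigned int *sel, unsigned int nelt,
		    unsigned int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    /* Selector values below NELT pick from V1, the rest from V2.  */
    out[i] = sel[i] < nelt ? v1[sel[i]] : v2[sel[i] - nelt];
}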
5426 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5427 type VECTYPE. MASKED_P says whether the masked form is needed. */
5429 bool
5430 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5431 bool masked_p)
5433 if (masked_p)
5434 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5435 vec_mask_store_lanes_optab,
5436 vectype, count);
5437 else
5438 return vect_lanes_optab_supported_p ("vec_store_lanes",
5439 vec_store_lanes_optab,
5440 vectype, count);
5444 /* Function vect_permute_store_chain.
5446 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5447 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5448 the data correctly for the stores. Return the final references for stores
5449 in RESULT_CHAIN.
5451 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5452 The input is 4 vectors each containing 8 elements. We assign a number to
5453 each element, the input sequence is:
5455 1st vec: 0 1 2 3 4 5 6 7
5456 2nd vec: 8 9 10 11 12 13 14 15
5457 3rd vec: 16 17 18 19 20 21 22 23
5458 4th vec: 24 25 26 27 28 29 30 31
5460 The output sequence should be:
5462 1st vec: 0 8 16 24 1 9 17 25
5463 2nd vec: 2 10 18 26 3 11 19 27
5464 3rd vec: 4 12 20 28 5 13 21 29
5465 4th vec: 6 14 22 30 7 15 23 31
5467 i.e., we interleave the contents of the four vectors in their order.
5469 We use interleave_high/low instructions to create such output. The input of
5470 each interleave_high/low operation is two vectors:
5471 1st vec 2nd vec
5472 0 1 2 3 4 5 6 7
5473 the even elements of the result vector are obtained left-to-right from the
5474 high/low elements of the first vector. The odd elements of the result are
5475 obtained left-to-right from the high/low elements of the second vector.
5476 The output of interleave_high will be: 0 4 1 5
5477 and of interleave_low: 2 6 3 7
5480 The permutation is done in log LENGTH stages. In each stage interleave_high
5481 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5482 where the first argument is taken from the first half of DR_CHAIN and the
5483 second argument from its second half.
5484 In our example,
5486 I1: interleave_high (1st vec, 3rd vec)
5487 I2: interleave_low (1st vec, 3rd vec)
5488 I3: interleave_high (2nd vec, 4th vec)
5489 I4: interleave_low (2nd vec, 4th vec)
5491 The output for the first stage is:
5493 I1: 0 16 1 17 2 18 3 19
5494 I2: 4 20 5 21 6 22 7 23
5495 I3: 8 24 9 25 10 26 11 27
5496 I4: 12 28 13 29 14 30 15 31
5498 The output of the second stage, i.e. the final result is:
5500 I1: 0 8 16 24 1 9 17 25
5501 I2: 2 10 18 26 3 11 19 27
5502 I3: 4 12 20 28 5 13 21 29
5503 I4: 6 14 22 30 7 15 23 31. */
5505 void
5506 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5507 unsigned int length,
5508 stmt_vec_info stmt_info,
5509 gimple_stmt_iterator *gsi,
5510 vec<tree> *result_chain)
5512 tree vect1, vect2, high, low;
5513 gimple *perm_stmt;
5514 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5515 tree perm_mask_low, perm_mask_high;
5516 tree data_ref;
5517 tree perm3_mask_low, perm3_mask_high;
5518 unsigned int i, j, n, log_length = exact_log2 (length);
5520 result_chain->quick_grow (length);
5521 memcpy (result_chain->address (), dr_chain.address (),
5522 length * sizeof (tree));
5524 if (length == 3)
5526 /* vect_grouped_store_supported ensures that this is constant. */
5527 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5528 unsigned int j0 = 0, j1 = 0, j2 = 0;
5530 vec_perm_builder sel (nelt, nelt, 1);
5531 sel.quick_grow (nelt);
5532 vec_perm_indices indices;
5533 for (j = 0; j < 3; j++)
5535 int nelt0 = ((3 - j) * nelt) % 3;
5536 int nelt1 = ((3 - j) * nelt + 1) % 3;
5537 int nelt2 = ((3 - j) * nelt + 2) % 3;
5539 for (i = 0; i < nelt; i++)
5541 if (3 * i + nelt0 < nelt)
5542 sel[3 * i + nelt0] = j0++;
5543 if (3 * i + nelt1 < nelt)
5544 sel[3 * i + nelt1] = nelt + j1++;
5545 if (3 * i + nelt2 < nelt)
5546 sel[3 * i + nelt2] = 0;
5548 indices.new_vector (sel, 2, nelt);
5549 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5551 for (i = 0; i < nelt; i++)
5553 if (3 * i + nelt0 < nelt)
5554 sel[3 * i + nelt0] = 3 * i + nelt0;
5555 if (3 * i + nelt1 < nelt)
5556 sel[3 * i + nelt1] = 3 * i + nelt1;
5557 if (3 * i + nelt2 < nelt)
5558 sel[3 * i + nelt2] = nelt + j2++;
5560 indices.new_vector (sel, 2, nelt);
5561 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5563 vect1 = dr_chain[0];
5564 vect2 = dr_chain[1];
5566 /* Create interleaving stmt:
5567 low = VEC_PERM_EXPR <vect1, vect2,
5568 {j, nelt, *, j + 1, nelt + j + 1, *,
5569 j + 2, nelt + j + 2, *, ...}> */
5570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5572 vect2, perm3_mask_low);
5573 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5575 vect1 = data_ref;
5576 vect2 = dr_chain[2];
5577 /* Create interleaving stmt:
5578 low = VEC_PERM_EXPR <vect1, vect2,
5579 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5580 6, 7, nelt + j + 2, ...}> */
5581 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5582 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5583 vect2, perm3_mask_high);
5584 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5585 (*result_chain)[j] = data_ref;
5588 else
5590 /* If length is not equal to 3 then only a power of 2 is supported. */
5591 gcc_assert (pow2p_hwi (length));
5593 /* The encoding has 2 interleaved stepped patterns. */
5594 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5595 vec_perm_builder sel (nelt, 2, 3);
5596 sel.quick_grow (6);
5597 for (i = 0; i < 3; i++)
5599 sel[i * 2] = i;
5600 sel[i * 2 + 1] = i + nelt;
5602 vec_perm_indices indices (sel, 2, nelt);
5603 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5605 for (i = 0; i < 6; i++)
5606 sel[i] += exact_div (nelt, 2);
5607 indices.new_vector (sel, 2, nelt);
5608 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5610 for (i = 0, n = log_length; i < n; i++)
5612 for (j = 0; j < length/2; j++)
5614 vect1 = dr_chain[j];
5615 vect2 = dr_chain[j+length/2];
5617 /* Create interleaving stmt:
5618 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5619 ...}> */
5620 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5621 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5622 vect2, perm_mask_high);
5623 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5624 (*result_chain)[2*j] = high;
5626 /* Create interleaving stmt:
5627 low = VEC_PERM_EXPR <vect1, vect2,
5628 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5629 ...}> */
5630 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5631 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5632 vect2, perm_mask_low);
5633 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5634 (*result_chain)[2*j+1] = low;
5636 memcpy (dr_chain.address (), result_chain->address (),
5637 length * sizeof (tree));
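/* Illustration only, not used by the vectorizer: a scalar simulation of
   the power-of-two branch above for the example in the function comment
   (LENGTH == 4, 8 elements per vector), so the final interleaved order
   0 8 16 24 1 9 17 25 ... can be checked by hand.  The fixed sizes and
   the helper names are made up for the illustration.  */

static void
toy_interleave8 (const unsigned int *v1, const unsigned int *v2,
		 unsigned int *high, unsigned int *low)
{
  for (unsigned int i = 0; i < 4; i++)	/* nelt / 2 with nelt == 8.  */
    {
      /* perm_mask_high: {0, nelt, 1, nelt + 1, ...}.  */
      high[2 * i] = v1[i];
      high[2 * i + 1] = v2[i];
      /* perm_mask_low: {nelt/2, nelt*3/2, nelt/2 + 1, ...}.  */
      low[2 * i] = v1[4 + i];
      low[2 * i + 1] = v2[4 + i];
    }
}

static void
toy_permute_store_chain4x8 (unsigned int chain[4][8])
{
  unsigned int result[4][8];
  /* log2 (length) == 2 stages, mirroring the loop above.  */
  for (unsigned int stage = 0; stage < 2; stage++)
    {
      for (unsigned int j = 0; j < 2; j++)	/* j < length / 2.  */
	toy_interleave8 (chain[j], chain[j + 2],
			 result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof (result));
    }
}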
5642 /* Function vect_setup_realignment
5644 This function is called when vectorizing an unaligned load using
5645 the dr_explicit_realign[_optimized] scheme.
5646 This function generates the following code at the loop prolog:
5648 p = initial_addr;
5649 x msq_init = *(floor(p)); # prolog load
5650 realignment_token = call target_builtin;
5651 loop:
5652 x msq = phi (msq_init, ---)
5654 The stmts marked with x are generated only for the case of
5655 dr_explicit_realign_optimized.
5657 The code above sets up a new (vector) pointer, pointing to the first
5658 location accessed by STMT_INFO, and a "floor-aligned" load using that
5659 pointer. It also generates code to compute the "realignment-token"
5660 (if the relevant target hook was defined), and creates a phi-node at the
5661 loop-header bb whose arguments are the result of the prolog-load (created
5662 by this function) and the result of a load that takes place in the loop
5663 (to be created by the caller to this function).
5665 For the case of dr_explicit_realign_optimized:
5666 The caller to this function uses the phi-result (msq) to create the
5667 realignment code inside the loop, and sets up the missing phi argument,
5668 as follows:
5669 loop:
5670 msq = phi (msq_init, lsq)
5671 lsq = *(floor(p')); # load in loop
5672 result = realign_load (msq, lsq, realignment_token);
5674 For the case of dr_explicit_realign:
5675 loop:
5676 msq = *(floor(p)); # load in loop
5677 p' = p + (VS-1);
5678 lsq = *(floor(p')); # load in loop
5679 result = realign_load (msq, lsq, realignment_token);
5681 Input:
5682 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5683 a memory location that may be unaligned.
5684 BSI - place where new code is to be inserted.
5685 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5686 is used.
5688 Output:
5689 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5690 target hook, if defined.
5691 Return value - the result of the loop-header phi node. */
5693 tree
5694 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5695 gimple_stmt_iterator *gsi, tree *realignment_token,
5696 enum dr_alignment_support alignment_support_scheme,
5697 tree init_addr,
5698 class loop **at_loop)
5700 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5701 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5702 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5703 struct data_reference *dr = dr_info->dr;
5704 class loop *loop = NULL;
5705 edge pe = NULL;
5706 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5707 tree vec_dest;
5708 gimple *inc;
5709 tree ptr;
5710 tree data_ref;
5711 basic_block new_bb;
5712 tree msq_init = NULL_TREE;
5713 tree new_temp;
5714 gphi *phi_stmt;
5715 tree msq = NULL_TREE;
5716 gimple_seq stmts = NULL;
5717 bool compute_in_loop = false;
5718 bool nested_in_vect_loop = false;
5719 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5720 class loop *loop_for_initial_load = NULL;
5722 if (loop_vinfo)
5724 loop = LOOP_VINFO_LOOP (loop_vinfo);
5725 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5728 gcc_assert (alignment_support_scheme == dr_explicit_realign
5729 || alignment_support_scheme == dr_explicit_realign_optimized);
5731 /* We need to generate three things:
5732 1. the misalignment computation
5733 2. the extra vector load (for the optimized realignment scheme).
5734 3. the phi node for the two vectors from which the realignment is
5735 done (for the optimized realignment scheme). */
5737 /* 1. Determine where to generate the misalignment computation.
5739 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5740 calculation will be generated by this function, outside the loop (in the
5741 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5742 caller, inside the loop.
5744 Background: If the misalignment remains fixed throughout the iterations of
5745 the loop, then both realignment schemes are applicable, and also the
5746 misalignment computation can be done outside LOOP. This is because we are
5747 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5748 are a multiple of VS (the Vector Size), and therefore the misalignment in
5749 different vectorized LOOP iterations is always the same.
5750 The problem arises only if the memory access is in an inner-loop nested
5751 inside LOOP, which is now being vectorized using outer-loop vectorization.
5752 This is the only case when the misalignment of the memory access may not
5753 remain fixed throughout the iterations of the inner-loop (as explained in
5754 detail in vect_supportable_dr_alignment). In this case, not only is the
5755 optimized realignment scheme not applicable, but also the misalignment
5756 computation (and generation of the realignment token that is passed to
5757 REALIGN_LOAD) have to be done inside the loop.
5759 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5760 or not, which in turn determines if the misalignment is computed inside
5761 the inner-loop, or outside LOOP. */
5763 if (init_addr != NULL_TREE || !loop_vinfo)
5765 compute_in_loop = true;
5766 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5770 /* 2. Determine where to generate the extra vector load.
5772 For the optimized realignment scheme, instead of generating two vector
5773 loads in each iteration, we generate a single extra vector load in the
5774 preheader of the loop, and in each iteration reuse the result of the
5775 vector load from the previous iteration. In case the memory access is in
5776 an inner-loop nested inside LOOP, which is now being vectorized using
5777 outer-loop vectorization, we need to determine whether this initial vector
5778 load should be generated at the preheader of the inner-loop, or can be
5779 generated at the preheader of LOOP. If the memory access has no evolution
5780 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5781 to be generated inside LOOP (in the preheader of the inner-loop). */
5783 if (nested_in_vect_loop)
5785 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5786 bool invariant_in_outerloop =
5787 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5788 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5790 else
5791 loop_for_initial_load = loop;
5792 if (at_loop)
5793 *at_loop = loop_for_initial_load;
5795 tree vuse = NULL_TREE;
5796 if (loop_for_initial_load)
5798 pe = loop_preheader_edge (loop_for_initial_load);
5799 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5800 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5802 if (!vuse)
5803 vuse = gimple_vuse (gsi_stmt (*gsi));
5805 /* 3. For the case of the optimized realignment, create the first vector
5806 load at the loop preheader. */
5808 if (alignment_support_scheme == dr_explicit_realign_optimized)
5810 /* Create msq_init = *(floor(p1)) in the loop preheader */
5811 gassign *new_stmt;
5813 gcc_assert (!compute_in_loop);
5814 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5815 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5816 loop_for_initial_load, NULL_TREE,
5817 &init_addr, NULL, &inc, true);
5818 if (TREE_CODE (ptr) == SSA_NAME)
5819 new_temp = copy_ssa_name (ptr);
5820 else
5821 new_temp = make_ssa_name (TREE_TYPE (ptr));
5822 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5823 tree type = TREE_TYPE (ptr);
5824 new_stmt = gimple_build_assign
5825 (new_temp, BIT_AND_EXPR, ptr,
5826 fold_build2 (MINUS_EXPR, type,
5827 build_int_cst (type, 0),
5828 build_int_cst (type, align)));
5829 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5830 gcc_assert (!new_bb);
5831 data_ref
5832 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5833 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5834 vect_copy_ref_info (data_ref, DR_REF (dr));
5835 new_stmt = gimple_build_assign (vec_dest, data_ref);
5836 new_temp = make_ssa_name (vec_dest, new_stmt);
5837 gimple_assign_set_lhs (new_stmt, new_temp);
5838 gimple_set_vuse (new_stmt, vuse);
5839 if (pe)
5841 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5842 gcc_assert (!new_bb);
5844 else
5845 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5847 msq_init = gimple_assign_lhs (new_stmt);
5850 /* 4. Create realignment token using a target builtin, if available.
5851 It is done either inside the containing loop, or before LOOP (as
5852 determined above). */
5854 if (targetm.vectorize.builtin_mask_for_load)
5856 gcall *new_stmt;
5857 tree builtin_decl;
5859 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5860 if (!init_addr)
5862 /* Generate the INIT_ADDR computation outside LOOP. */
5863 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5864 stmt_info, &stmts,
5865 NULL_TREE);
5866 if (loop)
5868 pe = loop_preheader_edge (loop);
5869 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5870 gcc_assert (!new_bb);
5872 else
5873 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5876 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5877 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5878 vec_dest =
5879 vect_create_destination_var (scalar_dest,
5880 gimple_call_return_type (new_stmt));
5881 new_temp = make_ssa_name (vec_dest, new_stmt);
5882 gimple_call_set_lhs (new_stmt, new_temp);
5884 if (compute_in_loop)
5885 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5886 else
5888 /* Generate the misalignment computation outside LOOP. */
5889 pe = loop_preheader_edge (loop);
5890 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5891 gcc_assert (!new_bb);
5894 *realignment_token = gimple_call_lhs (new_stmt);
5896 /* The result of the CALL_EXPR to this builtin is determined from
5897 the value of the parameter and no global variables are touched
5898 which makes the builtin a "const" function. Requiring the
5899 builtin to have the "const" attribute makes it unnecessary
5900 to call mark_call_clobbered. */
5901 gcc_assert (TREE_READONLY (builtin_decl));
5904 if (alignment_support_scheme == dr_explicit_realign)
5905 return msq;
5907 gcc_assert (!compute_in_loop);
5908 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5911 /* 5. Create msq = phi <msq_init, lsq> in loop */
5913 pe = loop_preheader_edge (containing_loop);
5914 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5915 msq = make_ssa_name (vec_dest);
5916 phi_stmt = create_phi_node (msq, containing_loop->header);
5917 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5919 return msq;
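/* Illustration only: one plausible scalar model of what the target's
   REALIGN_LOAD does with the two aligned loads set up above.  The real
   semantics and the meaning of the realignment token are entirely
   target-specific; here the token is assumed to be simply the byte
   misalignment of the original address.  */

static void
toy_realign_load (const unsigned char *msq, const unsigned char *lsq,
		  unsigned int misalign, unsigned int vecsize,
		  unsigned char *result)
{
  /* Select VECSIZE consecutive bytes starting at offset MISALIGN from the
     concatenation of the two aligned loads MSQ and LSQ; this recovers the
     unaligned vector without ever issuing an unaligned access.  */
  for (unsigned int i = 0; i < vecsize; i++)
    result[i] = (misalign + i < vecsize
		 ? msq[misalign + i]
		 : lsq[misalign + i - vecsize]);
}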
5923 /* Function vect_grouped_load_supported.
5925 COUNT is the size of the load group (the number of statements plus the
5926 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5927 only one statement, with a gap of COUNT - 1.
5929 Returns true if a suitable permute exists. */
5931 bool
5932 vect_grouped_load_supported (tree vectype, bool single_element_p,
5933 unsigned HOST_WIDE_INT count)
5935 machine_mode mode = TYPE_MODE (vectype);
5937 /* If this is single-element interleaving with an element distance
5938 that leaves unused vector loads around, punt - we would at least
5939 create very sub-optimal code in that case (and blow up memory;
5940 see PR65518).  */
5941 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5943 if (dump_enabled_p ())
5944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5945 "single-element interleaving not supported "
5946 "for not adjacent vector loads\n");
5947 return false;
5950 /* vect_permute_load_chain requires the group size to be equal to 3 or
5951 be a power of two. */
5952 if (count != 3 && exact_log2 (count) == -1)
5954 if (dump_enabled_p ())
5955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5956 "the size of the group of accesses"
5957 " is not a power of 2 or not equal to 3\n");
5958 return false;
5961 /* Check that the permutation is supported. */
5962 if (VECTOR_MODE_P (mode))
5964 unsigned int i, j;
5965 if (count == 3)
5967 unsigned int nelt;
5968 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5970 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5972 "cannot handle groups of 3 loads for"
5973 " variable-length vectors\n");
5974 return false;
5977 vec_perm_builder sel (nelt, nelt, 1);
5978 sel.quick_grow (nelt);
5979 vec_perm_indices indices;
5980 unsigned int k;
5981 for (k = 0; k < 3; k++)
5983 for (i = 0; i < nelt; i++)
5984 if (3 * i + k < 2 * nelt)
5985 sel[i] = 3 * i + k;
5986 else
5987 sel[i] = 0;
5988 indices.new_vector (sel, 2, nelt);
5989 if (!can_vec_perm_const_p (mode, mode, indices))
5991 if (dump_enabled_p ())
5992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5993 "shuffle of 3 loads is not supported by"
5994 " target\n");
5995 return false;
5997 for (i = 0, j = 0; i < nelt; i++)
5998 if (3 * i + k < 2 * nelt)
5999 sel[i] = i;
6000 else
6001 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6002 indices.new_vector (sel, 2, nelt);
6003 if (!can_vec_perm_const_p (mode, mode, indices))
6005 if (dump_enabled_p ())
6006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6007 "shuffle of 3 loads is not supported by"
6008 " target\n");
6009 return false;
6012 return true;
6014 else
6016 /* If length is not equal to 3 then only a power of 2 is supported. */
6017 gcc_assert (pow2p_hwi (count));
6018 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6020 /* The encoding has a single stepped pattern. */
6021 vec_perm_builder sel (nelt, 1, 3);
6022 sel.quick_grow (3);
6023 for (i = 0; i < 3; i++)
6024 sel[i] = i * 2;
6025 vec_perm_indices indices (sel, 2, nelt);
6026 if (can_vec_perm_const_p (mode, mode, indices))
6028 for (i = 0; i < 3; i++)
6029 sel[i] = i * 2 + 1;
6030 indices.new_vector (sel, 2, nelt);
6031 if (can_vec_perm_const_p (mode, mode, indices))
6032 return true;
6037 if (dump_enabled_p ())
6038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6039 "extract even/odd not supported by target\n");
6040 return false;
6043 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6044 type VECTYPE. MASKED_P says whether the masked form is needed. */
6046 bool
6047 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6048 bool masked_p)
6050 if (masked_p)
6051 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6052 vec_mask_load_lanes_optab,
6053 vectype, count);
6054 else
6055 return vect_lanes_optab_supported_p ("vec_load_lanes",
6056 vec_load_lanes_optab,
6057 vectype, count);
6060 /* Function vect_permute_load_chain.
6062 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6063 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6064 the input data correctly. Return the final references for loads in
6065 RESULT_CHAIN.
6067 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6068 The input is 4 vectors each containing 8 elements. We assign a number to each
6069 element, the input sequence is:
6071 1st vec: 0 1 2 3 4 5 6 7
6072 2nd vec: 8 9 10 11 12 13 14 15
6073 3rd vec: 16 17 18 19 20 21 22 23
6074 4th vec: 24 25 26 27 28 29 30 31
6076 The output sequence should be:
6078 1st vec: 0 4 8 12 16 20 24 28
6079 2nd vec: 1 5 9 13 17 21 25 29
6080 3rd vec: 2 6 10 14 18 22 26 30
6081 4th vec: 3 7 11 15 19 23 27 31
6083 i.e., the first output vector should contain the first elements of each
6084 interleaving group, etc.
6086 We use extract_even/odd instructions to create such output. The input of
6087 each extract_even/odd operation is two vectors
6088 1st vec 2nd vec
6089 0 1 2 3 4 5 6 7
6091 and the output is the vector of extracted even/odd elements. The output of
6092 extract_even will be: 0 2 4 6
6093 and of extract_odd: 1 3 5 7
6096 The permutation is done in log LENGTH stages. In each stage extract_even
6097 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6098 their order. In our example,
6100 E1: extract_even (1st vec, 2nd vec)
6101 E2: extract_odd (1st vec, 2nd vec)
6102 E3: extract_even (3rd vec, 4th vec)
6103 E4: extract_odd (3rd vec, 4th vec)
6105 The output for the first stage will be:
6107 E1: 0 2 4 6 8 10 12 14
6108 E2: 1 3 5 7 9 11 13 15
6109 E3: 16 18 20 22 24 26 28 30
6110 E4: 17 19 21 23 25 27 29 31
6112 In order to proceed and create the correct sequence for the next stage (or
6113 for the correct output, if the second stage is the last one, as in our
6114 example), we first put the output of the extract_even operation and then the
6115 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6116 The input for the second stage is:
6118 1st vec (E1): 0 2 4 6 8 10 12 14
6119 2nd vec (E3): 16 18 20 22 24 26 28 30
6120 3rd vec (E2): 1 3 5 7 9 11 13 15
6121 4th vec (E4): 17 19 21 23 25 27 29 31
6123 The output of the second stage:
6125 E1: 0 4 8 12 16 20 24 28
6126 E2: 2 6 10 14 18 22 26 30
6127 E3: 1 5 9 13 17 21 25 29
6128 E4: 3 7 11 15 19 23 27 31
6130 And RESULT_CHAIN after reordering:
6132 1st vec (E1): 0 4 8 12 16 20 24 28
6133 2nd vec (E3): 1 5 9 13 17 21 25 29
6134 3rd vec (E2): 2 6 10 14 18 22 26 30
6135 4th vec (E4): 3 7 11 15 19 23 27 31. */
6137 static void
6138 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6139 unsigned int length,
6140 stmt_vec_info stmt_info,
6141 gimple_stmt_iterator *gsi,
6142 vec<tree> *result_chain)
6144 tree data_ref, first_vect, second_vect;
6145 tree perm_mask_even, perm_mask_odd;
6146 tree perm3_mask_low, perm3_mask_high;
6147 gimple *perm_stmt;
6148 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6149 unsigned int i, j, log_length = exact_log2 (length);
6151 result_chain->quick_grow (length);
6152 memcpy (result_chain->address (), dr_chain.address (),
6153 length * sizeof (tree));
6155 if (length == 3)
6157 /* vect_grouped_load_supported ensures that this is constant. */
6158 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6159 unsigned int k;
6161 vec_perm_builder sel (nelt, nelt, 1);
6162 sel.quick_grow (nelt);
6163 vec_perm_indices indices;
6164 for (k = 0; k < 3; k++)
6166 for (i = 0; i < nelt; i++)
6167 if (3 * i + k < 2 * nelt)
6168 sel[i] = 3 * i + k;
6169 else
6170 sel[i] = 0;
6171 indices.new_vector (sel, 2, nelt);
6172 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6174 for (i = 0, j = 0; i < nelt; i++)
6175 if (3 * i + k < 2 * nelt)
6176 sel[i] = i;
6177 else
6178 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6179 indices.new_vector (sel, 2, nelt);
6180 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6182 first_vect = dr_chain[0];
6183 second_vect = dr_chain[1];
6185 /* Create interleaving stmt (low part of):
6186 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6187 ...}> */
6188 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6189 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6190 second_vect, perm3_mask_low);
6191 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6193 /* Create interleaving stmt (high part of):
6194 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6195 ...}> */
6196 first_vect = data_ref;
6197 second_vect = dr_chain[2];
6198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6200 second_vect, perm3_mask_high);
6201 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6202 (*result_chain)[k] = data_ref;
6205 else
6207 /* If length is not equal to 3 then only a power of 2 is supported. */
6208 gcc_assert (pow2p_hwi (length));
6210 /* The encoding has a single stepped pattern. */
6211 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6212 vec_perm_builder sel (nelt, 1, 3);
6213 sel.quick_grow (3);
6214 for (i = 0; i < 3; ++i)
6215 sel[i] = i * 2;
6216 vec_perm_indices indices (sel, 2, nelt);
6217 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6219 for (i = 0; i < 3; ++i)
6220 sel[i] = i * 2 + 1;
6221 indices.new_vector (sel, 2, nelt);
6222 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6224 for (i = 0; i < log_length; i++)
6226 for (j = 0; j < length; j += 2)
6228 first_vect = dr_chain[j];
6229 second_vect = dr_chain[j+1];
6231 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6232 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6233 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6234 first_vect, second_vect,
6235 perm_mask_even);
6236 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6237 (*result_chain)[j/2] = data_ref;
6239 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6240 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6241 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6242 first_vect, second_vect,
6243 perm_mask_odd);
6244 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6245 (*result_chain)[j/2+length/2] = data_ref;
6247 memcpy (dr_chain.address (), result_chain->address (),
6248 length * sizeof (tree));
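/* Illustration only, not used by the vectorizer: a scalar simulation of
   the power-of-two branch above for the example in the function comment
   (LENGTH == 4, 8 elements per vector), so the final de-interleaved order
   0 4 8 12 ..., 1 5 9 13 ..., etc. can be checked by hand.  The fixed
   sizes and names are made up for the illustration.  */

static void
toy_permute_load_chain4x8 (unsigned int chain[4][8])
{
  unsigned int result[4][8];
  /* log2 (length) == 2 stages, mirroring the loop above.  */
  for (unsigned int stage = 0; stage < 2; stage++)
    {
      for (unsigned int j = 0; j < 4; j += 2)
	for (unsigned int i = 0; i < 8; i++)
	  {
	    /* perm_mask_even ({0, 2, 4, ...}) feeds slot j/2, while
	       perm_mask_odd ({1, 3, 5, ...}) feeds slot j/2 + length/2;
	       selector values >= 8 index into the second vector.  */
	    unsigned int even = 2 * i, odd = 2 * i + 1;
	    result[j / 2][i] = even < 8 ? chain[j][even]
					: chain[j + 1][even - 8];
	    result[j / 2 + 2][i] = odd < 8 ? chain[j][odd]
					   : chain[j + 1][odd - 8];
	  }
      memcpy (chain, result, sizeof (result));
    }
}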
6253 /* Function vect_shift_permute_load_chain.
6255 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6256 sequence of stmts to reorder the input data accordingly.
6257 Return the final references for loads in RESULT_CHAIN.
6258 Return true if it succeeded, false otherwise.
6260 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6261 The input is 3 vectors each containing 8 elements. We assign a
6262 number to each element, the input sequence is:
6264 1st vec: 0 1 2 3 4 5 6 7
6265 2nd vec: 8 9 10 11 12 13 14 15
6266 3rd vec: 16 17 18 19 20 21 22 23
6268 The output sequence should be:
6270 1st vec: 0 3 6 9 12 15 18 21
6271 2nd vec: 1 4 7 10 13 16 19 22
6272 3rd vec: 2 5 8 11 14 17 20 23
6274 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6276 First we shuffle all 3 vectors to get the correct element order:
6278 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6279 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6280 3rd vec: (16 19 22) (17 20 23) (18 21)
6282 Next we unite and shift vector 3 times:
6284 1st step:
6285 shift right by 6 the concatenation of:
6286 "1st vec" and "2nd vec"
6287 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6288 "2nd vec" and "3rd vec"
6289 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6290 "3rd vec" and "1st vec"
6291 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6292 | New vectors |
6294 So that now new vectors are:
6296 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6297 2nd vec: (10 13) (16 19 22) (17 20 23)
6298 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6300 2nd step:
6301 shift right by 5 the concatenation of:
6302 "1st vec" and "3rd vec"
6303 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6304 "2nd vec" and "1st vec"
6305 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6306 "3rd vec" and "2nd vec"
6307 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6308 | New vectors |
6310 So that now new vectors are:
6312 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6313 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6314 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6316 3rd step:
6317 shift right by 5 the concatenation of:
6318 "1st vec" and "1st vec"
6319 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6320 shift right by 3 the concatenation of:
6321 "2nd vec" and "2nd vec"
6322 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6323 | New vectors |
6325 So that now all vectors are READY:
6326 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6327 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6328 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6330 This algorithm is faster than the one in vect_permute_load_chain if:
6331 1. "shift of a concatenation" is faster than general permutation.
6332 This is usually so.
6333 2. The TARGET machine can't execute vector instructions in parallel.
6334 This is because each step of the algorithm depends on the previous
6335 one, while the algorithm in vect_permute_load_chain is much more parallel.
6337 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6340 static bool
6341 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6342 unsigned int length,
6343 stmt_vec_info stmt_info,
6344 gimple_stmt_iterator *gsi,
6345 vec<tree> *result_chain)
6347 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6348 tree perm2_mask1, perm2_mask2, perm3_mask;
6349 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6350 gimple *perm_stmt;
6352 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6353 machine_mode vmode = TYPE_MODE (vectype);
6354 unsigned int i;
6355 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6357 unsigned HOST_WIDE_INT nelt, vf;
6358 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6359 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6360 /* Not supported for variable-length vectors. */
6361 return false;
6363 vec_perm_builder sel (nelt, nelt, 1);
6364 sel.quick_grow (nelt);
6366 result_chain->quick_grow (length);
6367 memcpy (result_chain->address (), dr_chain.address (),
6368 length * sizeof (tree));
6370 if (pow2p_hwi (length) && vf > 4)
6372 unsigned int j, log_length = exact_log2 (length);
6373 for (i = 0; i < nelt / 2; ++i)
6374 sel[i] = i * 2;
6375 for (i = 0; i < nelt / 2; ++i)
6376 sel[nelt / 2 + i] = i * 2 + 1;
6377 vec_perm_indices indices (sel, 2, nelt);
6378 if (!can_vec_perm_const_p (vmode, vmode, indices))
6380 if (dump_enabled_p ())
6381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6382 "shuffle of 2 fields structure is not \
6383 supported by target\n");
6384 return false;
6386 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6388 for (i = 0; i < nelt / 2; ++i)
6389 sel[i] = i * 2 + 1;
6390 for (i = 0; i < nelt / 2; ++i)
6391 sel[nelt / 2 + i] = i * 2;
6392 indices.new_vector (sel, 2, nelt);
6393 if (!can_vec_perm_const_p (vmode, vmode, indices))
6395 if (dump_enabled_p ())
6396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6397 "shuffle of 2 fields structure is not \
6398 supported by target\n");
6399 return false;
6401 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6403 /* Generating permutation constant to shift all elements.
6404 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6405 for (i = 0; i < nelt; i++)
6406 sel[i] = nelt / 2 + i;
6407 indices.new_vector (sel, 2, nelt);
6408 if (!can_vec_perm_const_p (vmode, vmode, indices))
6410 if (dump_enabled_p ())
6411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6412 "shift permutation is not supported by target\n");
6413 return false;
6415 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6417 /* Generating permutation constant to select vector from 2.
6418 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6419 for (i = 0; i < nelt / 2; i++)
6420 sel[i] = i;
6421 for (i = nelt / 2; i < nelt; i++)
6422 sel[i] = nelt + i;
6423 indices.new_vector (sel, 2, nelt);
6424 if (!can_vec_perm_const_p (vmode, vmode, indices))
6426 if (dump_enabled_p ())
6427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 "select is not supported by target\n");
6429 return false;
6431 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6433 for (i = 0; i < log_length; i++)
6435 for (j = 0; j < length; j += 2)
6437 first_vect = dr_chain[j];
6438 second_vect = dr_chain[j + 1];
6440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6442 first_vect, first_vect,
6443 perm2_mask1);
6444 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6445 vect[0] = data_ref;
6447 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6448 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6449 second_vect, second_vect,
6450 perm2_mask2);
6451 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6452 vect[1] = data_ref;
6454 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6455 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6456 vect[0], vect[1], shift1_mask);
6457 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6458 (*result_chain)[j/2 + length/2] = data_ref;
6460 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6461 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6462 vect[0], vect[1], select_mask);
6463 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6464 (*result_chain)[j/2] = data_ref;
6466 memcpy (dr_chain.address (), result_chain->address (),
6467 length * sizeof (tree));
6469 return true;
6471 if (length == 3 && vf > 2)
6473 unsigned int k = 0, l = 0;
6475 /* Generating permutation constant to get all elements in the right order.
6476 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6477 for (i = 0; i < nelt; i++)
6479 if (3 * k + (l % 3) >= nelt)
6481 k = 0;
6482 l += (3 - (nelt % 3));
6484 sel[i] = 3 * k + (l % 3);
6485 k++;
6487 vec_perm_indices indices (sel, 2, nelt);
6488 if (!can_vec_perm_const_p (vmode, vmode, indices))
6490 if (dump_enabled_p ())
6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 "shuffle of 3 fields structure is not \
6493 supported by target\n");
6494 return false;
6496 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6498 /* Generating permutation constant to shift all elements.
6499 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6500 for (i = 0; i < nelt; i++)
6501 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6502 indices.new_vector (sel, 2, nelt);
6503 if (!can_vec_perm_const_p (vmode, vmode, indices))
6505 if (dump_enabled_p ())
6506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6507 "shift permutation is not supported by target\n");
6508 return false;
6510 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6512 /* Generating permutation constant to shift all elements.
6513 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6514 for (i = 0; i < nelt; i++)
6515 sel[i] = 2 * (nelt / 3) + 1 + i;
6516 indices.new_vector (sel, 2, nelt);
6517 if (!can_vec_perm_const_p (vmode, vmode, indices))
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6521 "shift permutation is not supported by target\n");
6522 return false;
6524 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6526 /* Generating permutation constant to shift all elements.
6527 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6528 for (i = 0; i < nelt; i++)
6529 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6530 indices.new_vector (sel, 2, nelt);
6531 if (!can_vec_perm_const_p (vmode, vmode, indices))
6533 if (dump_enabled_p ())
6534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6535 "shift permutation is not supported by target\n");
6536 return false;
6538 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6540 /* Generating permutation constant to shift all elements.
6541 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6542 for (i = 0; i < nelt; i++)
6543 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6544 indices.new_vector (sel, 2, nelt);
6545 if (!can_vec_perm_const_p (vmode, vmode, indices))
6547 if (dump_enabled_p ())
6548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6549 "shift permutation is not supported by target\n");
6550 return false;
6552 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6554 for (k = 0; k < 3; k++)
6556 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6557 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6558 dr_chain[k], dr_chain[k],
6559 perm3_mask);
6560 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6561 vect[k] = data_ref;
6564 for (k = 0; k < 3; k++)
6566 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6567 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6568 vect[k % 3], vect[(k + 1) % 3],
6569 shift1_mask);
6570 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6571 vect_shift[k] = data_ref;
6574 for (k = 0; k < 3; k++)
6576 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6577 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6578 vect_shift[(4 - k) % 3],
6579 vect_shift[(3 - k) % 3],
6580 shift2_mask);
6581 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6582 vect[k] = data_ref;
6585 (*result_chain)[3 - (nelt % 3)] = vect[2];
6587 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6588 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6589 vect[0], shift3_mask);
6590 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6591 (*result_chain)[nelt % 3] = data_ref;
6593 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6594 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6595 vect[1], shift4_mask);
6596 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6597 (*result_chain)[0] = data_ref;
6598 return true;
6600 return false;
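/* Illustration only: the "shift of a concatenation" primitive that the
   shift*_mask permutations above rely on.  A selector of the form
   {S, S+1, ..., S+NELT-1} takes NELT consecutive elements starting at
   position S of the 2*NELT-element concatenation of the two inputs; e.g.
   with NELT == 8 and S == 6 this is the "shift right by 6" step from the
   function comment.  Scalar sketch only, nothing target-specific.  */

static void
toy_shift_concat (const unsigned int *v1, const unsigned int *v2,
		  unsigned int nelt, unsigned int start,
		  unsigned int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int pos = start + i;	/* Index into V1 ++ V2.  */
      out[i] = pos < nelt ? v1[pos] : v2[pos - nelt];
    }
}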
6603 /* Function vect_transform_grouped_load.
6605 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6606 to perform their permutation and ascribe the resulting vectorized statements to
6607 the scalar statements.
6610 void
6611 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6612 vec<tree> dr_chain,
6613 int size, gimple_stmt_iterator *gsi)
6615 machine_mode mode;
6616 vec<tree> result_chain = vNULL;
6618 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6619 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6620 vectors, that are ready for vector computation. */
6621 result_chain.create (size);
6623 /* If the reassociation width for the vector type is 2 or greater, the
6624 target machine can execute 2 or more vector instructions in parallel.
6625 Otherwise try to get the load group's chain via vect_shift_permute_load_chain.  */
6626 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6627 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6628 || pow2p_hwi (size)
6629 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6630 gsi, &result_chain))
6631 vect_permute_load_chain (vinfo, dr_chain,
6632 size, stmt_info, gsi, &result_chain);
6633 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6634 result_chain.release ();
6637 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6638 generated as part of the vectorization of STMT_INFO. Assign the statement
6639 for each vector to the associated scalar statement. */
6641 void
6642 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6643 vec<tree> result_chain)
6645 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6646 unsigned int i, gap_count;
6647 tree tmp_data_ref;
6649 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6650 Since we scan the chain starting from its first node, their order
6651 corresponds to the order of data-refs in RESULT_CHAIN.  */
6652 stmt_vec_info next_stmt_info = first_stmt_info;
6653 gap_count = 1;
6654 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6656 if (!next_stmt_info)
6657 break;
6659 /* Skip the gaps.  Loads created for the gaps will be removed later by
6660 the dead code elimination pass.  No need to check for the first stmt in
6661 the group, since it always exists.
6662 DR_GROUP_GAP is the number of steps in elements from the previous
6663 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6664 correspond to the gaps. */
6665 if (next_stmt_info != first_stmt_info
6666 && gap_count < DR_GROUP_GAP (next_stmt_info))
6668 gap_count++;
6669 continue;
6672 /* ??? The following needs cleanup after the removal of
6673 DR_GROUP_SAME_DR_STMT. */
6674 if (next_stmt_info)
6676 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6677 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6678 copies, and we put the new vector statement last. */
6679 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6681 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6682 gap_count = 1;
6687 /* Function vect_can_force_dr_alignment_p.
6689 Returns whether the alignment of a DECL can be forced to be aligned
6690 on an ALIGNMENT-bit boundary.  */
6692 bool
6693 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6695 if (!VAR_P (decl))
6696 return false;
6698 if (decl_in_symtab_p (decl)
6699 && !symtab_node::get (decl)->can_increase_alignment_p ())
6700 return false;
6702 if (TREE_STATIC (decl))
6703 return (known_le (alignment,
6704 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6705 else
6706 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6709 /* Return whether the data reference DR_INFO is supported with respect to its
6710 alignment.
6711 MISALIGNMENT is the known misalignment of the access, 0 if the access
6712 is known to be aligned, or DR_MISALIGNMENT_UNKNOWN if the misalignment
6713 is not known at compile time.  */
6715 enum dr_alignment_support
6716 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6717 tree vectype, int misalignment)
6719 data_reference *dr = dr_info->dr;
6720 stmt_vec_info stmt_info = dr_info->stmt;
6721 machine_mode mode = TYPE_MODE (vectype);
6722 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6723 class loop *vect_loop = NULL;
6724 bool nested_in_vect_loop = false;
6726 if (misalignment == 0)
6727 return dr_aligned;
6729 /* For now assume all conditional loads/stores support unaligned
6730 access without any special code. */
6731 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6732 if (gimple_call_internal_p (stmt)
6733 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6734 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6735 return dr_unaligned_supported;
6737 if (loop_vinfo)
6739 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6740 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6743 /* Possibly unaligned access. */
6745 /* We can choose between using the implicit realignment scheme (generating
6746 a misaligned_move stmt) and the explicit realignment scheme (generating
6747 aligned loads with a REALIGN_LOAD). There are two variants to the
6748 explicit realignment scheme: optimized, and unoptimized.
6749 We can optimize the realignment only if the step between consecutive
6750 vector loads is equal to the vector size. Since the vector memory
6751 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6752 is guaranteed that the misalignment amount remains the same throughout the
6753 execution of the vectorized loop. Therefore, we can create the
6754 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6755 at the loop preheader.
6757 However, in the case of outer-loop vectorization, when vectorizing a
6758 memory access in the inner-loop nested within the LOOP that is now being
6759 vectorized, while it is guaranteed that the misalignment of the
6760 vectorized memory access will remain the same in different outer-loop
6761 iterations, it is *not* guaranteed that it will remain the same throughout
6762 the execution of the inner-loop. This is because the inner-loop advances
6763 with the original scalar step (and not in steps of VS). If the inner-loop
6764 step happens to be a multiple of VS, then the misalignment remains fixed
6765 and we can use the optimized realignment scheme. For example:
6767 for (i=0; i<N; i++)
6768 for (j=0; j<M; j++)
6769 s += a[i+j];
6771 When vectorizing the i-loop in the above example, the step between
6772 consecutive vector loads is 1, and so the misalignment does not remain
6773 fixed across the execution of the inner-loop, and the realignment cannot
6774 be optimized (as illustrated in the following pseudo vectorized loop):
6776 for (i=0; i<N; i+=4)
6777 for (j=0; j<M; j++){
6778 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6779 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6780 // (assuming that we start from an aligned address).
6783 We therefore have to use the unoptimized realignment scheme:
6785 for (i=0; i<N; i+=4)
6786 for (j=k; j<M; j+=4)
6787 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6788 // that the misalignment of the initial address is
6789 // 0).
6791 The loop can then be vectorized as follows:
6793 for (k=0; k<4; k++){
6794 rt = get_realignment_token (&vp[k]);
6795 for (i=0; i<N; i+=4){
6796 v1 = vp[i+k];
6797 for (j=k; j<M; j+=4){
6798 v2 = vp[i+j+VS-1];
6799 va = REALIGN_LOAD <v1,v2,rt>;
6800 vs += va;
6801 v1 = v2;
6804 } */
6806 if (DR_IS_READ (dr))
6808 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6809 && (!targetm.vectorize.builtin_mask_for_load
6810 || targetm.vectorize.builtin_mask_for_load ()))
6812 /* If we are doing SLP then the accesses need not have the
6813 same alignment; instead it depends on the SLP group size. */
6814 if (loop_vinfo
6815 && STMT_SLP_TYPE (stmt_info)
6816 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6817 * (DR_GROUP_SIZE
6818 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6819 TYPE_VECTOR_SUBPARTS (vectype)))
6821 else if (!loop_vinfo
6822 || (nested_in_vect_loop
6823 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6824 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6825 return dr_explicit_realign;
6826 else
6827 return dr_explicit_realign_optimized;
6831 bool is_packed = false;
6832 tree type = TREE_TYPE (DR_REF (dr));
6833 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6834 is_packed = not_size_aligned (DR_REF (dr));
6835 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6836 is_packed))
6837 return dr_unaligned_supported;
6839 /* Unsupported. */
6840 return dr_unaligned_unsupported;
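/* Toy illustration (not used anywhere) of the point made in the comment
   above: when the inner loop advances by the original scalar step, the
   misalignment of the vector access cycles through every residue, so the
   realignment token cannot be computed once outside the loop.  The element
   and vector sizes below are made-up values matching the pseudo code.  */

static unsigned int
toy_inner_loop_misalignment (unsigned int j)
{
  const unsigned int elem_size = 4;	/* sizeof (*vp) in the example.  */
  const unsigned int vec_size = 16;	/* VS in bytes, i.e. 4 elements.  */
  /* For an aligned starting address, the misalignment (in elements) of
     &vp[i+j] is ((j * elem_size) % vec_size) / elem_size, i.e.
     0,1,2,3,0,1,2,3,... as j increases -- the sequence shown in the
     pseudo vectorized loop above.  */
  return (j * elem_size) % vec_size / elem_size;
}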