1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
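/* For instance, with COUNT == 3 and a 128-bit VECTYPE this asks whether the
   target can load or store three interleaved vectors with a single lanes
   instruction (e.g. LD3/ST3 on AArch64) instead of emulating the interleaving
   with permutes.  */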
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.  */
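/* For instance, for 'int_x = (int) short_y' this returns the 2-byte short
   type; with 128-bit vectors the smallest type gives a VF of 8, whereas the
   4-byte result type alone would only suggest a VF of 4.  */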
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn))
155 /* For loads the LHS type does the trick. */
156 i = ~0U;
157 else if (internal_store_fn_p (ifn))
159 /* For stores use the type of the stored value. */
160 i = internal_fn_stored_value_index (ifn);
161 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
162 i = ~0U;
164 else if (internal_fn_mask_index (ifn) == 0)
165 i = 1;
167 if (i < gimple_call_num_args (call))
169 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
170 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
172 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
173 if (rhs < lhs)
174 scalar_type = rhs_type;
179 return scalar_type;
183 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
184 tested at run-time. Return TRUE if DDR was successfully inserted.
185 Return false if versioning is not supported. */
187 static opt_result
188 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
190 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
192 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
193 return opt_result::failure_at (vect_location,
194 "will not create alias checks, as"
195 " --param vect-max-version-for-alias-checks"
196 " == 0\n");
198 opt_result res
199 = runtime_alias_check_p (ddr, loop,
200 optimize_loop_nest_for_speed_p (loop));
201 if (!res)
202 return res;
204 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
205 return opt_result::success ();
208 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
210 static void
211 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
213 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
214 for (unsigned int i = 0; i < checks.length(); ++i)
215 if (checks[i] == value)
216 return;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location,
220 "need run-time check that %T is nonzero\n",
221 value);
222 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 /* Return true if we know that the order of vectorized DR_INFO_A and
226 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
227 DR_INFO_B. At least one of the accesses is a write. */
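/* For instance, for a scalar load from group A that precedes a scalar store
   from group B, the vectorized load is emitted at A's first scalar load and
   the vectorized store at B's last scalar store, so the load still precedes
   the store and the scalar order is preserved.  */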
229 static bool
230 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
232 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
233 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
235 /* Single statements are always kept in their original order. */
236 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
237 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
238 return true;
240 /* STMT_A and STMT_B belong to overlapping groups. All loads are
241 emitted at the position of the first scalar load.
242 Stores in a group are emitted at the position of the last scalar store.
243 Compute that position and check whether the resulting order matches
244 the current one. */
245 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (il_a)
248 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 il_a = get_later_stmt (il_a, s);
252 else /* DR_IS_READ */
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
258 else
259 il_a = stmtinfo_a;
260 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
261 if (il_b)
263 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
264 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
265 s = DR_GROUP_NEXT_ELEMENT (s))
266 il_b = get_later_stmt (il_b, s);
267 else /* DR_IS_READ */
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 if (get_later_stmt (il_b, s) == il_b)
271 il_b = s;
273 else
274 il_b = stmtinfo_b;
275 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
276 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 /* A subroutine of vect_analyze_data_ref_dependence. Handle
280 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
281 distances. These distances are conservatively correct but they don't
282 reflect a guaranteed dependence.
284 Return true if this function does all the work necessary to avoid
285 an alias or false if the caller should use the dependence distances
286 to limit the vectorization factor in the usual way. LOOP_DEPTH is
287 the depth of the loop described by LOOP_VINFO and the other arguments
288 are as for vect_analyze_data_ref_dependence. */
290 static bool
291 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 int loop_depth, unsigned int *max_vf)
295 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
296 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
298 int dist = dist_v[loop_depth];
299 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
301 /* If the user asserted safelen >= DIST consecutive iterations
302 can be executed concurrently, assume independence.
304 ??? An alternative would be to add the alias check even
305 in this case, and vectorize the fallback loop with the
306 maximum VF set to safelen. However, if the user has
307 explicitly given a length, it's less likely that that
308 would be a win. */
309 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
311 if ((unsigned int) loop->safelen < *max_vf)
312 *max_vf = loop->safelen;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
314 continue;
317 /* For dependence distances of 2 or more, we have the option
318 of limiting VF or checking for an alias at runtime.
319 Prefer to check at runtime if we can, to avoid limiting
320 the VF unnecessarily when the bases are in fact independent.
322 Note that the alias checks will be removed if the VF ends up
323 being small enough. */
324 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
325 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
326 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
327 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
328 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 return true;
335 /* Function vect_analyze_data_ref_dependence.
337 FIXME: I needed to change the sense of the returned flag.
339 Return FALSE if there (might) exist a dependence between a memory-reference
340 DRA and a memory-reference DRB. When versioning for alias may check a
341 dependence at run-time, return TRUE. Adjust *MAX_VF according to
342 the data dependence. */
344 static opt_result
345 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
346 loop_vec_info loop_vinfo,
347 unsigned int *max_vf)
349 unsigned int i;
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 struct data_reference *dra = DDR_A (ddr);
352 struct data_reference *drb = DDR_B (ddr);
353 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
354 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
355 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
356 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
357 lambda_vector dist_v;
358 unsigned int loop_depth;
360 /* If user asserted safelen consecutive iterations can be
361 executed concurrently, assume independence. */
362 auto apply_safelen = [&]()
364 if (loop->safelen >= 2)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 return true;
371 return false;
374 /* In loop analysis all data references should be vectorizable. */
375 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
376 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
377 gcc_unreachable ();
379 /* Independent data accesses. */
380 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
381 return opt_result::success ();
383 if (dra == drb
384 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
385 return opt_result::success ();
387 /* We do not have to consider dependences between accesses that belong
388 to the same group, unless the stride could be smaller than the
389 group size. */
390 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
391 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
393 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
394 return opt_result::success ();
396 /* Even if we have an anti-dependence then, as the vectorized loop covers at
397 least two scalar iterations, there is always also a true dependence.
398 As the vectorizer does not re-order loads and stores we can ignore
399 the anti-dependence if TBAA can disambiguate both DRs similar to the
400 case with known negative distance anti-dependences (positive
401 distance anti-dependences would violate TBAA constraints). */
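/* For instance, under strict aliasing a read through an 'int *' and a write
   through a 'float *' have non-conflicting alias sets, so such an
   anti-dependence can be disregarded here.  */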
402 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
403 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
404 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
405 get_alias_set (DR_REF (drb))))
406 return opt_result::success ();
408 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
409 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 if (apply_safelen ())
412 return opt_result::success ();
414 return opt_result::failure_at
415 (stmtinfo_a->stmt,
416 "possible alias involving gather/scatter between %T and %T\n",
417 DR_REF (dra), DR_REF (drb));
420 /* Unknown data dependence. */
421 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
423 if (apply_safelen ())
424 return opt_result::success ();
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
428 "versioning for alias required: "
429 "can't determine dependence between %T and %T\n",
430 DR_REF (dra), DR_REF (drb));
432 /* Add to list of ddrs that need to be tested at run-time. */
433 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 /* Known data dependence. */
437 if (DDR_NUM_DIST_VECTS (ddr) == 0)
439 if (apply_safelen ())
440 return opt_result::success ();
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
444 "versioning for alias required: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
447 /* Add to list of ddrs that need to be tested at run-time. */
448 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
453 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
454 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
455 loop_depth, max_vf))
456 return opt_result::success ();
458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
460 int dist = dist_v[loop_depth];
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "dependence distance = %d.\n", dist);
466 if (dist == 0)
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance == 0 between %T and %T\n",
471 DR_REF (dra), DR_REF (drb));
473 /* When we perform grouped accesses and perform implicit CSE
474 by detecting equal accesses and doing disambiguation with
475 runtime alias tests like for
476 .. = a[i];
477 .. = a[i+1];
478 a[i] = ..;
479 a[i+1] = ..;
480 *p = ..;
481 .. = a[i];
482 .. = a[i+1];
483 where we will end up loading { a[i], a[i+1] } once, make
484 sure that inserting group loads before the first load and
485 stores after the last store will do the right thing.
486 Similar for groups like
487 a[i] = ...;
488 ... = a[i];
489 a[i+1] = ...;
490 where loads from the group interleave with the store. */
491 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
492 return opt_result::failure_at (stmtinfo_a->stmt,
493 "READ_WRITE dependence"
494 " in interleaving.\n");
496 if (loop->safelen < 2)
498 tree indicator = dr_zero_step_indicator (dra);
499 if (!indicator || integer_zerop (indicator))
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "access also has a zero step\n");
502 else if (TREE_CODE (indicator) != INTEGER_CST)
503 vect_check_nonzero_value (loop_vinfo, indicator);
505 continue;
508 if (dist > 0 && DDR_REVERSED_P (ddr))
510 /* If DDR_REVERSED_P the order of the data-refs in DDR was
511 reversed (to make distance vector positive), and the actual
512 distance is negative. */
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance negative.\n");
516 /* When doing outer loop vectorization, we need to check if there is
517 a backward dependence at the inner loop level if the dependence
518 at the outer loop is reversed. See PR81740. */
519 if (nested_in_vect_loop_p (loop, stmtinfo_a)
520 || nested_in_vect_loop_p (loop, stmtinfo_b))
522 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
523 DDR_LOOP_NEST (ddr));
524 if (dist_v[inner_depth] < 0)
525 return opt_result::failure_at (stmtinfo_a->stmt,
526 "not vectorized, dependence "
527 "between data-refs %T and %T\n",
528 DR_REF (dra), DR_REF (drb));
530 /* Record a negative dependence distance to later limit the
531 amount of stmt copying / unrolling we can perform.
532 Only need to handle read-after-write dependence. */
533 if (DR_IS_READ (drb)
534 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
535 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
536 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
537 continue;
540 unsigned int abs_dist = abs (dist);
541 if (abs_dist >= 2 && abs_dist < *max_vf)
543 /* The dependence distance requires reduction of the maximal
544 vectorization factor. */
545 *max_vf = abs_dist;
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location,
548 "adjusting maximal vectorization factor to %i\n",
549 *max_vf);
552 if (abs_dist >= *max_vf)
554 /* Dependence distance does not create dependence, as far as
555 vectorization is concerned, in this case. */
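/* For instance, 'a[i] = a[i - 32]' with a VF of at most 32: each vector
   iteration only reads elements written in an earlier vector iteration
   (or before the loop), so the dependence is respected.  */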
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "dependence distance >= VF.\n");
559 continue;
562 return opt_result::failure_at (stmtinfo_a->stmt,
563 "not vectorized, possible dependence "
564 "between data-refs %T and %T\n",
565 DR_REF (dra), DR_REF (drb));
568 return opt_result::success ();
571 /* Function vect_analyze_data_ref_dependences.
573 Examine all the data references in the loop, and make sure there do not
574 exist any data dependences between them. Set *MAX_VF according to
575 the maximum vectorization factor the data dependences allow. */
577 opt_result
578 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
579 unsigned int *max_vf)
581 unsigned int i;
582 struct data_dependence_relation *ddr;
584 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
586 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
588 LOOP_VINFO_DDRS (loop_vinfo)
589 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
590 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
591 /* We do not need read-read dependences. */
592 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
593 &LOOP_VINFO_DDRS (loop_vinfo),
594 LOOP_VINFO_LOOP_NEST (loop_vinfo),
595 false);
596 gcc_assert (res);
599 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
601 /* For epilogues we either have no aliases or alias versioning
602 was applied to original loop. Therefore we may just get max_vf
603 using VF of original loop. */
604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
605 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
606 else
607 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
609 opt_result res
610 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
611 if (!res)
612 return res;
615 return opt_result::success ();
619 /* Function vect_slp_analyze_data_ref_dependence.
621 Return TRUE if there (might) exist a dependence between a memory-reference
622 DRA and a memory-reference DRB for VINFO. When versioning for alias
623 may check a dependence at run-time, return FALSE. */
626 static bool
627 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
628 struct data_dependence_relation *ddr)
630 struct data_reference *dra = DDR_A (ddr);
631 struct data_reference *drb = DDR_B (ddr);
632 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
633 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
635 /* We need to check dependences of statements marked as unvectorizable
636 as well, they still can prohibit vectorization. */
638 /* Independent data accesses. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
640 return false;
642 if (dra == drb)
643 return false;
645 /* Read-read is OK. */
646 if (DR_IS_READ (dra) && DR_IS_READ (drb))
647 return false;
649 /* If dra and drb are part of the same interleaving chain consider
650 them independent. */
651 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
652 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
653 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
654 return false;
656 /* Unknown data dependence. */
657 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
661 "can't determine dependence between %T and %T\n",
662 DR_REF (dra), DR_REF (drb));
664 else if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "determined dependence between %T and %T\n",
667 DR_REF (dra), DR_REF (drb));
669 return true;
673 /* Analyze dependences involved in the transform of a store SLP NODE. */
675 static bool
676 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
678 /* This walks over all stmts involved in the SLP store done
679 in NODE verifying we can sink them up to the last stmt in the
680 group. */
681 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
682 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
684 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
686 stmt_vec_info access_info
687 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
688 if (access_info == last_access_info)
689 continue;
690 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
691 ao_ref ref;
692 bool ref_initialized_p = false;
693 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
694 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
696 gimple *stmt = gsi_stmt (gsi);
697 if (! gimple_vuse (stmt))
698 continue;
700 /* If we couldn't record a (single) data reference for this
701 stmt we have to resort to the alias oracle. */
702 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
703 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
704 if (!dr_b)
706 /* We are moving a store - this means
707 we cannot use TBAA for disambiguation. */
708 if (!ref_initialized_p)
709 ao_ref_init (&ref, DR_REF (dr_a));
710 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
711 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
712 return false;
713 continue;
716 gcc_assert (!gimple_visited_p (stmt));
718 ddr_p ddr = initialize_data_dependence_relation (dr_a,
719 dr_b, vNULL);
720 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
721 free_dependence_relation (ddr);
722 if (dependent)
723 return false;
726 return true;
729 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
730 contain the vector of scalar stores of this instance if we are
731 disambiguating the loads. */
733 static bool
734 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
735 vec<stmt_vec_info> stores,
736 stmt_vec_info last_store_info)
738 /* This walks over all stmts involved in the SLP load done
739 in NODE verifying we can hoist them up to the first stmt in the
740 group. */
741 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
742 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
744 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
746 stmt_vec_info access_info
747 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
748 if (access_info == first_access_info)
749 continue;
750 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
751 ao_ref ref;
752 bool ref_initialized_p = false;
753 hash_set<stmt_vec_info> grp_visited;
754 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
755 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
757 gimple *stmt = gsi_stmt (gsi);
758 if (! gimple_vdef (stmt))
759 continue;
761 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
763 /* If we run into a store of this same instance (we've just
764 marked those) then delay dependence checking until we run
765 into the last store because this is where it will have
766 been sunk to (and we verified that we can do that already). */
767 if (gimple_visited_p (stmt))
769 if (stmt_info != last_store_info)
770 continue;
772 for (stmt_vec_info &store_info : stores)
774 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
775 ddr_p ddr = initialize_data_dependence_relation
776 (dr_a, store_dr, vNULL);
777 bool dependent
778 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
779 free_dependence_relation (ddr);
780 if (dependent)
781 return false;
783 continue;
786 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
788 /* We are hoisting a load - this means we can use TBAA for
789 disambiguation. */
790 if (!ref_initialized_p)
791 ao_ref_init (&ref, DR_REF (dr_a));
792 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
794 /* If we couldn't record a (single) data reference for this
795 stmt we have to give up now. */
796 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
797 if (!dr_b)
798 return false;
799 ddr_p ddr = initialize_data_dependence_relation (dr_a,
800 dr_b, vNULL);
801 bool dependent
802 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
803 free_dependence_relation (ddr);
804 if (dependent)
805 return false;
807 /* No dependence. */
808 return true;
810 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
812 /* When we run into a store group we have to honor
813 that earlier stores might be moved here. We don't
814 know exactly which and where to since we lack a
815 back-mapping from DR to SLP node, so assume all
816 earlier stores are sunk here. It's enough to
817 consider the last stmt of a group for this.
818 ??? Both this and the fact that we disregard that
819 the conflicting instance might be removed later
820 is overly conservative. */
821 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
822 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
823 store_info != NULL;
824 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
825 if ((store_info == stmt_info
826 || get_later_stmt (store_info, stmt_info) == stmt_info)
827 && !check_hoist (store_info))
828 return false;
830 else
832 if (!check_hoist (stmt_info))
833 return false;
837 return true;
841 /* Function vect_slp_analyze_instance_dependence.
843 Check that the loads and stores of SLP instance INSTANCE can be moved to
844 their vectorized stmt insert locations (stores sunk to the last scalar
845 store, loads hoisted to the first scalar load) without violating any data dependence. */
847 bool
848 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
850 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
852 /* The stores of this instance are at the root of the SLP tree. */
853 slp_tree store = NULL;
854 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
855 store = SLP_INSTANCE_TREE (instance);
857 /* Verify we can sink stores to the vectorized stmt insert location. */
858 stmt_vec_info last_store_info = NULL;
859 if (store)
861 if (! vect_slp_analyze_store_dependences (vinfo, store))
862 return false;
864 /* Mark stores in this instance and remember the last one. */
865 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
866 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
867 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
870 bool res = true;
872 /* Verify we can sink loads to the vectorized stmt insert location,
873 special-casing stores of this instance. */
874 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
875 if (! vect_slp_analyze_load_dependences (vinfo, load,
876 store
877 ? SLP_TREE_SCALAR_STMTS (store)
878 : vNULL, last_store_info))
880 res = false;
881 break;
884 /* Unset the visited flag. */
885 if (store)
886 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
887 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
889 return res;
892 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
893 applied. */
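/* For instance, with a 16-byte target alignment, a recorded misalignment of 8
   for the first element of the group and a group offset DIFF of 4 for this
   element, the function returns (8 + 4) % 16 == 12 when OFFSET is zero.  */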
895 int
896 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
898 HOST_WIDE_INT diff = 0;
899 /* Alignment is only analyzed for the first element of a DR group,
900 use that but adjust misalignment by the offset of the access. */
901 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
903 dr_vec_info *first_dr
904 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
905 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
906 INTEGER_CSTs and the first element in the group has the lowest
907 address. */
908 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
909 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
910 gcc_assert (diff >= 0);
911 dr_info = first_dr;
914 int misalign = dr_info->misalignment;
915 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
916 if (misalign == DR_MISALIGNMENT_UNKNOWN)
917 return misalign;
919 /* If the access is only aligned for a vector type with smaller alignment
920 requirement the access has unknown misalignment. */
921 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
922 targetm.vectorize.preferred_vector_alignment (vectype)))
923 return DR_MISALIGNMENT_UNKNOWN;
925 /* Apply the offset from the DR group start and the externally supplied
926 offset which can for example result from a negative stride access. */
927 poly_int64 misalignment = misalign + diff + offset;
929 /* vect_compute_data_ref_alignment will have ensured that target_alignment
930 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
931 unsigned HOST_WIDE_INT target_alignment_c
932 = dr_info->target_alignment.to_constant ();
933 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
934 return DR_MISALIGNMENT_UNKNOWN;
935 return misalign;
938 /* Record the base alignment guarantee given by DRB, which occurs
939 in STMT_INFO. */
941 static void
942 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
943 innermost_loop_behavior *drb)
945 bool existed;
946 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
947 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
948 if (!existed || entry.second->base_alignment < drb->base_alignment)
950 entry = std::make_pair (stmt_info, drb);
951 if (dump_enabled_p ())
952 dump_printf_loc (MSG_NOTE, vect_location,
953 "recording new base alignment for %T\n"
954 " alignment: %d\n"
955 " misalignment: %d\n"
956 " based on: %G",
957 drb->base_address,
958 drb->base_alignment,
959 drb->base_misalignment,
960 stmt_info->stmt);
964 /* If the region we're going to vectorize is reached, all unconditional
965 data references occur at least once. We can therefore pool the base
966 alignment guarantees from each unconditional reference. Do this by
967 going through all the data references in VINFO and checking whether
968 the containing statement makes the reference unconditionally. If so,
969 record the alignment of the base address in VINFO so that it can be
970 used for all other references with the same base. */
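/* For instance, if an unconditional store guarantees that its base address is
   32-byte aligned, any other reference with the same base address can reuse
   that guarantee when its own misalignment is computed.  */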
972 void
973 vect_record_base_alignments (vec_info *vinfo)
975 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
976 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
977 for (data_reference *dr : vinfo->shared->datarefs)
979 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
980 stmt_vec_info stmt_info = dr_info->stmt;
981 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
982 && STMT_VINFO_VECTORIZABLE (stmt_info)
983 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
985 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
987 /* If DR is nested in the loop that is being vectorized, we can also
988 record the alignment of the base wrt the outer loop. */
989 if (loop && nested_in_vect_loop_p (loop, stmt_info))
990 vect_record_base_alignment
991 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
996 /* Function vect_compute_data_ref_alignment
998 Compute the misalignment of the data reference DR_INFO when vectorizing
999 with VECTYPE.
1001 Output:
1002 1. initialized misalignment info for DR_INFO
1004 FOR NOW: No analysis is actually performed. Misalignment is calculated
1005 only for trivial cases. TODO. */
1007 static void
1008 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1009 tree vectype)
1011 stmt_vec_info stmt_info = dr_info->stmt;
1012 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1013 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1014 class loop *loop = NULL;
1015 tree ref = DR_REF (dr_info->dr);
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_NOTE, vect_location,
1019 "vect_compute_data_ref_alignment:\n");
1021 if (loop_vinfo)
1022 loop = LOOP_VINFO_LOOP (loop_vinfo);
1024 /* Initialize misalignment to unknown. */
1025 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1027 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1028 return;
1030 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1031 bool step_preserves_misalignment_p;
1033 poly_uint64 vector_alignment
1034 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1035 BITS_PER_UNIT);
1036 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1038 /* If the main loop has peeled for alignment we have no way of knowing
1039 whether the data accesses in the epilogues are aligned. We can't at
1040 compile time answer the question whether we have entered the main loop or
1041 not. Fixes PR 92351. */
1042 if (loop_vinfo)
1044 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1045 if (orig_loop_vinfo
1046 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1047 return;
1050 unsigned HOST_WIDE_INT vect_align_c;
1051 if (!vector_alignment.is_constant (&vect_align_c))
1052 return;
1054 /* No step for BB vectorization. */
1055 if (!loop)
1057 gcc_assert (integer_zerop (drb->step));
1058 step_preserves_misalignment_p = true;
1061 /* In case the dataref is in an inner-loop of the loop that is being
1062 vectorized (LOOP), we use the base and misalignment information
1063 relative to the outer-loop (LOOP). This is ok only if the misalignment
1064 stays the same throughout the execution of the inner-loop, which is why
1065 we have to check that the stride of the dataref in the inner-loop evenly
1066 divides by the vector alignment. */
1067 else if (nested_in_vect_loop_p (loop, stmt_info))
1069 step_preserves_misalignment_p
1070 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1072 if (dump_enabled_p ())
1074 if (step_preserves_misalignment_p)
1075 dump_printf_loc (MSG_NOTE, vect_location,
1076 "inner step divides the vector alignment.\n");
1077 else
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 "inner step doesn't divide the vector"
1080 " alignment.\n");
1084 /* Similarly we can only use base and misalignment information relative to
1085 an innermost loop if the misalignment stays the same throughout the
1086 execution of the loop. As above, this is the case if the stride of
1087 the dataref evenly divides by the alignment. */
1088 else
1090 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1091 step_preserves_misalignment_p
1092 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1094 if (!step_preserves_misalignment_p && dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "step doesn't divide the vector alignment.\n");
1099 unsigned int base_alignment = drb->base_alignment;
1100 unsigned int base_misalignment = drb->base_misalignment;
1102 /* Calculate the maximum of the pooled base address alignment and the
1103 alignment that we can compute for DR itself. */
1104 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1105 = base_alignments->get (drb->base_address);
1106 if (entry
1107 && base_alignment < (*entry).second->base_alignment
1108 && (loop_vinfo
1109 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1110 gimple_bb (entry->first->stmt))
1111 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1112 || (entry->first->dr_aux.group <= dr_info->group)))))
1114 base_alignment = entry->second->base_alignment;
1115 base_misalignment = entry->second->base_misalignment;
1118 if (drb->offset_alignment < vect_align_c
1119 || !step_preserves_misalignment_p
1120 /* We need to know whether the step wrt the vectorized loop is
1121 negative when computing the starting misalignment below. */
1122 || TREE_CODE (drb->step) != INTEGER_CST)
1124 if (dump_enabled_p ())
1125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1126 "Unknown alignment for access: %T\n", ref);
1127 return;
1130 if (base_alignment < vect_align_c)
1132 unsigned int max_alignment;
1133 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1134 if (max_alignment < vect_align_c
1135 || !vect_can_force_dr_alignment_p (base,
1136 vect_align_c * BITS_PER_UNIT))
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_NOTE, vect_location,
1140 "can't force alignment of ref: %T\n", ref);
1141 return;
1144 /* Force the alignment of the decl.
1145 NOTE: This is the only change to the code we make during
1146 the analysis phase, before deciding to vectorize the loop. */
1147 if (dump_enabled_p ())
1148 dump_printf_loc (MSG_NOTE, vect_location,
1149 "force alignment of %T\n", ref);
1151 dr_info->base_decl = base;
1152 dr_info->base_misaligned = true;
1153 base_misalignment = 0;
1155 poly_int64 misalignment
1156 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1158 unsigned int const_misalignment;
1159 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163 "Non-constant misalignment for access: %T\n", ref);
1164 return;
1167 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1169 if (dump_enabled_p ())
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171 "misalign = %d bytes of ref %T\n",
1172 const_misalignment, ref);
1174 return;
1177 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1178 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1179 is made aligned via peeling. */
1181 static bool
1182 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1183 dr_vec_info *dr_peel_info)
1185 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1186 DR_TARGET_ALIGNMENT (dr_info)))
1188 poly_offset_int diff
1189 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1190 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1191 if (known_eq (diff, 0)
1192 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1193 return true;
1195 return false;
1198 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1199 aligned via peeling. */
1201 static bool
1202 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1203 dr_vec_info *dr_peel_info)
1205 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1206 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1207 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1208 DR_OFFSET (dr_peel_info->dr), 0)
1209 || !operand_equal_p (DR_STEP (dr_info->dr),
1210 DR_STEP (dr_peel_info->dr), 0))
1211 return false;
1213 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1216 /* Compute the value for dr_info->misalign so that the access appears
1217 aligned. This is used by peeling to compensate for dr_misalignment
1218 applying the offset for negative step. */
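/* For instance, for a negative-step V4SI access with a 16-byte target
   alignment this returns (4 - 1) * 4 == 12; together with the -12 byte offset
   that callers apply for negative strides, dr_misalignment then reports the
   access as aligned.  */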
1220 int
1221 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1223 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1224 return 0;
1226 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1227 poly_int64 misalignment
1228 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1229 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1231 unsigned HOST_WIDE_INT target_alignment_c;
1232 int misalign;
1233 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1234 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1235 return DR_MISALIGNMENT_UNKNOWN;
1236 return misalign;
1239 /* Function vect_update_misalignment_for_peel.
1240 Sets DR_INFO's misalignment
1241 - to 0 if it has the same alignment as DR_PEEL_INFO,
1242 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1243 - to -1 (unknown) otherwise.
1245 DR_INFO - the data reference whose misalignment is to be adjusted.
1246 DR_PEEL_INFO - the data reference whose misalignment is being made
1247 zero in the vector loop by the peel.
1248 NPEEL - the number of iterations in the peel loop if the misalignment
1249 of DR_PEEL_INFO is known at compile time. */
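/* For instance, with a 16-byte target alignment, a known misalignment of 4
   and DR_STEP 4, peeling NPEEL == 3 iterations gives (4 + 3 * 4) & 15 == 0,
   i.e. DR_INFO becomes aligned as well.  */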
1251 static void
1252 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1253 dr_vec_info *dr_peel_info, int npeel)
1255 /* If dr_info is aligned when dr_peel_info is, then mark it so. */
1256 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1258 SET_DR_MISALIGNMENT (dr_info,
1259 vect_dr_misalign_for_aligned_access (dr_peel_info));
1260 return;
1263 unsigned HOST_WIDE_INT alignment;
1264 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1265 && known_alignment_for_access_p (dr_info,
1266 STMT_VINFO_VECTYPE (dr_info->stmt))
1267 && known_alignment_for_access_p (dr_peel_info,
1268 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1270 int misal = dr_info->misalignment;
1271 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1272 misal &= alignment - 1;
1273 set_dr_misalignment (dr_info, misal);
1274 return;
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1279 "to unknown (-1).\n");
1280 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1283 /* Return true if alignment is relevant for DR_INFO. */
1285 static bool
1286 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1288 stmt_vec_info stmt_info = dr_info->stmt;
1290 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1291 return false;
1293 /* For interleaving, only the alignment of the first access matters. */
1294 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1295 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1296 return false;
1298 /* Scatter-gather and invariant accesses continue to address individual
1299 scalars, so vector-level alignment is irrelevant. */
1300 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301 || integer_zerop (DR_STEP (dr_info->dr)))
1302 return false;
1304 /* Strided accesses perform only component accesses, alignment is
1305 irrelevant for them. */
1306 if (STMT_VINFO_STRIDED_P (stmt_info)
1307 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1308 return false;
1310 return true;
1313 /* Given a memory reference EXP return whether its alignment is less
1314 than its size. */
1316 static bool
1317 not_size_aligned (tree exp)
1319 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1320 return true;
1322 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1323 > get_object_alignment (exp));
1326 /* Function vector_alignment_reachable_p
1328 Return true if vector alignment for DR_INFO is reachable by peeling
1329 a few loop iterations. Return false otherwise. */
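/* For instance, with V4SI and a known misalignment of two elements, peeling
   two scalar iterations reaches alignment; but for an interleaved group of
   size 3 those two iterations are not a multiple of the group size, so
   alignment is not considered reachable.  */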
1331 static bool
1332 vector_alignment_reachable_p (dr_vec_info *dr_info)
1334 stmt_vec_info stmt_info = dr_info->stmt;
1335 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1337 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1339 /* For interleaved access we peel only if number of iterations in
1340 the prolog loop ({VF - misalignment}) is a multiple of the
1341 number of the interleaved accesses. */
1342 int elem_size, mis_in_elements;
1344 /* FORNOW: handle only known alignment. */
1345 if (!known_alignment_for_access_p (dr_info, vectype))
1346 return false;
1348 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1349 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1350 elem_size = vector_element_size (vector_size, nelements);
1351 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1353 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1354 return false;
1357 /* If misalignment is known at the compile time then allow peeling
1358 only if natural alignment is reachable through peeling. */
1359 if (known_alignment_for_access_p (dr_info, vectype)
1360 && !aligned_access_p (dr_info, vectype))
1362 HOST_WIDE_INT elmsize =
1363 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1364 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_NOTE, vect_location,
1367 "data size = %wd. misalignment = %d.\n", elmsize,
1368 dr_misalignment (dr_info, vectype));
1370 if (dr_misalignment (dr_info, vectype) % elmsize)
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 "data size does not divide the misalignment.\n");
1375 return false;
1379 if (!known_alignment_for_access_p (dr_info, vectype))
1381 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1382 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1385 "Unknown misalignment, %snaturally aligned\n",
1386 is_packed ? "not " : "");
1387 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1390 return true;
1394 /* Calculate the cost of the memory access represented by DR_INFO. */
1396 static void
1397 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1398 dr_alignment_support alignment_support_scheme,
1399 int misalignment,
1400 unsigned int *inside_cost,
1401 unsigned int *outside_cost,
1402 stmt_vector_for_cost *body_cost_vec,
1403 stmt_vector_for_cost *prologue_cost_vec)
1405 stmt_vec_info stmt_info = dr_info->stmt;
1406 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1407 int ncopies;
1409 if (PURE_SLP_STMT (stmt_info))
1410 ncopies = 1;
1411 else
1412 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1414 if (DR_IS_READ (dr_info->dr))
1415 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1416 misalignment, true, inside_cost,
1417 outside_cost, prologue_cost_vec, body_cost_vec, false);
1418 else
1419 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1420 misalignment, inside_cost, body_cost_vec);
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "vect_get_data_access_cost: inside_cost = %d, "
1425 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1429 typedef struct _vect_peel_info
1431 dr_vec_info *dr_info;
1432 int npeel;
1433 unsigned int count;
1434 } *vect_peel_info;
1436 typedef struct _vect_peel_extended_info
1438 vec_info *vinfo;
1439 struct _vect_peel_info peel_info;
1440 unsigned int inside_cost;
1441 unsigned int outside_cost;
1442 } *vect_peel_extended_info;
1445 /* Peeling hashtable helpers. */
1447 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1449 static inline hashval_t hash (const _vect_peel_info *);
1450 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1453 inline hashval_t
1454 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1456 return (hashval_t) peel_info->npeel;
1459 inline bool
1460 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1462 return (a->npeel == b->npeel);
1466 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1468 static void
1469 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1470 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1471 int npeel, bool supportable_if_not_aligned)
1473 struct _vect_peel_info elem, *slot;
1474 _vect_peel_info **new_slot;
1476 elem.npeel = npeel;
1477 slot = peeling_htab->find (&elem);
1478 if (slot)
1479 slot->count++;
1480 else
1482 slot = XNEW (struct _vect_peel_info);
1483 slot->npeel = npeel;
1484 slot->dr_info = dr_info;
1485 slot->count = 1;
1486 new_slot = peeling_htab->find_slot (slot, INSERT);
1487 *new_slot = slot;
1490 /* If this DR is not supported with unknown misalignment then bias
1491 this slot when the cost model is disabled. */
1492 if (!supportable_if_not_aligned
1493 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1494 slot->count += VECT_MAX_COST;
1498 /* Traverse peeling hash table to find peeling option that aligns maximum
1499 number of data accesses. */
1501 int
1502 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1503 _vect_peel_extended_info *max)
1505 vect_peel_info elem = *slot;
1507 if (elem->count > max->peel_info.count
1508 || (elem->count == max->peel_info.count
1509 && max->peel_info.npeel > elem->npeel))
1511 max->peel_info.npeel = elem->npeel;
1512 max->peel_info.count = elem->count;
1513 max->peel_info.dr_info = elem->dr_info;
1516 return 1;
1519 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1520 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1521 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1522 after peeling. */
1524 static void
1525 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1526 dr_vec_info *dr0_info,
1527 unsigned int *inside_cost,
1528 unsigned int *outside_cost,
1529 stmt_vector_for_cost *body_cost_vec,
1530 stmt_vector_for_cost *prologue_cost_vec,
1531 unsigned int npeel)
1533 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1535 bool dr0_alignment_known_p
1536 = (dr0_info
1537 && known_alignment_for_access_p (dr0_info,
1538 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1540 for (data_reference *dr : datarefs)
1542 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1543 if (!vect_relevant_for_alignment_p (dr_info))
1544 continue;
1546 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1547 dr_alignment_support alignment_support_scheme;
1548 int misalignment;
1549 unsigned HOST_WIDE_INT alignment;
1551 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1552 size_zero_node) < 0;
1553 poly_int64 off = 0;
1554 if (negative)
1555 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1556 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1558 if (npeel == 0)
1559 misalignment = dr_misalignment (dr_info, vectype, off);
1560 else if (dr_info == dr0_info
1561 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1562 misalignment = 0;
1563 else if (!dr0_alignment_known_p
1564 || !known_alignment_for_access_p (dr_info, vectype)
1565 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1566 misalignment = DR_MISALIGNMENT_UNKNOWN;
1567 else
1569 misalignment = dr_misalignment (dr_info, vectype, off);
1570 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1571 misalignment &= alignment - 1;
1573 alignment_support_scheme
1574 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1575 misalignment);
1577 vect_get_data_access_cost (loop_vinfo, dr_info,
1578 alignment_support_scheme, misalignment,
1579 inside_cost, outside_cost,
1580 body_cost_vec, prologue_cost_vec);
1584 /* Traverse peeling hash table and calculate cost for each peeling option.
1585 Find the one with the lowest cost. */
1587 int
1588 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1589 _vect_peel_extended_info *min)
1591 vect_peel_info elem = *slot;
1592 int dummy;
1593 unsigned int inside_cost = 0, outside_cost = 0;
1594 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1595 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1596 epilogue_cost_vec;
1598 prologue_cost_vec.create (2);
1599 body_cost_vec.create (2);
1600 epilogue_cost_vec.create (2);
1602 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1603 &outside_cost, &body_cost_vec,
1604 &prologue_cost_vec, elem->npeel);
1606 body_cost_vec.release ();
1608 outside_cost += vect_get_known_peeling_cost
1609 (loop_vinfo, elem->npeel, &dummy,
1610 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1611 &prologue_cost_vec, &epilogue_cost_vec);
1613 /* Prologue and epilogue costs are added to the target model later.
1614 These costs depend only on the scalar iteration cost, the
1615 number of peeling iterations finally chosen, and the number of
1616 misaligned statements. So discard the information found here. */
1617 prologue_cost_vec.release ();
1618 epilogue_cost_vec.release ();
1620 if (inside_cost < min->inside_cost
1621 || (inside_cost == min->inside_cost
1622 && outside_cost < min->outside_cost))
1624 min->inside_cost = inside_cost;
1625 min->outside_cost = outside_cost;
1626 min->peel_info.dr_info = elem->dr_info;
1627 min->peel_info.npeel = elem->npeel;
1628 min->peel_info.count = elem->count;
1631 return 1;
1635 /* Choose best peeling option by traversing peeling hash table and either
1636 choosing an option with the lowest cost (if cost model is enabled) or the
1637 option that aligns as many accesses as possible. */
1639 static struct _vect_peel_extended_info
1640 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1641 loop_vec_info loop_vinfo)
1643 struct _vect_peel_extended_info res;
1645 res.peel_info.dr_info = NULL;
1646 res.vinfo = loop_vinfo;
1648 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1650 res.inside_cost = INT_MAX;
1651 res.outside_cost = INT_MAX;
1652 peeling_htab->traverse <_vect_peel_extended_info *,
1653 vect_peeling_hash_get_lowest_cost> (&res);
1655 else
1657 res.peel_info.count = 0;
1658 peeling_htab->traverse <_vect_peel_extended_info *,
1659 vect_peeling_hash_get_most_frequent> (&res);
1660 res.inside_cost = 0;
1661 res.outside_cost = 0;
1664 return res;
1667 /* Return true if the new peeling NPEEL is supported. */
1669 static bool
1670 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1671 unsigned npeel)
1673 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1676 bool dr0_alignment_known_p
1677 = known_alignment_for_access_p (dr0_info,
1678 STMT_VINFO_VECTYPE (dr0_info->stmt));
1680 /* Ensure that all data refs can be vectorized after the peel. */
1681 for (data_reference *dr : datarefs)
1683 if (dr == dr0_info->dr)
1684 continue;
1686 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1687 if (!vect_relevant_for_alignment_p (dr_info)
1688 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1689 continue;
1691 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1692 int misalignment;
1693 unsigned HOST_WIDE_INT alignment;
1694 if (!dr0_alignment_known_p
1695 || !known_alignment_for_access_p (dr_info, vectype)
1696 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1697 misalignment = DR_MISALIGNMENT_UNKNOWN;
1698 else
1700 misalignment = dr_misalignment (dr_info, vectype);
1701 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1702 misalignment &= alignment - 1;
1704 supportable_dr_alignment
1705 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1706 misalignment);
1707 if (supportable_dr_alignment == dr_unaligned_unsupported)
1708 return false;
1711 return true;
1714 /* Compare two data-references DRA and DRB to group them into chunks
1715 with related alignment. */
1717 static int
1718 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1720 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1721 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1722 int cmp;
1724 /* Stabilize sort. */
1725 if (dra == drb)
1726 return 0;
1728 /* Ordering of DRs according to base. */
1729 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1730 DR_BASE_ADDRESS (drb));
1731 if (cmp != 0)
1732 return cmp;
1734 /* And according to DR_OFFSET. */
1735 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1736 if (cmp != 0)
1737 return cmp;
1739 /* And after step. */
1740 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1741 if (cmp != 0)
1742 return cmp;
1744 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1745 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1746 if (cmp == 0)
1747 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1748 return cmp;
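/* For illustration only: given references a[i], a[i+1] and b[i] in the same
   loop (4-byte elements), a[i] and a[i+1] share DR_BASE_ADDRESS, DR_OFFSET
   and DR_STEP and differ only in DR_INIT (0 vs. 4), so the comparator above
   places them next to each other ordered by DR_INIT, while b[i] has a
   different base and ends up in a separate chunk.  */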
1751 /* Function vect_enhance_data_refs_alignment
1753 This pass will use loop versioning and loop peeling in order to enhance
1754 the alignment of data references in the loop.
1756 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1757 original loop is to be vectorized. Any other loops that are created by
1758 the transformations performed in this pass are not supposed to be
1759 vectorized. This restriction will be relaxed.
1761 This pass will require a cost model to guide it on whether to apply peeling
1762 or versioning or a combination of the two. For example, the scheme that
1763 Intel uses when given a loop with several memory accesses is as follows:
1764 choose one memory access ('p') whose alignment you want to force by doing
1765 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1766 other accesses are not necessarily aligned, or (2) use loop versioning to
1767 generate one loop in which all accesses are aligned, and another loop in
1768 which only 'p' is necessarily aligned.
1770 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1771 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1772 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1774 Devising a cost model is the most critical aspect of this work. It will
1775 guide us on which access to peel for, whether to use loop versioning, how
1776 many versions to create, etc. The cost model will probably consist of
1777 generic considerations as well as target specific considerations (on
1778 powerpc for example, misaligned stores are more painful than misaligned
1779 loads).
1781 Here are the general steps involved in alignment enhancements:
1783 -- original loop, before alignment analysis:
1784 for (i=0; i<N; i++){
1785 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1786 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1789 -- After vect_compute_data_refs_alignment:
1790 for (i=0; i<N; i++){
1791 x = q[i]; # DR_MISALIGNMENT(q) = 3
1792 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1795 -- Possibility 1: we do loop versioning:
1796 if (p is aligned) {
1797 for (i=0; i<N; i++){ # loop 1A
1798 x = q[i]; # DR_MISALIGNMENT(q) = 3
1799 p[i] = y; # DR_MISALIGNMENT(p) = 0
1802 else {
1803 for (i=0; i<N; i++){ # loop 1B
1804 x = q[i]; # DR_MISALIGNMENT(q) = 3
1805 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1809 -- Possibility 2: we do loop peeling:
1810 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1811 x = q[i];
1812 p[i] = y;
1814 for (i = 3; i < N; i++){ # loop 2A
1815 x = q[i]; # DR_MISALIGNMENT(q) = 0
1816 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1819 -- Possibility 3: combination of loop peeling and versioning:
1820 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1821 x = q[i];
1822 p[i] = y;
1824 if (p is aligned) {
1825 for (i = 3; i<N; i++){ # loop 3A
1826 x = q[i]; # DR_MISALIGNMENT(q) = 0
1827 p[i] = y; # DR_MISALIGNMENT(p) = 0
1830 else {
1831 for (i = 3; i<N; i++){ # loop 3B
1832 x = q[i]; # DR_MISALIGNMENT(q) = 0
1833 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1837 These loops are later passed to loop_transform to be vectorized. The
1838 vectorizer will use the alignment information to guide the transformation
1839 (whether to generate regular loads/stores, or with special handling for
1840 misalignment). */
1842 opt_result
1843 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1845 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1846 dr_vec_info *first_store = NULL;
1847 dr_vec_info *dr0_info = NULL;
1848 struct data_reference *dr;
1849 unsigned int i;
1850 bool do_peeling = false;
1851 bool do_versioning = false;
1852 unsigned int npeel = 0;
1853 bool one_misalignment_known = false;
1854 bool one_misalignment_unknown = false;
1855 bool one_dr_unsupportable = false;
1856 dr_vec_info *unsupportable_dr_info = NULL;
1857 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1858 hash_table<peel_info_hasher> peeling_htab (1);
1860 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1862 /* Reset data so we can safely be called multiple times. */
1863 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1864 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1866 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1867 return opt_result::success ();
1869 /* Sort the vector of datarefs so DRs that have the same or dependent
1870 alignment are next to each other. */
1871 auto_vec<data_reference_p> datarefs
1872 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1873 datarefs.qsort (dr_align_group_sort_cmp);
1875 /* For each dataref, compute the number of other DRs that become
1876 aligned when we peel so that this dataref becomes aligned. */
1877 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1878 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1879 unsigned i0;
1880 for (i0 = 0; i0 < datarefs.length (); ++i0)
1881 if (DR_BASE_ADDRESS (datarefs[i0]))
1882 break;
1883 for (i = i0 + 1; i <= datarefs.length (); ++i)
1885 if (i == datarefs.length ()
1886 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1887 DR_BASE_ADDRESS (datarefs[i]), 0)
1888 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1889 DR_OFFSET (datarefs[i]), 0)
1890 || !operand_equal_p (DR_STEP (datarefs[i0]),
1891 DR_STEP (datarefs[i]), 0))
1893 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1894 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1895 will get known misalignment if we align one of the refs
1896 with the largest DR_TARGET_ALIGNMENT. */
1897 for (unsigned j = i0; j < i; ++j)
1899 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1900 for (unsigned k = i0; k < i; ++k)
1902 if (k == j)
1903 continue;
1904 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1905 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1906 dr_infoj))
1907 n_same_align_refs[j]++;
1910 i0 = i;
1914 /* While cost model enhancements are expected in the future, the high level
1915 view of the code at this time is as follows:
1917 A) If there is a misaligned access then see if peeling to align
1918 this access can make all data references satisfy
1919 vect_supportable_dr_alignment. If so, update data structures
1920 as needed and return true.
1922 B) If peeling wasn't possible and there is a data reference with an
1923 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1924 then see if loop versioning checks can be used to make all data
1925 references satisfy vect_supportable_dr_alignment. If so, update
1926 data structures as needed and return true.
1928 C) If neither peeling nor versioning were successful then return false if
1929 any data reference does not satisfy vect_supportable_dr_alignment.
1931 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1933 Note, Possibility 3 above (which is peeling and versioning together) is not
1934 being done at this time. */
1936 /* (1) Peeling to force alignment. */
1938 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1939 Considerations:
1940 + How many accesses will become aligned due to the peeling
1941 - How many accesses will become unaligned due to the peeling,
1942 and the cost of misaligned accesses.
1943 - The cost of peeling (the extra runtime checks, the increase
1944 in code size). */
1946 FOR_EACH_VEC_ELT (datarefs, i, dr)
1948 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1949 if (!vect_relevant_for_alignment_p (dr_info))
1950 continue;
1952 stmt_vec_info stmt_info = dr_info->stmt;
1953 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1954 do_peeling = vector_alignment_reachable_p (dr_info);
1955 if (do_peeling)
1957 if (known_alignment_for_access_p (dr_info, vectype))
1959 unsigned int npeel_tmp = 0;
1960 bool negative = tree_int_cst_compare (DR_STEP (dr),
1961 size_zero_node) < 0;
1963 /* If known_alignment_for_access_p then we have set
1964 DR_MISALIGNMENT, which is only done if we know it at compile
1965 time, so it is safe to assume the target alignment is constant.
1967 unsigned int target_align =
1968 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1969 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1970 poly_int64 off = 0;
1971 if (negative)
1972 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1973 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1974 mis = negative ? mis : -mis;
1975 if (mis != 0)
1976 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1978 /* For multiple types, it is possible that the bigger type access
1979 will have more than one peeling option. E.g., a loop with two
1980 types: one of size (vector size / 4), and the other one of
1981 size (vector size / 8). The vectorization factor will be 8. If both
1982 accesses are misaligned by 3, the first one needs one scalar
1983 iteration to be aligned, and the second one needs 5. But the
1984 first one will also be aligned by peeling 5 scalar
1985 iterations, and in that case both accesses will be aligned.
1986 Hence, besides the immediate peeling amount, we also want
1987 to try adding a full vector size, as long as we don't exceed the
1988 vectorization factor.
1989 We do this automatically when using the cost model, since we
1990 calculate the cost for every peeling option. */
1991 poly_uint64 nscalars = npeel_tmp;
1992 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1994 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1995 nscalars = (STMT_SLP_TYPE (stmt_info)
1996 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1999 /* Save info about DR in the hash table. Also include peeling
2000 amounts according to the explanation above. Indicate
2001 the alignment status when the ref is not aligned.
2002 ??? Rather than using unknown alignment here we should
2003 prune all entries from the peeling hashtable which cause
2004 DRs to be unsupported. */
2005 bool supportable_if_not_aligned
2006 = vect_supportable_dr_alignment
2007 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2008 while (known_le (npeel_tmp, nscalars))
2010 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2011 dr_info, npeel_tmp,
2012 supportable_if_not_aligned);
2013 npeel_tmp += MAX (1, target_align / dr_size);
2016 one_misalignment_known = true;
2018 else
2020 /* If we don't know any misalignment values, we prefer
2021 peeling for the data-ref that has the maximum number of data-refs
2022 with the same alignment, unless the target prefers to align
2023 stores over loads. */
2024 unsigned same_align_drs = n_same_align_refs[i];
2025 if (!dr0_info
2026 || dr0_same_align_drs < same_align_drs)
2028 dr0_same_align_drs = same_align_drs;
2029 dr0_info = dr_info;
2031 /* For data-refs with the same number of related
2032 accesses prefer the one where the misalign
2033 computation will be invariant in the outermost loop. */
2034 else if (dr0_same_align_drs == same_align_drs)
2036 class loop *ivloop0, *ivloop;
2037 ivloop0 = outermost_invariant_loop_for_expr
2038 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2039 ivloop = outermost_invariant_loop_for_expr
2040 (loop, DR_BASE_ADDRESS (dr));
2041 if ((ivloop && !ivloop0)
2042 || (ivloop && ivloop0
2043 && flow_loop_nested_p (ivloop, ivloop0)))
2044 dr0_info = dr_info;
2047 one_misalignment_unknown = true;
2049 /* Check for data refs with unsupportable alignment that
2050 can be peeled. */
2051 enum dr_alignment_support supportable_dr_alignment
2052 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2053 DR_MISALIGNMENT_UNKNOWN);
2054 if (supportable_dr_alignment == dr_unaligned_unsupported)
2056 one_dr_unsupportable = true;
2057 unsupportable_dr_info = dr_info;
2060 if (!first_store && DR_IS_WRITE (dr))
2062 first_store = dr_info;
2063 first_store_same_align_drs = same_align_drs;
2067 else
2069 if (!aligned_access_p (dr_info, vectype))
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073 "vector alignment may not be reachable\n");
2074 break;
2079 /* Check if we can possibly peel the loop. */
2080 if (!vect_can_advance_ivs_p (loop_vinfo)
2081 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2082 || loop->inner)
2083 do_peeling = false;
2085 struct _vect_peel_extended_info peel_for_known_alignment;
2086 struct _vect_peel_extended_info peel_for_unknown_alignment;
2087 struct _vect_peel_extended_info best_peel;
2089 peel_for_unknown_alignment.inside_cost = INT_MAX;
2090 peel_for_unknown_alignment.outside_cost = INT_MAX;
2091 peel_for_unknown_alignment.peel_info.count = 0;
2093 if (do_peeling
2094 && one_misalignment_unknown)
2096 /* Check if the target requires us to prefer stores over loads, i.e., if
2097 misaligned stores are more expensive than misaligned loads (taking
2098 DRs with the same alignment into account). */
2099 unsigned int load_inside_cost = 0;
2100 unsigned int load_outside_cost = 0;
2101 unsigned int store_inside_cost = 0;
2102 unsigned int store_outside_cost = 0;
2103 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2105 stmt_vector_for_cost dummy;
2106 dummy.create (2);
2107 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2108 &load_inside_cost,
2109 &load_outside_cost,
2110 &dummy, &dummy, estimated_npeels);
2111 dummy.release ();
2113 if (first_store)
2115 dummy.create (2);
2116 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2117 &store_inside_cost,
2118 &store_outside_cost,
2119 &dummy, &dummy,
2120 estimated_npeels);
2121 dummy.release ();
2123 else
2125 store_inside_cost = INT_MAX;
2126 store_outside_cost = INT_MAX;
2129 if (load_inside_cost > store_inside_cost
2130 || (load_inside_cost == store_inside_cost
2131 && load_outside_cost > store_outside_cost))
2133 dr0_info = first_store;
2134 dr0_same_align_drs = first_store_same_align_drs;
2135 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2136 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2138 else
2140 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2141 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2144 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2145 prologue_cost_vec.create (2);
2146 epilogue_cost_vec.create (2);
2148 int dummy2;
2149 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2150 (loop_vinfo, estimated_npeels, &dummy2,
2151 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2152 &prologue_cost_vec, &epilogue_cost_vec);
2154 prologue_cost_vec.release ();
2155 epilogue_cost_vec.release ();
2157 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2160 peel_for_unknown_alignment.peel_info.npeel = 0;
2161 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2163 best_peel = peel_for_unknown_alignment;
2165 peel_for_known_alignment.inside_cost = INT_MAX;
2166 peel_for_known_alignment.outside_cost = INT_MAX;
2167 peel_for_known_alignment.peel_info.count = 0;
2168 peel_for_known_alignment.peel_info.dr_info = NULL;
2170 if (do_peeling && one_misalignment_known)
2172 /* Peeling is possible, but no data access is unsupported when
2173 unaligned. So we try to choose the best possible peeling from
2174 the hash table. */
2175 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2176 (&peeling_htab, loop_vinfo);
2179 /* Compare costs of peeling for known and unknown alignment. */
2180 if (peel_for_known_alignment.peel_info.dr_info != NULL
2181 && peel_for_unknown_alignment.inside_cost
2182 >= peel_for_known_alignment.inside_cost)
2184 best_peel = peel_for_known_alignment;
2186 /* If the best peeling for known alignment has NPEEL == 0, perform no
2187 peeling at all except if there is an unsupportable dr that we can
2188 align. */
2189 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2190 do_peeling = false;
2193 /* If there is an unsupportable data ref, prefer this over all choices so far
2194 since we'd have to discard a chosen peeling except when it accidentally
2195 aligned the unsupportable data ref. */
2196 if (one_dr_unsupportable)
2197 dr0_info = unsupportable_dr_info;
2198 else if (do_peeling)
2200 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2201 TODO: Use nopeel_outside_cost or get rid of it? */
2202 unsigned nopeel_inside_cost = 0;
2203 unsigned nopeel_outside_cost = 0;
2205 stmt_vector_for_cost dummy;
2206 dummy.create (2);
2207 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2208 &nopeel_outside_cost, &dummy, &dummy, 0);
2209 dummy.release ();
2211 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2212 costs will be recorded. */
2213 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2214 prologue_cost_vec.create (2);
2215 epilogue_cost_vec.create (2);
2217 int dummy2;
2218 nopeel_outside_cost += vect_get_known_peeling_cost
2219 (loop_vinfo, 0, &dummy2,
2220 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2221 &prologue_cost_vec, &epilogue_cost_vec);
2223 prologue_cost_vec.release ();
2224 epilogue_cost_vec.release ();
2226 npeel = best_peel.peel_info.npeel;
2227 dr0_info = best_peel.peel_info.dr_info;
2229 /* If not peeling is no more expensive than the best peeling found
2230 so far, don't perform any peeling. */
2231 if (nopeel_inside_cost <= best_peel.inside_cost)
2232 do_peeling = false;
2235 if (do_peeling)
2237 stmt_vec_info stmt_info = dr0_info->stmt;
2238 if (known_alignment_for_access_p (dr0_info,
2239 STMT_VINFO_VECTYPE (stmt_info)))
2241 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2242 size_zero_node) < 0;
2243 if (!npeel)
2245 /* Since it's known at compile time, compute the number of
2246 iterations in the peeled loop (the peeling factor) for use in
2247 updating DR_MISALIGNMENT values. The peeling factor is the
2248 vectorization factor minus the misalignment as an element
2249 count. */
2250 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2251 poly_int64 off = 0;
2252 if (negative)
2253 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2254 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2255 unsigned int mis
2256 = dr_misalignment (dr0_info, vectype, off);
2257 mis = negative ? mis : -mis;
2258 /* If known_alignment_for_access_p then we have set
2259 DR_MISALIGNMENT, which is only done if we know it at compile
2260 time, so it is safe to assume the target alignment is constant.
2262 unsigned int target_align =
2263 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2264 npeel = ((mis & (target_align - 1))
2265 / vect_get_scalar_dr_size (dr0_info));
2268 /* For interleaved data access every iteration accesses all the
2269 members of the group, therefore we divide the number of iterations
2270 by the group size. */
2271 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2272 npeel /= DR_GROUP_SIZE (stmt_info);
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "Try peeling by %d\n", npeel);
2279 /* Ensure that all datarefs can be vectorized after the peel. */
2280 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2281 do_peeling = false;
2283 /* Check if all datarefs are supportable and log. */
2284 if (do_peeling
2285 && npeel == 0
2286 && known_alignment_for_access_p (dr0_info,
2287 STMT_VINFO_VECTYPE (stmt_info)))
2288 return opt_result::success ();
2290 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2291 if (do_peeling)
2293 unsigned max_allowed_peel
2294 = param_vect_max_peeling_for_alignment;
2295 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2296 max_allowed_peel = 0;
2297 if (max_allowed_peel != (unsigned)-1)
2299 unsigned max_peel = npeel;
2300 if (max_peel == 0)
2302 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2303 unsigned HOST_WIDE_INT target_align_c;
2304 if (target_align.is_constant (&target_align_c))
2305 max_peel =
2306 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2307 else
2309 do_peeling = false;
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "Disable peeling, max peels set and vector"
2313 " alignment unknown\n");
2316 if (max_peel > max_allowed_peel)
2318 do_peeling = false;
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "Disable peeling, max peels reached: %d\n", max_peel);
2326 /* Cost model #2 - if peeling may result in a remaining loop not
2327 iterating enough to be vectorized then do not peel. Since this
2328 is a cost heuristic rather than a correctness decision, use the
2329 most likely runtime value for variable vectorization factors. */
2330 if (do_peeling
2331 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2333 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2334 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2335 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2336 < assumed_vf + max_peel)
2337 do_peeling = false;
2340 if (do_peeling)
2342 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2343 If the misalignment of DR_i is identical to that of dr0 then set
2344 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2345 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2346 by the peeling factor times the element size of DR_i (MOD the
2347 vectorization factor times the size). Otherwise, the
2348 misalignment of DR_i must be set to unknown. */
2349 FOR_EACH_VEC_ELT (datarefs, i, dr)
2350 if (dr != dr0_info->dr)
2352 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2353 if (!vect_relevant_for_alignment_p (dr_info))
2354 continue;
2356 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2359 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2360 if (npeel)
2361 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2362 else
2363 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2364 SET_DR_MISALIGNMENT (dr0_info,
2365 vect_dr_misalign_for_aligned_access (dr0_info));
2366 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Alignment of access forced using peeling.\n");
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "Peeling for alignment will be applied.\n");
2374 /* The inside-loop cost will be accounted for in vectorizable_load
2375 and vectorizable_store correctly with adjusted alignments.
2376 Drop the body_cst_vec on the floor here. */
2377 return opt_result::success ();
2381 /* (2) Versioning to force alignment. */
2383 /* Try versioning if:
2384 1) optimize loop for speed and the cost-model is not cheap
2385 2) there is at least one unsupported misaligned data ref with an unknown
2386 misalignment, and
2387 3) all misaligned data refs with a known misalignment are supported, and
2388 4) the number of runtime alignment checks is within reason. */
2390 do_versioning
2391 = (optimize_loop_nest_for_speed_p (loop)
2392 && !loop->inner /* FORNOW */
2393 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2395 if (do_versioning)
2397 FOR_EACH_VEC_ELT (datarefs, i, dr)
2399 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2400 if (!vect_relevant_for_alignment_p (dr_info))
2401 continue;
2403 stmt_vec_info stmt_info = dr_info->stmt;
2404 if (STMT_VINFO_STRIDED_P (stmt_info))
2406 do_versioning = false;
2407 break;
2410 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2411 bool negative = tree_int_cst_compare (DR_STEP (dr),
2412 size_zero_node) < 0;
2413 poly_int64 off = 0;
2414 if (negative)
2415 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2416 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2417 int misalignment;
2418 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2419 continue;
2421 enum dr_alignment_support supportable_dr_alignment
2422 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2423 misalignment);
2424 if (supportable_dr_alignment == dr_unaligned_unsupported)
2426 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2427 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2428 >= (unsigned) param_vect_max_version_for_alignment_checks))
2430 do_versioning = false;
2431 break;
2434 /* At present we don't support versioning for alignment
2435 with variable VF, since there's no guarantee that the
2436 VF is a power of two. We could relax this if we added
2437 a way of enforcing a power-of-two size. */
2438 unsigned HOST_WIDE_INT size;
2439 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2441 do_versioning = false;
2442 break;
2445 /* Forcing alignment in the first iteration is no good if
2446 we don't keep it across iterations. For now, just disable
2447 versioning in this case.
2448 ?? We could actually unroll the loop to achieve the required
2449 overall step alignment, and forcing the alignment could be
2450 done by doing some iterations of the non-vectorized loop. */
2451 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2452 * DR_STEP_ALIGNMENT (dr),
2453 DR_TARGET_ALIGNMENT (dr_info)))
2455 do_versioning = false;
2456 break;
2459 /* The rightmost bits of an aligned address must be zeros.
2460 Construct the mask needed for this test. For example,
2461 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2462 mask must be 15 = 0xf. */
2463 int mask = size - 1;
2465 /* FORNOW: use the same mask to test all potentially unaligned
2466 references in the loop. */
2467 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2468 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2470 do_versioning = false;
2471 break;
2474 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2475 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2479 /* Versioning requires at least one misaligned data reference. */
2480 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2481 do_versioning = false;
2482 else if (!do_versioning)
2483 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2486 if (do_versioning)
2488 const vec<stmt_vec_info> &may_misalign_stmts
2489 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2490 stmt_vec_info stmt_info;
2492 /* It can now be assumed that the data references in the statements
2493 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2494 of the loop being vectorized. */
2495 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2497 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2498 SET_DR_MISALIGNMENT (dr_info,
2499 vect_dr_misalign_for_aligned_access (dr_info));
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_NOTE, vect_location,
2502 "Alignment of access forced using versioning.\n");
2505 if (dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE, vect_location,
2507 "Versioning for alignment will be applied.\n");
2509 /* Peeling and versioning can't be done together at this time. */
2510 gcc_assert (! (do_peeling && do_versioning));
2512 return opt_result::success ();
2515 /* This point is reached if neither peeling nor versioning is being done. */
2516 gcc_assert (! (do_peeling || do_versioning));
2518 return opt_result::success ();
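/* When the versioning path is taken above, LOOP_VINFO_PTR_MASK holds
   GET_MODE_SIZE (vector mode) - 1 and the runtime guard is emitted later by
   the loop-versioning code, not here.  Conceptually, assuming 16-byte
   vectors and two potentially misaligned pointers p and q recorded via
   LOOP_VINFO_MAY_MISALIGN_STMTS, that guard behaves like

     if ((((uintptr_t) p | (uintptr_t) q) & 15) == 0)
       ... vectorized loop, all recorded accesses assumed aligned ...
     else
       ... unvectorized fallback loop ...

   where the exact form of the check is up to the versioning code.  */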
2522 /* Function vect_analyze_data_refs_alignment
2524 Analyze the alignment of the data-references in the loop.
2525 Return FALSE if a data reference is found that cannot be vectorized. */
2527 opt_result
2528 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2530 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2532 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2533 struct data_reference *dr;
2534 unsigned int i;
2536 vect_record_base_alignments (loop_vinfo);
2537 FOR_EACH_VEC_ELT (datarefs, i, dr)
2539 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2540 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2542 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2543 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2544 continue;
2545 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2546 STMT_VINFO_VECTYPE (dr_info->stmt));
2550 return opt_result::success ();
2554 /* Analyze alignment of DRs of stmts in NODE. */
2556 static bool
2557 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2559 /* Alignment is maintained in the first element of the group. */
2560 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2561 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2562 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2563 tree vectype = SLP_TREE_VECTYPE (node);
2564 poly_uint64 vector_alignment
2565 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2566 BITS_PER_UNIT);
2567 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2568 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2569 /* Re-analyze alignment when we're facing a vectorization with a bigger
2570 alignment requirement. */
2571 else if (known_lt (dr_info->target_alignment, vector_alignment))
2573 poly_uint64 old_target_alignment = dr_info->target_alignment;
2574 int old_misalignment = dr_info->misalignment;
2575 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2576 /* But keep knowledge about a smaller alignment. */
2577 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2578 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2580 dr_info->target_alignment = old_target_alignment;
2581 dr_info->misalignment = old_misalignment;
2584 /* If we ever face unordered target alignments, the first one wins in terms
2585 of analysis and the other will become unknown in dr_misalignment. */
2586 return true;
2589 /* Function vect_slp_analyze_instance_alignment
2591 Analyze the alignment of the data-references in the SLP instance.
2592 Return FALSE if a data reference is found that cannot be vectorized. */
2594 bool
2595 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2596 slp_instance instance)
2598 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2600 slp_tree node;
2601 unsigned i;
2602 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2603 if (! vect_slp_analyze_node_alignment (vinfo, node))
2604 return false;
2606 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2607 && ! vect_slp_analyze_node_alignment
2608 (vinfo, SLP_INSTANCE_TREE (instance)))
2609 return false;
2611 return true;
2615 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2616 accesses of legal size, step, etc. Detect gaps, single element
2617 interleaving, and other special cases. Set grouped access info.
2618 Collect groups of strided stores for further use in SLP analysis.
2619 Worker for vect_analyze_group_access. */
2621 static bool
2622 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2624 data_reference *dr = dr_info->dr;
2625 tree step = DR_STEP (dr);
2626 tree scalar_type = TREE_TYPE (DR_REF (dr));
2627 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2628 stmt_vec_info stmt_info = dr_info->stmt;
2629 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2630 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2631 HOST_WIDE_INT dr_step = -1;
2632 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2633 bool slp_impossible = false;
2635 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2636 size of the interleaving group (including gaps). */
2637 if (tree_fits_shwi_p (step))
2639 dr_step = tree_to_shwi (step);
2640 /* Check that STEP is a multiple of type size. Otherwise there is
2641 a non-element-sized gap at the end of the group which we
2642 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2643 ??? As we can handle non-constant step fine here we should
2644 simply remove uses of DR_GROUP_GAP between the last and first
2645 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2646 simply not include that gap. */
2647 if ((dr_step % type_size) != 0)
2649 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "Step %T is not a multiple of the element size"
2652 " for %T\n",
2653 step, DR_REF (dr));
2654 return false;
2656 groupsize = absu_hwi (dr_step) / type_size;
2658 else
2659 groupsize = 0;
2661 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2662 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2664 /* Check if this DR is part of an interleaving group, and is a single
2665 element of the group that is accessed in the loop. */
2667 /* Gaps are supported only for loads. STEP must be a multiple of the type
2668 size. */
2669 if (DR_IS_READ (dr)
2670 && (dr_step % type_size) == 0
2671 && groupsize > 0
2672 /* This could be UINT_MAX but as we are generating code in a very
2673 inefficient way we have to cap earlier.
2674 See PR91403 for example. */
2675 && groupsize <= 4096)
2677 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2678 DR_GROUP_SIZE (stmt_info) = groupsize;
2679 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "Detected single element interleaving %T"
2683 " step %T\n",
2684 DR_REF (dr), step);
2686 return true;
2689 if (dump_enabled_p ())
2690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2691 "not consecutive access %G", stmt_info->stmt);
2693 if (bb_vinfo)
2695 /* Mark the statement as unvectorizable. */
2696 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2697 return true;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2702 STMT_VINFO_STRIDED_P (stmt_info) = true;
2703 return true;
2706 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2708 /* First stmt in the interleaving chain. Check the chain. */
2709 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2710 struct data_reference *data_ref = dr;
2711 unsigned int count = 1;
2712 tree prev_init = DR_INIT (data_ref);
2713 HOST_WIDE_INT diff, gaps = 0;
2715 /* By construction, all group members have INTEGER_CST DR_INITs. */
2716 while (next)
2718 /* We never have the same DR multiple times. */
2719 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2720 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2722 data_ref = STMT_VINFO_DATA_REF (next);
2724 /* All group members have the same STEP by construction. */
2725 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2727 /* Check that the distance between two accesses is equal to the type
2728 size. Otherwise, we have gaps. */
2729 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2730 - TREE_INT_CST_LOW (prev_init)) / type_size;
2731 if (diff < 1 || diff > UINT_MAX)
2733 /* For artificial testcases with array accesses with large
2734 constant indices we can run into overflow issues which
2735 can end up fooling the groupsize constraint below so
2736 check the individual gaps (which are represented as
2737 unsigned int) as well. */
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2740 "interleaved access with gap larger "
2741 "than representable\n");
2742 return false;
2744 if (diff != 1)
2746 /* FORNOW: SLP of accesses with gaps is not supported. */
2747 slp_impossible = true;
2748 if (DR_IS_WRITE (data_ref))
2750 if (dump_enabled_p ())
2751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2752 "interleaved store with gaps\n");
2753 return false;
2756 gaps += diff - 1;
2759 last_accessed_element += diff;
2761 /* Store the gap from the previous member of the group. If there is no
2762 gap in the access, DR_GROUP_GAP is always 1. */
2763 DR_GROUP_GAP (next) = diff;
2765 prev_init = DR_INIT (data_ref);
2766 next = DR_GROUP_NEXT_ELEMENT (next);
2767 /* Count the number of data-refs in the chain. */
2768 count++;
2771 if (groupsize == 0)
2772 groupsize = count + gaps;
2774 /* This could be UINT_MAX but as we are generating code in a very
2775 inefficient way we have to cap earlier. See PR78699 for example. */
2776 if (groupsize > 4096)
2778 if (dump_enabled_p ())
2779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2780 "group is too large\n");
2781 return false;
2784 /* Check that the size of the interleaving is equal to count for stores,
2785 i.e., that there are no gaps. */
2786 if (groupsize != count
2787 && !DR_IS_READ (dr))
2789 groupsize = count;
2790 STMT_VINFO_STRIDED_P (stmt_info) = true;
2793 /* If there is a gap after the last load in the group it is the
2794 difference between the groupsize and the last accessed
2795 element.
2796 When there is no gap, this difference should be 0. */
2797 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2799 DR_GROUP_SIZE (stmt_info) = groupsize;
2800 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_NOTE, vect_location,
2803 "Detected interleaving ");
2804 if (DR_IS_READ (dr))
2805 dump_printf (MSG_NOTE, "load ");
2806 else if (STMT_VINFO_STRIDED_P (stmt_info))
2807 dump_printf (MSG_NOTE, "strided store ");
2808 else
2809 dump_printf (MSG_NOTE, "store ");
2810 dump_printf (MSG_NOTE, "of size %u\n",
2811 (unsigned)groupsize);
2812 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2813 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2814 while (next)
2816 if (DR_GROUP_GAP (next) != 1)
2817 dump_printf_loc (MSG_NOTE, vect_location,
2818 "\t<gap of %d elements>\n",
2819 DR_GROUP_GAP (next) - 1);
2820 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2821 next = DR_GROUP_NEXT_ELEMENT (next);
2823 if (DR_GROUP_GAP (stmt_info) != 0)
2824 dump_printf_loc (MSG_NOTE, vect_location,
2825 "\t<gap of %d elements>\n",
2826 DR_GROUP_GAP (stmt_info));
2829 /* SLP: create an SLP data structure for every interleaving group of
2830 stores for further analysis in vect_analyze_slp. */
2831 if (DR_IS_WRITE (dr) && !slp_impossible)
2833 if (loop_vinfo)
2834 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2835 if (bb_vinfo)
2836 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2840 return true;
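/* A worked example of the bookkeeping above, for illustration only: a loop
   that only loads p[4*i], p[4*i+1] and p[4*i+2] of a 4-byte type has
   DR_STEP 16, so GROUPSIZE = 16 / 4 = 4.  Walking the chain gives COUNT = 3
   with element distances of 1, hence LAST_ACCESSED_ELEMENT = 3 and the
   first statement gets DR_GROUP_GAP = 4 - 3 = 1, i.e. one unused element at
   the end of each group; because all members are reads this gap is fine.  */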
2843 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2844 accesses of legal size, step, etc. Detect gaps, single element
2845 interleaving, and other special cases. Set grouped access info.
2846 Collect groups of strided stores for further use in SLP analysis. */
2848 static bool
2849 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2851 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2853 /* Dissolve the group if present. */
2854 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2855 while (stmt_info)
2857 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2858 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2859 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2860 stmt_info = next;
2862 return false;
2864 return true;
2867 /* Analyze the access pattern of the data-reference DR_INFO.
2868 In case of non-consecutive accesses call vect_analyze_group_access() to
2869 analyze groups of accesses. */
2871 static bool
2872 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2874 data_reference *dr = dr_info->dr;
2875 tree step = DR_STEP (dr);
2876 tree scalar_type = TREE_TYPE (DR_REF (dr));
2877 stmt_vec_info stmt_info = dr_info->stmt;
2878 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2879 class loop *loop = NULL;
2881 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2882 return true;
2884 if (loop_vinfo)
2885 loop = LOOP_VINFO_LOOP (loop_vinfo);
2887 if (loop_vinfo && !step)
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "bad data-ref access in loop\n");
2892 return false;
2895 /* Allow loads with zero step in inner-loop vectorization. */
2896 if (loop_vinfo && integer_zerop (step))
2898 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2899 if (!nested_in_vect_loop_p (loop, stmt_info))
2900 return DR_IS_READ (dr);
2901 /* Allow references with zero step for outer loops marked
2902 with pragma omp simd only - it guarantees absence of
2903 loop-carried dependencies between inner loop iterations. */
2904 if (loop->safelen < 2)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_NOTE, vect_location,
2908 "zero step in inner loop of nest\n");
2909 return false;
2913 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2915 /* Interleaved accesses are not yet supported within outer-loop
2916 vectorization for references in the inner-loop. */
2917 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2919 /* For the rest of the analysis we use the outer-loop step. */
2920 step = STMT_VINFO_DR_STEP (stmt_info);
2921 if (integer_zerop (step))
2923 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_NOTE, vect_location,
2925 "zero step in outer loop.\n");
2926 return DR_IS_READ (dr);
2930 /* Consecutive? */
2931 if (TREE_CODE (step) == INTEGER_CST)
2933 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2934 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2935 || (dr_step < 0
2936 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2938 /* Mark that it is not interleaving. */
2939 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2940 return true;
2944 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2946 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_NOTE, vect_location,
2948 "grouped access in outer loop.\n");
2949 return false;
2953 /* Assume this is a DR handled by the non-constant strided load case. */
2954 if (TREE_CODE (step) != INTEGER_CST)
2955 return (STMT_VINFO_STRIDED_P (stmt_info)
2956 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2957 || vect_analyze_group_access (vinfo, dr_info)));
2959 /* Not a consecutive access - check if it is part of an interleaving group. */
2960 return vect_analyze_group_access (vinfo, dr_info);
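/* For instance (hypothetical references): a load of d[i] with double
   elements has DR_STEP 8 == TYPE_SIZE_UNIT (double), is classified as
   consecutive above and never reaches the group analysis, whereas a load of
   d[2*i] has DR_STEP 16 and is handed to vect_analyze_group_access, which
   can record it as a single-element interleaving group of size 2 with a gap
   of 1.  */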
2963 /* Compare two data-references DRA and DRB to group them into chunks
2964 suitable for grouping. */
2966 static int
2967 dr_group_sort_cmp (const void *dra_, const void *drb_)
2969 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2970 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2971 data_reference_p dra = dra_info->dr;
2972 data_reference_p drb = drb_info->dr;
2973 int cmp;
2975 /* Stabilize sort. */
2976 if (dra == drb)
2977 return 0;
2979 /* DRs with different group IDs never belong to the same group. */
2980 if (dra_info->group != drb_info->group)
2981 return dra_info->group < drb_info->group ? -1 : 1;
2983 /* Ordering of DRs according to base. */
2984 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2985 DR_BASE_ADDRESS (drb));
2986 if (cmp != 0)
2987 return cmp;
2989 /* And according to DR_OFFSET. */
2990 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2991 if (cmp != 0)
2992 return cmp;
2994 /* Put reads before writes. */
2995 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2996 return DR_IS_READ (dra) ? -1 : 1;
2998 /* Then sort after access size. */
2999 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3000 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3001 if (cmp != 0)
3002 return cmp;
3004 /* And after step. */
3005 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3006 if (cmp != 0)
3007 return cmp;
3009 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3010 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3011 if (cmp == 0)
3012 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3013 return cmp;
3016 /* If OP is the result of a conversion, return the unconverted value,
3017 otherwise return null. */
3019 static tree
3020 strip_conversion (tree op)
3022 if (TREE_CODE (op) != SSA_NAME)
3023 return NULL_TREE;
3024 gimple *stmt = SSA_NAME_DEF_STMT (op);
3025 if (!is_gimple_assign (stmt)
3026 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3027 return NULL_TREE;
3028 return gimple_assign_rhs1 (stmt);
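/* For example, given a conversion statement like _2 = (unsigned char) _1,
   strip_conversion (_2) returns _1; for an SSA name whose defining
   statement is not a conversion it returns NULL_TREE.  */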
3031 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3032 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3033 be grouped in SLP mode. */
3035 static bool
3036 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3037 bool allow_slp_p)
3039 if (gimple_assign_single_p (stmt1_info->stmt))
3040 return gimple_assign_single_p (stmt2_info->stmt);
3042 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3043 if (call1 && gimple_call_internal_p (call1))
3045 /* Check for two masked loads or two masked stores. */
3046 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3047 if (!call2 || !gimple_call_internal_p (call2))
3048 return false;
3049 internal_fn ifn = gimple_call_internal_fn (call1);
3050 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3051 return false;
3052 if (ifn != gimple_call_internal_fn (call2))
3053 return false;
3055 /* Check that the masks are the same. Cope with casts of masks,
3056 like those created by build_mask_conversion. */
3057 tree mask1 = gimple_call_arg (call1, 2);
3058 tree mask2 = gimple_call_arg (call2, 2);
3059 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3061 mask1 = strip_conversion (mask1);
3062 if (!mask1)
3063 return false;
3064 mask2 = strip_conversion (mask2);
3065 if (!mask2)
3066 return false;
3067 if (!operand_equal_p (mask1, mask2, 0))
3068 return false;
3070 return true;
3073 return false;
3076 /* Function vect_analyze_data_ref_accesses.
3078 Analyze the access pattern of all the data references in the loop.
3080 FORNOW: the only access pattern that is considered vectorizable is a
3081 simple step 1 (consecutive) access.
3083 FORNOW: handle only arrays and pointer accesses. */
3085 opt_result
3086 vect_analyze_data_ref_accesses (vec_info *vinfo,
3087 vec<int> *dataref_groups)
3089 unsigned int i;
3090 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3092 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3094 if (datarefs.is_empty ())
3095 return opt_result::success ();
3097 /* Sort the array of datarefs to make building the interleaving chains
3098 linear. Don't modify the original vector's order; it is needed for
3099 determining what dependencies are reversed. */
3100 vec<dr_vec_info *> datarefs_copy;
3101 datarefs_copy.create (datarefs.length ());
3102 for (unsigned i = 0; i < datarefs.length (); i++)
3104 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3105 /* If the caller computed DR grouping use that, otherwise group by
3106 basic blocks. */
3107 if (dataref_groups)
3108 dr_info->group = (*dataref_groups)[i];
3109 else
3110 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3111 datarefs_copy.quick_push (dr_info);
3113 datarefs_copy.qsort (dr_group_sort_cmp);
3114 hash_set<stmt_vec_info> to_fixup;
3116 /* Build the interleaving chains. */
3117 for (i = 0; i < datarefs_copy.length () - 1;)
3119 dr_vec_info *dr_info_a = datarefs_copy[i];
3120 data_reference_p dra = dr_info_a->dr;
3121 int dra_group_id = dr_info_a->group;
3122 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3123 stmt_vec_info lastinfo = NULL;
3124 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3125 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3127 ++i;
3128 continue;
3130 for (i = i + 1; i < datarefs_copy.length (); ++i)
3132 dr_vec_info *dr_info_b = datarefs_copy[i];
3133 data_reference_p drb = dr_info_b->dr;
3134 int drb_group_id = dr_info_b->group;
3135 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3136 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3137 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3138 break;
3140 /* ??? Imperfect sorting (non-compatible types, non-modulo
3141 accesses, same accesses) can lead to a group being artificially
3142 split here as we don't just skip over those. If it really
3143 matters we can push those to a worklist and re-iterate
3144 over them. Then we can just skip ahead to the next DR here. */
3146 /* DRs in a different DR group should not be put into the same
3147 interleaving group. */
3148 if (dra_group_id != drb_group_id)
3149 break;
3151 /* Check that the data-refs have the same first location (except init)
3152 and that they are both either stores or loads (not one load and one
3153 store, and not a mix of masked and unmasked accesses). */
3154 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3155 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3156 DR_BASE_ADDRESS (drb)) != 0
3157 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3158 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3159 break;
3161 /* Check that the data-refs have the same constant size. */
3162 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3163 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3164 if (!tree_fits_uhwi_p (sza)
3165 || !tree_fits_uhwi_p (szb)
3166 || !tree_int_cst_equal (sza, szb))
3167 break;
3169 /* Check that the data-refs have the same step. */
3170 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3171 break;
3173 /* Check the types are compatible.
3174 ??? We don't distinguish this during sorting. */
3175 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3176 TREE_TYPE (DR_REF (drb))))
3177 break;
3179 /* Check that the DR_INITs are compile-time constants. */
3180 if (!tree_fits_shwi_p (DR_INIT (dra))
3181 || !tree_fits_shwi_p (DR_INIT (drb)))
3182 break;
3184 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3185 just hold extra information. */
3186 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3187 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3188 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3189 break;
3191 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3192 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3193 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3194 HOST_WIDE_INT init_prev
3195 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3196 gcc_assert (init_a <= init_b
3197 && init_a <= init_prev
3198 && init_prev <= init_b);
3200 /* Do not place the same access in the interleaving chain twice. */
3201 if (init_b == init_prev)
3203 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3204 < gimple_uid (DR_STMT (drb)));
3205 /* Simply link in duplicates and fix up the chain below. */
3207 else
3209 /* If init_b == init_a + the size of the type * k, we have an
3210 interleaving, and DRA is accessed before DRB. */
3211 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3212 if (type_size_a == 0
3213 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3214 % type_size_a != 0))
3215 break;
3217 /* If we have a store, the accesses are adjacent. This splits
3218 groups into chunks we support (we don't support vectorization
3219 of stores with gaps). */
3220 if (!DR_IS_READ (dra)
3221 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3222 != type_size_a))
3223 break;
3225 /* If the step (if it is constant and nonzero) is smaller than the
3226 difference between data-refs' inits this splits groups into
3227 suitable sizes. */
3228 if (tree_fits_shwi_p (DR_STEP (dra)))
3230 unsigned HOST_WIDE_INT step
3231 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3232 if (step != 0
3233 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3234 break;
3238 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location,
3240 DR_IS_READ (dra)
3241 ? "Detected interleaving load %T and %T\n"
3242 : "Detected interleaving store %T and %T\n",
3243 DR_REF (dra), DR_REF (drb));
3245 /* Link the found element into the group list. */
3246 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3248 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3249 lastinfo = stmtinfo_a;
3251 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3252 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3253 lastinfo = stmtinfo_b;
3255 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3256 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3258 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3259 dump_printf_loc (MSG_NOTE, vect_location,
3260 "Load suitable for SLP vectorization only.\n");
3262 if (init_b == init_prev
3263 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3264 && dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location,
3266 "Queuing group with duplicate access for fixup\n");
3270 /* Fixup groups with duplicate entries by splitting it. */
3271 while (1)
3273 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3274 if (!(it != to_fixup.end ()))
3275 break;
3276 stmt_vec_info grp = *it;
3277 to_fixup.remove (grp);
3279 /* Find the earliest duplicate group member. */
3280 unsigned first_duplicate = -1u;
3281 stmt_vec_info next, g = grp;
3282 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3284 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3285 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3286 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3287 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3288 g = next;
3290 if (first_duplicate == -1U)
3291 continue;
3293 /* Then move all stmts after the first duplicate to a new group.
3294 Note this is a heuristic but one with the property that *it
3295 is fixed up completely. */
3296 g = grp;
3297 stmt_vec_info newgroup = NULL, ng = grp;
3298 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3300 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3302 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3303 if (!newgroup)
3304 newgroup = next;
3305 else
3306 DR_GROUP_NEXT_ELEMENT (ng) = next;
3307 ng = next;
3308 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3310 else
3311 g = DR_GROUP_NEXT_ELEMENT (g);
3313 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3315 /* Fixup the new group which still may contain duplicates. */
3316 to_fixup.add (newgroup);
3319 dr_vec_info *dr_info;
3320 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3322 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3323 && !vect_analyze_data_ref_access (vinfo, dr_info))
3325 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3327 "not vectorized: complicated access pattern.\n");
3329 if (is_a <bb_vec_info> (vinfo))
3331 /* Mark the statement as not vectorizable. */
3332 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3333 continue;
3335 else
3337 datarefs_copy.release ();
3338 return opt_result::failure_at (dr_info->stmt->stmt,
3339 "not vectorized:"
3340 " complicated access pattern.\n");
3345 datarefs_copy.release ();
3346 return opt_result::success ();
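/* An illustration of the chain building above, with made-up accesses: two
   stores a[2*i] = x and a[2*i+1] = y of a 4-byte type share
   DR_BASE_ADDRESS, DR_OFFSET and DR_STEP (8 bytes) and have DR_INITs 0 and
   4.  The init difference is a multiple of the element size and, the
   accesses being stores, they are adjacent, so the second store is linked
   after the first and DR_GROUP_FIRST_ELEMENT of both becomes the first
   store; vect_analyze_data_ref_access later records a group of size 2.  */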
3349 /* Function vect_vfa_segment_size.
3351 Input:
3352 DR_INFO: The data reference.
3353 LENGTH_FACTOR: segment length to consider.
3355 Return a value suitable for the dr_with_seg_len::seg_len field.
3356 This is the "distance travelled" by the pointer from the first
3357 iteration in the segment to the last. Note that it does not include
3358 the size of the access; in effect it only describes the first byte. */
3360 static tree
3361 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3363 length_factor = size_binop (MINUS_EXPR,
3364 fold_convert (sizetype, length_factor),
3365 size_one_node);
3366 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3367 length_factor);
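/* E.g. with a DR_STEP of 8 bytes and a LENGTH_FACTOR of 4 iterations this
   returns 8 * (4 - 1) = 24: the pointer travels 24 bytes from the first to
   the last iteration of the segment, and the bytes touched by the accesses
   themselves are added separately via vect_vfa_access_size.  */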
3370 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3371 gives the worst-case number of bytes covered by the segment. */
3373 static unsigned HOST_WIDE_INT
3374 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3376 stmt_vec_info stmt_vinfo = dr_info->stmt;
3377 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3378 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3379 unsigned HOST_WIDE_INT access_size = ref_size;
3380 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3382 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3383 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3385 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3386 int misalignment;
3387 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3388 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3389 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3390 == dr_explicit_realign_optimized))
3392 /* We might access a full vector's worth. */
3393 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3395 return access_size;
3398 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3399 describes. */
3401 static unsigned int
3402 vect_vfa_align (dr_vec_info *dr_info)
3404 return dr_alignment (dr_info->dr);
3407 /* Function vect_compile_time_alias.
3409 Given data references A and B with equal base and offset, see whether
3410 the alias relation can be decided at compilation time. Return 1 if
3411 it can and the references alias, 0 if it can and the references do
3412 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3413 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3414 of dr_with_seg_len::{seg_len,access_size} for A and B. */
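/* For example, with positive 4-byte steps, segment lengths of 12, access
   sizes of 4 and DR_INITs of 0 and 8, the covered ranges are [0, 16) and
   [8, 24), which are known to overlap, so 1 is returned; with DR_INITs of
   0 and 32 the ranges cannot overlap and 0 is returned.  */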
3416 static int
3417 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3418 tree segment_length_a, tree segment_length_b,
3419 unsigned HOST_WIDE_INT access_size_a,
3420 unsigned HOST_WIDE_INT access_size_b)
3422 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3423 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3424 poly_uint64 const_length_a;
3425 poly_uint64 const_length_b;
3427 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3428 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3429 [a, a+12) */
3430 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3432 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3433 offset_a -= const_length_a;
3435 else
3436 const_length_a = tree_to_poly_uint64 (segment_length_a);
3437 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3439 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3440 offset_b -= const_length_b;
3442 else
3443 const_length_b = tree_to_poly_uint64 (segment_length_b);
3445 const_length_a += access_size_a;
3446 const_length_b += access_size_b;
3448 if (ranges_known_overlap_p (offset_a, const_length_a,
3449 offset_b, const_length_b))
3450 return 1;
3452 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3453 offset_b, const_length_b))
3454 return 0;
3456 return -1;
3459 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3460 in DDR is >= VF. */
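/* For example, a single distance vector with distance 8 at LOOP_DEPTH and
   a VF of at most 8 means the dependence cannot be violated by
   vectorization, so true is returned; with a larger VF the pair is
   typically left to a runtime alias check instead.  */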
3462 static bool
3463 dependence_distance_ge_vf (data_dependence_relation *ddr,
3464 unsigned int loop_depth, poly_uint64 vf)
3466 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3467 || DDR_NUM_DIST_VECTS (ddr) == 0)
3468 return false;
3470 /* If the dependence is exact, we should have limited the VF instead. */
3471 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3473 unsigned int i;
3474 lambda_vector dist_v;
3475 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3477 HOST_WIDE_INT dist = dist_v[loop_depth];
3478 if (dist != 0
3479 && !(dist > 0 && DDR_REVERSED_P (ddr))
3480 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3481 return false;
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE, vect_location,
3486 "dependence distance between %T and %T is >= VF\n",
3487 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3489 return true;
3492 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3494 static void
3495 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3497 dump_printf (dump_kind, "%s (%T) >= ",
3498 lower_bound.unsigned_p ? "unsigned" : "abs",
3499 lower_bound.expr);
3500 dump_dec (dump_kind, lower_bound.min_value);
3503 /* Record that the vectorized loop requires the vec_lower_bound described
3504 by EXPR, UNSIGNED_P and MIN_VALUE. */
3506 static void
3507 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3508 poly_uint64 min_value)
3510 vec<vec_lower_bound> &lower_bounds
3511 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3512 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3513 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3515 unsigned_p &= lower_bounds[i].unsigned_p;
3516 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3517 if (lower_bounds[i].unsigned_p != unsigned_p
3518 || maybe_lt (lower_bounds[i].min_value, min_value))
3520 lower_bounds[i].unsigned_p = unsigned_p;
3521 lower_bounds[i].min_value = min_value;
3522 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "updating run-time check to ");
3526 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3527 dump_printf (MSG_NOTE, "\n");
3530 return;
3533 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3534 if (dump_enabled_p ())
3536 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3537 dump_lower_bound (MSG_NOTE, lower_bound);
3538 dump_printf (MSG_NOTE, "\n");
3540 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3543 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3544 will span fewer than GAP bytes. */
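/* For example, with an estimated VF of 4, a group size of 2 and 4-byte
   scalar accesses, the vectorized step is expected to cover up to
   4 * 2 * 4 = 32 bytes, so a GAP of 24 counts as small whereas a GAP of
   64 does not.  */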
3546 static bool
3547 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3548 poly_int64 gap)
3550 stmt_vec_info stmt_info = dr_info->stmt;
3551 HOST_WIDE_INT count
3552 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3553 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3554 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3555 return (estimated_poly_value (gap)
3556 <= count * vect_get_scalar_dr_size (dr_info));
3559 /* Return true if we know that there is no alias between DR_INFO_A and
3560 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3561 When returning true, set *LOWER_BOUND_OUT to this N. */
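/* For example, two 4-byte accesses with the same base, offset and step and
   DR_INITs of 0 and 8 give *LOWER_BOUND_OUT = 8 + 4 - 0 = 12, i.e. the
   references cannot alias as long as abs (DR_STEP) >= 12.  */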
3563 static bool
3564 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3565 poly_uint64 *lower_bound_out)
3567 /* Check that there is a constant gap of known sign between DR_A
3568 and DR_B. */
3569 data_reference *dr_a = dr_info_a->dr;
3570 data_reference *dr_b = dr_info_b->dr;
3571 poly_int64 init_a, init_b;
3572 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3573 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3574 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3575 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3576 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3577 || !ordered_p (init_a, init_b))
3578 return false;
3580 /* Sort DR_A and DR_B by the address they access. */
3581 if (maybe_lt (init_b, init_a))
3583 std::swap (init_a, init_b);
3584 std::swap (dr_info_a, dr_info_b);
3585 std::swap (dr_a, dr_b);
3588 /* If the two accesses could be dependent within a scalar iteration,
3589 make sure that we'd retain their order. */
3590 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3591 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3592 return false;
3594 /* There is no alias if abs (DR_STEP) is greater than or equal to
3595 the bytes spanned by the combination of the two accesses. */
3596 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3597 return true;
3600 /* Function vect_prune_runtime_alias_test_list.
3602 Prune a list of ddrs to be tested at run-time by versioning for alias.
3603 Merge several alias checks into one if possible.
3604 Return FALSE if the resulting list of ddrs is longer than allowed by
3605 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
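/* For example, two references with identical constant steps get segment
   lengths of DR_STEP * (VF - 1) each, references with different steps fall
   back to the scalar iteration count as the length factor, and if any
   check survives pruning under the very-cheap cost model, vectorization is
   abandoned.  */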
3607 opt_result
3608 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3610 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3611 hash_set <tree_pair_hash> compared_objects;
3613 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3614 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3615 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3616 const vec<vec_object_pair> &check_unequal_addrs
3617 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3618 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3619 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3621 ddr_p ddr;
3622 unsigned int i;
3623 tree length_factor;
3625 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3627 /* Step values are irrelevant for aliasing if the number of vector
3628 iterations is equal to the number of scalar iterations (which can
3629 happen for fully-SLP loops). */
3630 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3632 if (!vf_one_p)
3634 /* Convert the checks for nonzero steps into bound tests. */
3635 tree value;
3636 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3637 vect_check_lower_bound (loop_vinfo, value, true, 1);
3640 if (may_alias_ddrs.is_empty ())
3641 return opt_result::success ();
3643 comp_alias_ddrs.create (may_alias_ddrs.length ());
3645 unsigned int loop_depth
3646 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3647 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3649 /* First, we collect all data ref pairs for aliasing checks. */
3650 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3652 poly_uint64 lower_bound;
3653 tree segment_length_a, segment_length_b;
3654 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3655 unsigned int align_a, align_b;
3657 /* Ignore the alias if the VF we chose ended up being no greater
3658 than the dependence distance. */
3659 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3660 continue;
3662 if (DDR_OBJECT_A (ddr))
3664 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3665 if (!compared_objects.add (new_pair))
3667 if (dump_enabled_p ())
3668 dump_printf_loc (MSG_NOTE, vect_location,
3669 "checking that %T and %T"
3670 " have different addresses\n",
3671 new_pair.first, new_pair.second);
3672 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3674 continue;
3677 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3678 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3680 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3681 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3683 bool preserves_scalar_order_p
3684 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3685 bool ignore_step_p
3686 = (vf_one_p
3687 && (preserves_scalar_order_p
3688 || operand_equal_p (DR_STEP (dr_info_a->dr),
3689 DR_STEP (dr_info_b->dr))));
3691 /* Skip the pair if inter-iteration dependencies are irrelevant
3692 and intra-iteration dependencies are guaranteed to be honored. */
3693 if (ignore_step_p
3694 && (preserves_scalar_order_p
3695 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3696 &lower_bound)))
3698 if (dump_enabled_p ())
3699 dump_printf_loc (MSG_NOTE, vect_location,
3700 "no need for alias check between "
3701 "%T and %T when VF is 1\n",
3702 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3703 continue;
3706 /* See whether we can handle the alias using a bounds check on
3707 the step, and whether that's likely to be the best approach.
3708 (It might not be, for example, if the minimum step is much larger
3709 than the number of bytes handled by one vector iteration.) */
3710 if (!ignore_step_p
3711 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3712 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3713 &lower_bound)
3714 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3715 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3717 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3718 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3721 "%T and %T when the step %T is outside ",
3722 DR_REF (dr_info_a->dr),
3723 DR_REF (dr_info_b->dr),
3724 DR_STEP (dr_info_a->dr));
3725 if (unsigned_p)
3726 dump_printf (MSG_NOTE, "[0");
3727 else
3729 dump_printf (MSG_NOTE, "(");
3730 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3732 dump_printf (MSG_NOTE, ", ");
3733 dump_dec (MSG_NOTE, lower_bound);
3734 dump_printf (MSG_NOTE, ")\n");
3736 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3737 unsigned_p, lower_bound);
3738 continue;
3741 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3742 if (dr_group_first_a)
3744 stmt_info_a = dr_group_first_a;
3745 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3748 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3749 if (dr_group_first_b)
3751 stmt_info_b = dr_group_first_b;
3752 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3755 if (ignore_step_p)
3757 segment_length_a = size_zero_node;
3758 segment_length_b = size_zero_node;
3760 else
3762 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3763 DR_STEP (dr_info_b->dr), 0))
3764 length_factor = scalar_loop_iters;
3765 else
3766 length_factor = size_int (vect_factor);
3767 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3768 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3770 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3771 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3772 align_a = vect_vfa_align (dr_info_a);
3773 align_b = vect_vfa_align (dr_info_b);
3775 /* See whether the alias is known at compilation time. */
3776 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3777 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3778 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3779 DR_OFFSET (dr_info_b->dr), 0)
3780 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3781 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3782 && poly_int_tree_p (segment_length_a)
3783 && poly_int_tree_p (segment_length_b))
3785 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3786 segment_length_a,
3787 segment_length_b,
3788 access_size_a,
3789 access_size_b);
3790 if (res >= 0 && dump_enabled_p ())
3792 dump_printf_loc (MSG_NOTE, vect_location,
3793 "can tell at compile time that %T and %T",
3794 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3795 if (res == 0)
3796 dump_printf (MSG_NOTE, " do not alias\n");
3797 else
3798 dump_printf (MSG_NOTE, " alias\n");
3801 if (res == 0)
3802 continue;
3804 if (res == 1)
3805 return opt_result::failure_at (stmt_info_b->stmt,
3806 "not vectorized:"
3807 " compilation time alias: %G%G",
3808 stmt_info_a->stmt,
3809 stmt_info_b->stmt);
3812 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3813 access_size_a, align_a);
3814 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3815 access_size_b, align_b);
3816 /* Canonicalize the order to be the one that's needed for accurate
3817 RAW, WAR and WAW flags, in cases where the data references are
3818 well-ordered. The order doesn't really matter otherwise,
3819 but we might as well be consistent. */
3820 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3821 std::swap (dr_a, dr_b);
3823 dr_with_seg_len_pair_t dr_with_seg_len_pair
3824 (dr_a, dr_b, (preserves_scalar_order_p
3825 ? dr_with_seg_len_pair_t::WELL_ORDERED
3826 : dr_with_seg_len_pair_t::REORDERED));
3828 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3831 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3833 unsigned int count = (comp_alias_ddrs.length ()
3834 + check_unequal_addrs.length ());
3836 if (count
3837 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3838 == VECT_COST_MODEL_VERY_CHEAP))
3839 return opt_result::failure_at
3840 (vect_location, "would need a runtime alias check\n");
3842 if (dump_enabled_p ())
3843 dump_printf_loc (MSG_NOTE, vect_location,
3844 "improved number of alias checks from %d to %d\n",
3845 may_alias_ddrs.length (), count);
3846 unsigned limit = param_vect_max_version_for_alias_checks;
3847 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3848 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3849 if (count > limit)
3850 return opt_result::failure_at
3851 (vect_location,
3852 "number of versioning for alias run-time tests exceeds %d "
3853 "(--param vect-max-version-for-alias-checks)\n", limit);
3855 return opt_result::success ();
3858 /* Check whether we can use an internal function for a gather load
3859 or scatter store. READ_P is true for loads and false for stores.
3860 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3861 the type of the memory elements being loaded or stored. OFFSET_TYPE
3862 is the type of the offset that is being applied to the invariant
3863 base address. SCALE is the amount by which the offset should
3864 be multiplied *after* it has been converted to address width.
3866 Return true if the function is supported, storing the function id in
3867 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
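/* For example, if OFFSET_TYPE is a 32-bit integer but the target only
   supports gathers with 64-bit offsets, the retry loop below doubles the
   offset precision and tries again, giving up once the precision has
   reached both POINTER_SIZE and the element width.  */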
3869 bool
3870 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3871 tree vectype, tree memory_type, tree offset_type,
3872 int scale, internal_fn *ifn_out,
3873 tree *offset_vectype_out)
3875 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3876 unsigned int element_bits = vector_element_bits (vectype);
3877 if (element_bits != memory_bits)
3878 /* For now the vector elements must be the same width as the
3879 memory elements. */
3880 return false;
3882 /* Work out which function we need. */
3883 internal_fn ifn, alt_ifn, alt_ifn2;
3884 if (read_p)
3886 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3887 alt_ifn = IFN_MASK_GATHER_LOAD;
3888 /* When the target supports MASK_LEN_GATHER_LOAD, we always
3889 use MASK_LEN_GATHER_LOAD regardless of whether the len and
3890 mask operands are valid or not. */
3891 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
3893 else
3895 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3896 alt_ifn = IFN_MASK_SCATTER_STORE;
3897 /* When the target supports MASK_LEN_SCATTER_STORE, we always
3898 use MASK_LEN_SCATTER_STORE regardless of whether the len and
3899 mask operands are valid or not. */
3900 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
3903 for (;;)
3905 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3906 if (!offset_vectype)
3907 return false;
3909 /* Test whether the target supports this combination. */
3910 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3911 offset_vectype, scale))
3913 *ifn_out = ifn;
3914 *offset_vectype_out = offset_vectype;
3915 return true;
3917 else if (!masked_p
3918 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3919 memory_type,
3920 offset_vectype,
3921 scale))
3923 *ifn_out = alt_ifn;
3924 *offset_vectype_out = offset_vectype;
3925 return true;
3927 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
3928 memory_type,
3929 offset_vectype, scale))
3931 *ifn_out = alt_ifn2;
3932 *offset_vectype_out = offset_vectype;
3933 return true;
3936 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3937 && TYPE_PRECISION (offset_type) >= element_bits)
3938 return false;
3940 offset_type = build_nonstandard_integer_type
3941 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3945 /* STMT_INFO is a call to an internal gather load or scatter store function.
3946 Describe the operation in INFO. */
3948 static void
3949 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3950 gather_scatter_info *info)
3952 gcall *call = as_a <gcall *> (stmt_info->stmt);
3953 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3954 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3956 info->ifn = gimple_call_internal_fn (call);
3957 info->decl = NULL_TREE;
3958 info->base = gimple_call_arg (call, 0);
3959 info->offset = gimple_call_arg (call, 1);
3960 info->offset_dt = vect_unknown_def_type;
3961 info->offset_vectype = NULL_TREE;
3962 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3963 info->element_type = TREE_TYPE (vectype);
3964 info->memory_type = TREE_TYPE (DR_REF (dr));
3967 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3968 gather load or scatter store. Describe the operation in *INFO if so. */
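/* For example, a read such as a[b[i]] with 4-byte elements typically
   yields BASE = &a, OFFSET = the SSA name holding b[i] and SCALE = 4,
   assuming the target supports that offset type and scale.  */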
3970 bool
3971 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3972 gather_scatter_info *info)
3974 HOST_WIDE_INT scale = 1;
3975 poly_int64 pbitpos, pbitsize;
3976 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3977 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3978 tree offtype = NULL_TREE;
3979 tree decl = NULL_TREE, base, off;
3980 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3981 tree memory_type = TREE_TYPE (DR_REF (dr));
3982 machine_mode pmode;
3983 int punsignedp, reversep, pvolatilep = 0;
3984 internal_fn ifn;
3985 tree offset_vectype;
3986 bool masked_p = false;
3988 /* See whether this is already a call to a gather/scatter internal function.
3989 If not, see whether it's a masked load or store. */
3990 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3991 if (call && gimple_call_internal_p (call))
3993 ifn = gimple_call_internal_fn (call);
3994 if (internal_gather_scatter_fn_p (ifn))
3996 vect_describe_gather_scatter_call (stmt_info, info);
3997 return true;
3999 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4002 /* True if we should aim to use internal functions rather than
4003 built-in functions. */
4004 bool use_ifn_p = (DR_IS_READ (dr)
4005 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4006 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4008 base = DR_REF (dr);
4009 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4010 see if we can use the def stmt of the address. */
4011 if (masked_p
4012 && TREE_CODE (base) == MEM_REF
4013 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4014 && integer_zerop (TREE_OPERAND (base, 1))
4015 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4017 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4018 if (is_gimple_assign (def_stmt)
4019 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4020 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4023 /* The gather and scatter builtins need an address of the form
4024 loop_invariant + vector * {1, 2, 4, 8}
4025 or
4026 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4027 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4028 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4029 multiplications and additions in it. To get a vector, we need
4030 a single SSA_NAME that will be defined in the loop and will
4031 contain everything that is not loop invariant and that can be
4032 vectorized. The following code attempts to find such a preexisting
4033 SSA_NAME OFF and put the loop invariants into a tree BASE
4034 that can be gimplified before the loop. */
4035 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4036 &punsignedp, &reversep, &pvolatilep);
4037 if (reversep)
4038 return false;
4040 /* PR 107346. Packed structs can have fields at offsets that are not
4041 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4042 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4043 return false;
4045 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4047 if (TREE_CODE (base) == MEM_REF)
4049 if (!integer_zerop (TREE_OPERAND (base, 1)))
4051 if (off == NULL_TREE)
4052 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4053 else
4054 off = size_binop (PLUS_EXPR, off,
4055 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4057 base = TREE_OPERAND (base, 0);
4059 else
4060 base = build_fold_addr_expr (base);
4062 if (off == NULL_TREE)
4063 off = size_zero_node;
4065 /* If BASE is not loop invariant, then either OFF is 0 and we start with
4066 just the constant offset in the loop-invariant BASE and continue with
4067 BASE as OFF, or we give up.
4068 We could handle the latter case by gimplifying the addition of base + off
4069 into some SSA_NAME and using that as off, but for now punt. */
4070 if (!expr_invariant_in_loop_p (loop, base))
4072 if (!integer_zerop (off))
4073 return false;
4074 off = base;
4075 base = size_int (pbytepos);
4077 /* Otherwise put base + constant offset into the loop invariant BASE
4078 and continue with OFF. */
4079 else
4081 base = fold_convert (sizetype, base);
4082 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4085 /* OFF at this point may be either a SSA_NAME or some tree expression
4086 from get_inner_reference. Try to peel off loop invariants from it
4087 into BASE as long as possible. */
4088 STRIP_NOPS (off);
4089 while (offtype == NULL_TREE)
4091 enum tree_code code;
4092 tree op0, op1, add = NULL_TREE;
4094 if (TREE_CODE (off) == SSA_NAME)
4096 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4098 if (expr_invariant_in_loop_p (loop, off))
4099 return false;
4101 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4102 break;
4104 op0 = gimple_assign_rhs1 (def_stmt);
4105 code = gimple_assign_rhs_code (def_stmt);
4106 op1 = gimple_assign_rhs2 (def_stmt);
4108 else
4110 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4111 return false;
4112 code = TREE_CODE (off);
4113 extract_ops_from_tree (off, &code, &op0, &op1);
4115 switch (code)
4117 case POINTER_PLUS_EXPR:
4118 case PLUS_EXPR:
4119 if (expr_invariant_in_loop_p (loop, op0))
4121 add = op0;
4122 off = op1;
4123 do_add:
4124 add = fold_convert (sizetype, add);
4125 if (scale != 1)
4126 add = size_binop (MULT_EXPR, add, size_int (scale));
4127 base = size_binop (PLUS_EXPR, base, add);
4128 continue;
4130 if (expr_invariant_in_loop_p (loop, op1))
4132 add = op1;
4133 off = op0;
4134 goto do_add;
4136 break;
4137 case MINUS_EXPR:
4138 if (expr_invariant_in_loop_p (loop, op1))
4140 add = fold_convert (sizetype, op1);
4141 add = size_binop (MINUS_EXPR, size_zero_node, add);
4142 off = op0;
4143 goto do_add;
4145 break;
4146 case MULT_EXPR:
4147 if (scale == 1 && tree_fits_shwi_p (op1))
4149 int new_scale = tree_to_shwi (op1);
4150 /* Only treat this as a scaling operation if the target
4151 supports it for at least some offset type. */
4152 if (use_ifn_p
4153 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4154 masked_p, vectype, memory_type,
4155 signed_char_type_node,
4156 new_scale, &ifn,
4157 &offset_vectype)
4158 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4159 masked_p, vectype, memory_type,
4160 unsigned_char_type_node,
4161 new_scale, &ifn,
4162 &offset_vectype))
4163 break;
4164 scale = new_scale;
4165 off = op0;
4166 continue;
4168 break;
4169 case SSA_NAME:
4170 off = op0;
4171 continue;
4172 CASE_CONVERT:
4173 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4174 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4175 break;
4177 /* Don't include the conversion if the target is happy with
4178 the current offset type. */
4179 if (use_ifn_p
4180 && TREE_CODE (off) == SSA_NAME
4181 && !POINTER_TYPE_P (TREE_TYPE (off))
4182 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4183 masked_p, vectype, memory_type,
4184 TREE_TYPE (off), scale, &ifn,
4185 &offset_vectype))
4186 break;
4188 if (TYPE_PRECISION (TREE_TYPE (op0))
4189 == TYPE_PRECISION (TREE_TYPE (off)))
4191 off = op0;
4192 continue;
4195 /* Include the conversion if it is widening and either we're using
4196 the IFN path, the target can handle the converted-from
4197 offset type, or the current size is not already the same as the
4198 data vector element size. */
4199 if ((TYPE_PRECISION (TREE_TYPE (op0))
4200 < TYPE_PRECISION (TREE_TYPE (off)))
4201 && (use_ifn_p
4202 || (DR_IS_READ (dr)
4203 ? (targetm.vectorize.builtin_gather
4204 && targetm.vectorize.builtin_gather (vectype,
4205 TREE_TYPE (op0),
4206 scale))
4207 : (targetm.vectorize.builtin_scatter
4208 && targetm.vectorize.builtin_scatter (vectype,
4209 TREE_TYPE (op0),
4210 scale)))
4211 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4212 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4214 off = op0;
4215 offtype = TREE_TYPE (off);
4216 STRIP_NOPS (off);
4217 continue;
4219 break;
4220 default:
4221 break;
4223 break;
4226 /* If at the end OFF still isn't a SSA_NAME or isn't
4227 defined in the loop, punt. */
4228 if (TREE_CODE (off) != SSA_NAME
4229 || expr_invariant_in_loop_p (loop, off))
4230 return false;
4232 if (offtype == NULL_TREE)
4233 offtype = TREE_TYPE (off);
4235 if (use_ifn_p)
4237 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4238 vectype, memory_type, offtype, scale,
4239 &ifn, &offset_vectype))
4240 ifn = IFN_LAST;
4241 decl = NULL_TREE;
4243 else
4245 if (DR_IS_READ (dr))
4247 if (targetm.vectorize.builtin_gather)
4248 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4250 else
4252 if (targetm.vectorize.builtin_scatter)
4253 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4255 ifn = IFN_LAST;
4256 /* The offset vector type will be read from DECL when needed. */
4257 offset_vectype = NULL_TREE;
4260 info->ifn = ifn;
4261 info->decl = decl;
4262 info->base = base;
4263 info->offset = off;
4264 info->offset_dt = vect_unknown_def_type;
4265 info->offset_vectype = offset_vectype;
4266 info->scale = scale;
4267 info->element_type = TREE_TYPE (vectype);
4268 info->memory_type = memory_type;
4269 return true;
4272 /* Find the data references in STMT, analyze them with respect to LOOP and
4273 append them to DATAREFS. Return false if datarefs in this stmt cannot
4274 be handled. */
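/* For example, in an OpenMP simd loop an access to a privatized array
   indexed via GOMP_SIMD_LANE may fail normal data-ref analysis; the code
   below rebuilds such a reference with a step equal to the element size
   and marks it as a simd-lane access instead of rejecting the
   statement.  */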
4276 opt_result
4277 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4278 vec<data_reference_p> *datarefs,
4279 vec<int> *dataref_groups, int group_id)
4281 /* We can ignore clobbers for dataref analysis - they are removed during
4282 loop vectorization and BB vectorization checks dependences with a
4283 stmt walk. */
4284 if (gimple_clobber_p (stmt))
4285 return opt_result::success ();
4287 if (gimple_has_volatile_ops (stmt))
4288 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4289 stmt);
4291 if (stmt_can_throw_internal (cfun, stmt))
4292 return opt_result::failure_at (stmt,
4293 "not vectorized:"
4294 " statement can throw an exception: %G",
4295 stmt);
4297 auto_vec<data_reference_p, 2> refs;
4298 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4299 if (!res)
4300 return res;
4302 if (refs.is_empty ())
4303 return opt_result::success ();
4305 if (refs.length () > 1)
4307 while (!refs.is_empty ())
4308 free_data_ref (refs.pop ());
4309 return opt_result::failure_at (stmt,
4310 "not vectorized: more than one "
4311 "data ref in stmt: %G", stmt);
4314 data_reference_p dr = refs.pop ();
4315 if (gcall *call = dyn_cast <gcall *> (stmt))
4316 if (!gimple_call_internal_p (call)
4317 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4318 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4320 free_data_ref (dr);
4321 return opt_result::failure_at (stmt,
4322 "not vectorized: dr in a call %G", stmt);
4325 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4326 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4328 free_data_ref (dr);
4329 return opt_result::failure_at (stmt,
4330 "not vectorized:"
4331 " statement is an unsupported"
4332 " bitfield access %G", stmt);
4335 if (DR_BASE_ADDRESS (dr)
4336 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4338 free_data_ref (dr);
4339 return opt_result::failure_at (stmt,
4340 "not vectorized:"
4341 " base addr of dr is a constant\n");
4344 /* Check whether this may be a SIMD lane access and adjust the
4345 DR to make it easier for us to handle it. */
4346 if (loop
4347 && loop->simduid
4348 && (!DR_BASE_ADDRESS (dr)
4349 || !DR_OFFSET (dr)
4350 || !DR_INIT (dr)
4351 || !DR_STEP (dr)))
4353 struct data_reference *newdr
4354 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4355 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4356 if (DR_BASE_ADDRESS (newdr)
4357 && DR_OFFSET (newdr)
4358 && DR_INIT (newdr)
4359 && DR_STEP (newdr)
4360 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4361 && integer_zerop (DR_STEP (newdr)))
4363 tree base_address = DR_BASE_ADDRESS (newdr);
4364 tree off = DR_OFFSET (newdr);
4365 tree step = ssize_int (1);
4366 if (integer_zerop (off)
4367 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4369 off = TREE_OPERAND (base_address, 1);
4370 base_address = TREE_OPERAND (base_address, 0);
4372 STRIP_NOPS (off);
4373 if (TREE_CODE (off) == MULT_EXPR
4374 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4376 step = TREE_OPERAND (off, 1);
4377 off = TREE_OPERAND (off, 0);
4378 STRIP_NOPS (off);
4380 if (CONVERT_EXPR_P (off)
4381 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4382 < TYPE_PRECISION (TREE_TYPE (off))))
4383 off = TREE_OPERAND (off, 0);
4384 if (TREE_CODE (off) == SSA_NAME)
4386 gimple *def = SSA_NAME_DEF_STMT (off);
4387 /* Look through widening conversion. */
4388 if (is_gimple_assign (def)
4389 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4391 tree rhs1 = gimple_assign_rhs1 (def);
4392 if (TREE_CODE (rhs1) == SSA_NAME
4393 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4394 && (TYPE_PRECISION (TREE_TYPE (off))
4395 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4396 def = SSA_NAME_DEF_STMT (rhs1);
4398 if (is_gimple_call (def)
4399 && gimple_call_internal_p (def)
4400 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4402 tree arg = gimple_call_arg (def, 0);
4403 tree reft = TREE_TYPE (DR_REF (newdr));
4404 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4405 arg = SSA_NAME_VAR (arg);
4406 if (arg == loop->simduid
4407 /* For now. */
4408 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4410 DR_BASE_ADDRESS (newdr) = base_address;
4411 DR_OFFSET (newdr) = ssize_int (0);
4412 DR_STEP (newdr) = step;
4413 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4414 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4415 /* Mark as simd-lane access. */
4416 tree arg2 = gimple_call_arg (def, 1);
4417 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4418 free_data_ref (dr);
4419 datarefs->safe_push (newdr);
4420 if (dataref_groups)
4421 dataref_groups->safe_push (group_id);
4422 return opt_result::success ();
4427 free_data_ref (newdr);
4430 datarefs->safe_push (dr);
4431 if (dataref_groups)
4432 dataref_groups->safe_push (group_id);
4433 return opt_result::success ();
4436 /* Function vect_analyze_data_refs.
4438 Find all the data references in the loop or basic block.
4440 The general structure of the analysis of data refs in the vectorizer is as
4441 follows:
4442 1- vect_analyze_data_refs(loop/bb): call
4443 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4444 in the loop/bb and their dependences.
4445 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4446 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4447 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4451 opt_result
4452 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4454 class loop *loop = NULL;
4455 unsigned int i;
4456 struct data_reference *dr;
4457 tree scalar_type;
4459 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4461 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4462 loop = LOOP_VINFO_LOOP (loop_vinfo);
4464 /* Go through the data-refs, check that the analysis succeeded. Update
4465 pointer from stmt_vec_info struct to DR and vectype. */
4467 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4468 FOR_EACH_VEC_ELT (datarefs, i, dr)
4470 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4471 poly_uint64 vf;
4473 gcc_assert (DR_REF (dr));
4474 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4475 gcc_assert (!stmt_info->dr_aux.dr);
4476 stmt_info->dr_aux.dr = dr;
4477 stmt_info->dr_aux.stmt = stmt_info;
4479 /* Check that analysis of the data-ref succeeded. */
4480 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4481 || !DR_STEP (dr))
4483 bool maybe_gather
4484 = DR_IS_READ (dr)
4485 && !TREE_THIS_VOLATILE (DR_REF (dr));
4486 bool maybe_scatter
4487 = DR_IS_WRITE (dr)
4488 && !TREE_THIS_VOLATILE (DR_REF (dr));
4490 /* If the target supports vector gather loads or scatter stores,
4491 see whether they can be used. */
4492 if (is_a <loop_vec_info> (vinfo)
4493 && !nested_in_vect_loop_p (loop, stmt_info))
4495 if (maybe_gather || maybe_scatter)
4497 if (maybe_gather)
4498 gatherscatter = GATHER;
4499 else
4500 gatherscatter = SCATTER;
4504 if (gatherscatter == SG_NONE)
4506 if (dump_enabled_p ())
4507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4508 "not vectorized: data ref analysis "
4509 "failed %G", stmt_info->stmt);
4510 if (is_a <bb_vec_info> (vinfo))
4512 /* In BB vectorization the ref can still participate
4513 in dependence analysis, we just can't vectorize it. */
4514 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4515 continue;
4517 return opt_result::failure_at (stmt_info->stmt,
4518 "not vectorized:"
4519 " data ref analysis failed: %G",
4520 stmt_info->stmt);
4524 /* See if this was detected as SIMD lane access. */
4525 if (dr->aux == (void *)-1
4526 || dr->aux == (void *)-2
4527 || dr->aux == (void *)-3
4528 || dr->aux == (void *)-4)
4530 if (nested_in_vect_loop_p (loop, stmt_info))
4531 return opt_result::failure_at (stmt_info->stmt,
4532 "not vectorized:"
4533 " data ref analysis failed: %G",
4534 stmt_info->stmt);
4535 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4536 = -(uintptr_t) dr->aux;
4539 tree base = get_base_address (DR_REF (dr));
4540 if (base && VAR_P (base) && DECL_NONALIASED (base))
4542 if (dump_enabled_p ())
4543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4544 "not vectorized: base object not addressable "
4545 "for stmt: %G", stmt_info->stmt);
4546 if (is_a <bb_vec_info> (vinfo))
4548 /* In BB vectorization the ref can still participate
4549 in dependence analysis, we just can't vectorize it. */
4550 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4551 continue;
4553 return opt_result::failure_at (stmt_info->stmt,
4554 "not vectorized: base object not"
4555 " addressable for stmt: %G",
4556 stmt_info->stmt);
4559 if (is_a <loop_vec_info> (vinfo)
4560 && DR_STEP (dr)
4561 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4563 if (nested_in_vect_loop_p (loop, stmt_info))
4564 return opt_result::failure_at (stmt_info->stmt,
4565 "not vectorized: "
4566 "not suitable for strided load %G",
4567 stmt_info->stmt);
4568 STMT_VINFO_STRIDED_P (stmt_info) = true;
4571 /* Update DR field in stmt_vec_info struct. */
4573 /* If the dataref is in an inner-loop of the loop that is considered
4574 for vectorization, we also want to analyze the access relative to
4575 the outer-loop (DR contains information only relative to the
4576 inner-most enclosing loop). We do that by building a reference to the
4577 first location accessed by the inner-loop, and analyze it relative to
4578 the outer-loop. */
4579 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4581 /* Build a reference to the first location accessed by the
4582 inner loop: *(BASE + INIT + OFFSET). By construction,
4583 this address must be invariant in the inner loop, so we
4584 can consider it as being used in the outer loop. */
4585 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4586 tree offset = unshare_expr (DR_OFFSET (dr));
4587 tree init = unshare_expr (DR_INIT (dr));
4588 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4589 init, offset);
4590 tree init_addr = fold_build_pointer_plus (base, init_offset);
4591 tree init_ref = build_fold_indirect_ref (init_addr);
4593 if (dump_enabled_p ())
4594 dump_printf_loc (MSG_NOTE, vect_location,
4595 "analyze in outer loop: %T\n", init_ref);
4597 opt_result res
4598 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4599 init_ref, loop, stmt_info->stmt);
4600 if (!res)
4601 /* dr_analyze_innermost already explained the failure. */
4602 return res;
4604 if (dump_enabled_p ())
4605 dump_printf_loc (MSG_NOTE, vect_location,
4606 "\touter base_address: %T\n"
4607 "\touter offset from base address: %T\n"
4608 "\touter constant offset from base address: %T\n"
4609 "\touter step: %T\n"
4610 "\touter base alignment: %d\n\n"
4611 "\touter base misalignment: %d\n"
4612 "\touter offset alignment: %d\n"
4613 "\touter step alignment: %d\n",
4614 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4615 STMT_VINFO_DR_OFFSET (stmt_info),
4616 STMT_VINFO_DR_INIT (stmt_info),
4617 STMT_VINFO_DR_STEP (stmt_info),
4618 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4619 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4620 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4621 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4624 /* Set vectype for STMT. */
4625 scalar_type = TREE_TYPE (DR_REF (dr));
4626 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4627 if (!vectype)
4629 if (dump_enabled_p ())
4631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4632 "not vectorized: no vectype for stmt: %G",
4633 stmt_info->stmt);
4634 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4635 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4636 scalar_type);
4637 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4640 if (is_a <bb_vec_info> (vinfo))
4642 /* No vector type is fine, the ref can still participate
4643 in dependence analysis, we just can't vectorize it. */
4644 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4645 continue;
4647 if (fatal)
4648 *fatal = false;
4649 return opt_result::failure_at (stmt_info->stmt,
4650 "not vectorized:"
4651 " no vectype for stmt: %G"
4652 " scalar_type: %T\n",
4653 stmt_info->stmt, scalar_type);
4655 else
4657 if (dump_enabled_p ())
4658 dump_printf_loc (MSG_NOTE, vect_location,
4659 "got vectype for stmt: %G%T\n",
4660 stmt_info->stmt, vectype);
4663 /* Adjust the minimal vectorization factor according to the
4664 vector type. */
4665 vf = TYPE_VECTOR_SUBPARTS (vectype);
4666 *min_vf = upper_bound (*min_vf, vf);
4668 /* Leave the BB vectorizer to pick the vector type later, based on
4669 the final dataref group size and SLP node size. */
4670 if (is_a <loop_vec_info> (vinfo))
4671 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4673 if (gatherscatter != SG_NONE)
4675 gather_scatter_info gs_info;
4676 if (!vect_check_gather_scatter (stmt_info,
4677 as_a <loop_vec_info> (vinfo),
4678 &gs_info)
4679 || !get_vectype_for_scalar_type (vinfo,
4680 TREE_TYPE (gs_info.offset)))
4682 if (fatal)
4683 *fatal = false;
4684 return opt_result::failure_at
4685 (stmt_info->stmt,
4686 (gatherscatter == GATHER)
4687 ? "not vectorized: not suitable for gather load %G"
4688 : "not vectorized: not suitable for scatter store %G",
4689 stmt_info->stmt);
4691 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4695 /* We used to stop processing and prune the list here. Verify we no
4696 longer need to. */
4697 gcc_assert (i == datarefs.length ());
4699 return opt_result::success ();
4703 /* Function vect_get_new_vect_var.
4705 Returns a name for a new variable. The current naming scheme prepends a
4706 prefix such as "vect_", "vectp_", "stmp_" or "mask_" (depending on the
4707 value of VAR_KIND) to NAME if NAME is provided, and uses the bare prefix
4708 otherwise. */
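/* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   creates a temporary named along the lines of "vectp_a", while passing a
   NULL name gives just the bare "vectp" prefix.  */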
4710 tree
4711 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4713 const char *prefix;
4714 tree new_vect_var;
4716 switch (var_kind)
4718 case vect_simple_var:
4719 prefix = "vect";
4720 break;
4721 case vect_scalar_var:
4722 prefix = "stmp";
4723 break;
4724 case vect_mask_var:
4725 prefix = "mask";
4726 break;
4727 case vect_pointer_var:
4728 prefix = "vectp";
4729 break;
4730 default:
4731 gcc_unreachable ();
4734 if (name)
4736 char* tmp = concat (prefix, "_", name, NULL);
4737 new_vect_var = create_tmp_reg (type, tmp);
4738 free (tmp);
4740 else
4741 new_vect_var = create_tmp_reg (type, prefix);
4743 return new_vect_var;
4746 /* Like vect_get_new_vect_var but return an SSA name. */
4748 tree
4749 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4751 const char *prefix;
4752 tree new_vect_var;
4754 switch (var_kind)
4756 case vect_simple_var:
4757 prefix = "vect";
4758 break;
4759 case vect_scalar_var:
4760 prefix = "stmp";
4761 break;
4762 case vect_pointer_var:
4763 prefix = "vectp";
4764 break;
4765 default:
4766 gcc_unreachable ();
4769 if (name)
4771 char* tmp = concat (prefix, "_", name, NULL);
4772 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4773 free (tmp);
4775 else
4776 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4778 return new_vect_var;
4781 /* Duplicate points-to info on NAME from DR_INFO. */
4783 static void
4784 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4786 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4787 /* DR_PTR_INFO is for a base SSA name, not including constant or
4788 variable offsets in the ref so its alignment info does not apply. */
4789 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4792 /* Function vect_create_addr_base_for_vector_ref.
4794 Create an expression that computes the address of the first memory location
4795 that will be accessed for a data reference.
4797 Input:
4798 STMT_INFO: The statement containing the data reference.
4799 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4800 OFFSET: Optional. If supplied, it is added to the initial address.
4801 LOOP: Specify relative to which loop-nest should the address be computed.
4802 For example, when the dataref is in an inner-loop nested in an
4803 outer-loop that is now being vectorized, LOOP can be either the
4804 outer-loop, or the inner-loop. The first memory location accessed
4805 by the following dataref ('in' points to short):
4807 for (i=0; i<N; i++)
4808 for (j=0; j<M; j++)
4809 s += in[i+j]
4811 is as follows:
4812 if LOOP=i_loop: &in (relative to i_loop)
4813 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4815 Output:
4816 1. Return an SSA_NAME whose value is the address of the memory location of
4817 the first vector of the data reference.
4818 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4819 these statement(s) which define the returned SSA_NAME.
4821 FORNOW: We are only handling array accesses with step 1. */
4823 tree
4824 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4825 gimple_seq *new_stmt_list,
4826 tree offset)
4828 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4829 struct data_reference *dr = dr_info->dr;
4830 const char *base_name;
4831 tree addr_base;
4832 tree dest;
4833 gimple_seq seq = NULL;
4834 tree vect_ptr_type;
4835 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4836 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4838 tree data_ref_base = unshare_expr (drb->base_address);
4839 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4840 tree init = unshare_expr (drb->init);
4842 if (loop_vinfo)
4843 base_name = get_name (data_ref_base);
4844 else
4846 base_offset = ssize_int (0);
4847 init = ssize_int (0);
4848 base_name = get_name (DR_REF (dr));
4851 /* Create base_offset */
4852 base_offset = size_binop (PLUS_EXPR,
4853 fold_convert (sizetype, base_offset),
4854 fold_convert (sizetype, init));
4856 if (offset)
4858 offset = fold_convert (sizetype, offset);
4859 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4860 base_offset, offset);
4863 /* base + base_offset */
4864 if (loop_vinfo)
4865 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4866 else
4867 addr_base = build1 (ADDR_EXPR,
4868 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4869 /* Strip zero offset components since we don't need
4870 them and they can confuse late diagnostics if
4871 we CSE them wrongly. See PR106904 for example. */
4872 unshare_expr (strip_zero_offset_components
4873 (DR_REF (dr))));
4875 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4876 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4877 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4878 gimple_seq_add_seq (new_stmt_list, seq);
4880 if (DR_PTR_INFO (dr)
4881 && TREE_CODE (addr_base) == SSA_NAME
4882 /* We should only duplicate pointer info to newly created SSA names. */
4883 && SSA_NAME_VAR (addr_base) == dest)
4885 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4886 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4889 if (dump_enabled_p ())
4890 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4892 return addr_base;
4896 /* Function vect_create_data_ref_ptr.
4898 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4899 location accessed in the loop by STMT_INFO, along with the def-use update
4900 chain to appropriately advance the pointer through the loop iterations.
4901 Also set aliasing information for the pointer. This pointer is used by
4902 the callers to this function to create a memory reference expression for
4903 vector load/store access.
4905 Input:
4906 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4907 GIMPLE_ASSIGN <name, data-ref> or
4908 GIMPLE_ASSIGN <data-ref, name>.
4909 2. AGGR_TYPE: the type of the reference, which should be either a vector
4910 or an array.
4911 3. AT_LOOP: the loop where the vector memref is to be created.
4912 4. OFFSET (optional): a byte offset to be added to the initial address
4913 accessed by the data-ref in STMT_INFO.
4914 5. BSI: location where the new stmts are to be placed if there is no loop
4915 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4916 pointing to the initial address.
4917 7. IV_STEP (optional, defaults to NULL): the amount that should be added
4918 to the IV during each iteration of the loop. NULL says to move
4919 by one copy of AGGR_TYPE up or down, depending on the step of the
4920 data reference.
4922 Output:
4923 1. Declare a new ptr to vector_type, and have it point to the base of the
4924 data reference (the initial address accessed by the data reference).
4925 For example, for vector of type V8HI, the following code is generated:
4927 v8hi *ap;
4928 ap = (v8hi *)initial_address;
4930 if OFFSET is not supplied:
4931 initial_address = &a[init];
4932 if OFFSET is supplied:
4933 initial_address = &a[init] + OFFSET;
4937 Return the initial_address in INITIAL_ADDRESS.
4939 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4940 update the pointer in each iteration of the loop.
4942 Return the increment stmt that updates the pointer in PTR_INCR.
4944 3. Return the pointer. */
4946 tree
4947 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4948 tree aggr_type, class loop *at_loop, tree offset,
4949 tree *initial_address, gimple_stmt_iterator *gsi,
4950 gimple **ptr_incr, bool only_init,
4951 tree iv_step)
4953 const char *base_name;
4954 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4955 class loop *loop = NULL;
4956 bool nested_in_vect_loop = false;
4957 class loop *containing_loop = NULL;
4958 tree aggr_ptr_type;
4959 tree aggr_ptr;
4960 tree new_temp;
4961 gimple_seq new_stmt_list = NULL;
4962 edge pe = NULL;
4963 basic_block new_bb;
4964 tree aggr_ptr_init;
4965 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4966 struct data_reference *dr = dr_info->dr;
4967 tree aptr;
4968 gimple_stmt_iterator incr_gsi;
4969 bool insert_after;
4970 tree indx_before_incr, indx_after_incr;
4971 gimple *incr;
4972 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4974 gcc_assert (iv_step != NULL_TREE
4975 || TREE_CODE (aggr_type) == ARRAY_TYPE
4976 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4978 if (loop_vinfo)
4980 loop = LOOP_VINFO_LOOP (loop_vinfo);
4981 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4982 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4983 pe = loop_preheader_edge (loop);
4985 else
4987 gcc_assert (bb_vinfo);
4988 only_init = true;
4989 *ptr_incr = NULL;
4992 /* Create an expression for the first address accessed by this load
4993 in LOOP. */
4994 base_name = get_name (DR_BASE_ADDRESS (dr));
4996 if (dump_enabled_p ())
4998 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4999 dump_printf_loc (MSG_NOTE, vect_location,
5000 "create %s-pointer variable to type: %T",
5001 get_tree_code_name (TREE_CODE (aggr_type)),
5002 aggr_type);
5003 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5004 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5005 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5006 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5007 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5008 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5009 else
5010 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5011 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5014 /* (1) Create the new aggregate-pointer variable.
5015 Vector and array types inherit the alias set of their component
5016 type by default so we need to use a ref-all pointer if the data
5017 reference does not conflict with the created aggregated data
5018 reference because it is not addressable. */
5019 bool need_ref_all = false;
5020 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5021 get_alias_set (DR_REF (dr))))
5022 need_ref_all = true;
5023 /* Likewise for any of the data references in the stmt group. */
5024 else if (DR_GROUP_SIZE (stmt_info) > 1)
5026 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5029 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5030 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5031 get_alias_set (DR_REF (sdr))))
5033 need_ref_all = true;
5034 break;
5036 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5038 while (sinfo);
5040 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5041 need_ref_all);
5042 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5045 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5046 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5047 def-use update cycles for the pointer: one relative to the outer-loop
5048 (LOOP), which is what steps (3) and (4) below do. The other is relative
5049 to the inner-loop (which is the inner-most loop containing the dataref),
5050 and this is done by step (5) below.
5052 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5053 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5054 redundant. Steps (3),(4) create the following:
5056 vp0 = &base_addr;
5057 LOOP: vp1 = phi(vp0,vp2)
5060 vp2 = vp1 + step
5061 goto LOOP
5063 If there is an inner-loop nested in loop, then step (5) will also be
5064 applied, and an additional update in the inner-loop will be created:
5066 vp0 = &base_addr;
5067 LOOP: vp1 = phi(vp0,vp2)
5069 inner: vp3 = phi(vp1,vp4)
5070 vp4 = vp3 + inner_step
5071 if () goto inner
5073 vp2 = vp1 + step
5074 if () goto LOOP */
5076 /* (2) Calculate the initial address of the aggregate-pointer, and set
5077 the aggregate-pointer to point to it before the loop. */
5079 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5081 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5082 stmt_info, &new_stmt_list,
5083 offset);
5084 if (new_stmt_list)
5086 if (pe)
5088 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5089 gcc_assert (!new_bb);
5091 else
5092 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5095 *initial_address = new_temp;
5096 aggr_ptr_init = new_temp;
5098 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5099 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5100 inner-loop nested in LOOP (during outer-loop vectorization). */
5102 /* No update in loop is required. */
5103 if (only_init && (!loop_vinfo || at_loop == loop))
5104 aptr = aggr_ptr_init;
5105 else
5107 /* Accesses to invariant addresses should be handled specially
5108 by the caller. */
5109 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5110 gcc_assert (!integer_zerop (step));
5112 if (iv_step == NULL_TREE)
5114 /* The step of the aggregate pointer is the type size,
5115 negated for downward accesses. */
5116 iv_step = TYPE_SIZE_UNIT (aggr_type);
5117 if (tree_int_cst_sgn (step) == -1)
5118 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5121 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5123 create_iv (aggr_ptr_init, PLUS_EXPR,
5124 fold_convert (aggr_ptr_type, iv_step),
5125 aggr_ptr, loop, &incr_gsi, insert_after,
5126 &indx_before_incr, &indx_after_incr);
5127 incr = gsi_stmt (incr_gsi);
5129 /* Copy the points-to information if it exists. */
5130 if (DR_PTR_INFO (dr))
5132 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5133 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5135 if (ptr_incr)
5136 *ptr_incr = incr;
5138 aptr = indx_before_incr;
5141 if (!nested_in_vect_loop || only_init)
5142 return aptr;
5145 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5146 nested in LOOP, if it exists. */
5148 gcc_assert (nested_in_vect_loop);
5149 if (!only_init)
5151 standard_iv_increment_position (containing_loop, &incr_gsi,
5152 &insert_after);
5153 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5154 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5155 &indx_before_incr, &indx_after_incr);
5156 incr = gsi_stmt (incr_gsi);
5158 /* Copy the points-to information if it exists. */
5159 if (DR_PTR_INFO (dr))
5161 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5162 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5164 if (ptr_incr)
5165 *ptr_incr = incr;
5167 return indx_before_incr;
5169 else
5170 gcc_unreachable ();
5174 /* Function bump_vector_ptr
5176 Increment a pointer (to a vector type) by vector-size. If requested,
5177 i.e. if PTR-INCR is given, then also connect the new increment stmt
5178 to the existing def-use update-chain of the pointer, by modifying
5179 the PTR_INCR as illustrated below:
5181 The pointer def-use update-chain before this function:
5182 DATAREF_PTR = phi (p_0, p_2)
5183 ....
5184 PTR_INCR: p_2 = DATAREF_PTR + step
5186 The pointer def-use update-chain after this function:
5187 DATAREF_PTR = phi (p_0, p_2)
5188 ....
5189 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5190 ....
5191 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5193 Input:
5194 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5195 in the loop.
5196 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5197 the loop. The increment amount across iterations is expected
5198 to be vector_size.
5199 BSI - location where the new update stmt is to be placed.
5200 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5201 BUMP - optional. The offset by which to bump the pointer. If not given,
5202 the offset is assumed to be vector_size.
5204 Output: Return NEW_DATAREF_PTR as illustrated above.
5208 tree
5209 bump_vector_ptr (vec_info *vinfo,
5210 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5211 stmt_vec_info stmt_info, tree bump)
5213 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5214 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5215 tree update = TYPE_SIZE_UNIT (vectype);
5216 gimple *incr_stmt;
5217 ssa_op_iter iter;
5218 use_operand_p use_p;
5219 tree new_dataref_ptr;
5221 if (bump)
5222 update = bump;
5224 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5225 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5226 else if (is_gimple_min_invariant (dataref_ptr))
5227 /* When possible avoid emitting a separate increment stmt that would
5228 force the addressed object to become addressable. */
5229 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5230 fold_build2 (MEM_REF,
5231 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5232 dataref_ptr,
5233 fold_convert (ptr_type_node, update)));
5234 else
5235 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5236 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5237 dataref_ptr, update);
5238 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5239 /* Fold the increment, avoiding excessive use-def chains that would
5240 lead to compile-time issues for passes run before the next forwprop
5241 pass, which would otherwise do this folding itself. */
5242 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5243 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5245 incr_stmt = gsi_stmt (fold_gsi);
5246 update_stmt (incr_stmt);
5249 /* Copy the points-to information if it exists. */
5250 if (DR_PTR_INFO (dr))
5252 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5253 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5256 if (!ptr_incr)
5257 return new_dataref_ptr;
5259 /* Update the vector-pointer's cross-iteration increment. */
5260 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5262 tree use = USE_FROM_PTR (use_p);
5264 if (use == dataref_ptr)
5265 SET_USE (use_p, new_dataref_ptr);
5266 else
5267 gcc_assert (operand_equal_p (use, update, 0));
5270 return new_dataref_ptr;
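/* Usage sketch (an illustration added for clarity, not a statement about
   any particular caller): for a V4SF vector type the default BUMP is
   TYPE_SIZE_UNIT (vectype), i.e. 16 bytes, so

     new_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
                                stmt_info, NULL_TREE);

   emits "new_ptr = dataref_ptr + 16" at GSI and, if PTR_INCR was
   "p_2 = dataref_ptr + step", rewires it to "p_2 = new_ptr + step" so the
   cross-iteration increment still covers the full step.  */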
5274 /* Copy memory reference info such as base/clique from the SRC reference
5275 to the DEST MEM_REF. */
5277 void
5278 vect_copy_ref_info (tree dest, tree src)
5280 if (TREE_CODE (dest) != MEM_REF)
5281 return;
5283 tree src_base = src;
5284 while (handled_component_p (src_base))
5285 src_base = TREE_OPERAND (src_base, 0);
5286 if (TREE_CODE (src_base) != MEM_REF
5287 && TREE_CODE (src_base) != TARGET_MEM_REF)
5288 return;
5290 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5291 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5295 /* Function vect_create_destination_var.
5297 Create a new temporary of type VECTYPE. */
5299 tree
5300 vect_create_destination_var (tree scalar_dest, tree vectype)
5302 tree vec_dest;
5303 const char *name;
5304 char *new_name;
5305 tree type;
5306 enum vect_var_kind kind;
5308 kind = vectype
5309 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5310 ? vect_mask_var
5311 : vect_simple_var
5312 : vect_scalar_var;
5313 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5315 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5317 name = get_name (scalar_dest);
5318 if (name)
5319 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5320 else
5321 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5322 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5323 free (new_name);
5325 return vec_dest;
5328 /* Function vect_grouped_store_supported.
5330 Returns TRUE if interleave high and interleave low permutations
5331 are supported, and FALSE otherwise. */
5333 bool
5334 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5336 machine_mode mode = TYPE_MODE (vectype);
5338 /* vect_permute_store_chain requires the group size to be equal to 3 or
5339 be a power of two. */
5340 if (count != 3 && exact_log2 (count) == -1)
5342 if (dump_enabled_p ())
5343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5344 "the size of the group of accesses"
5345 " is not a power of 2 or not eqaul to 3\n");
5346 return false;
5349 /* Check that the permutation is supported. */
5350 if (VECTOR_MODE_P (mode))
5352 unsigned int i;
5353 if (count == 3)
5355 unsigned int j0 = 0, j1 = 0, j2 = 0;
5356 unsigned int i, j;
5358 unsigned int nelt;
5359 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5361 if (dump_enabled_p ())
5362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5363 "cannot handle groups of 3 stores for"
5364 " variable-length vectors\n");
5365 return false;
5368 vec_perm_builder sel (nelt, nelt, 1);
5369 sel.quick_grow (nelt);
5370 vec_perm_indices indices;
5371 for (j = 0; j < 3; j++)
5373 int nelt0 = ((3 - j) * nelt) % 3;
5374 int nelt1 = ((3 - j) * nelt + 1) % 3;
5375 int nelt2 = ((3 - j) * nelt + 2) % 3;
5376 for (i = 0; i < nelt; i++)
5378 if (3 * i + nelt0 < nelt)
5379 sel[3 * i + nelt0] = j0++;
5380 if (3 * i + nelt1 < nelt)
5381 sel[3 * i + nelt1] = nelt + j1++;
5382 if (3 * i + nelt2 < nelt)
5383 sel[3 * i + nelt2] = 0;
5385 indices.new_vector (sel, 2, nelt);
5386 if (!can_vec_perm_const_p (mode, mode, indices))
5388 if (dump_enabled_p ())
5389 dump_printf (MSG_MISSED_OPTIMIZATION,
5390 "permutation op not supported by target.\n");
5391 return false;
5394 for (i = 0; i < nelt; i++)
5396 if (3 * i + nelt0 < nelt)
5397 sel[3 * i + nelt0] = 3 * i + nelt0;
5398 if (3 * i + nelt1 < nelt)
5399 sel[3 * i + nelt1] = 3 * i + nelt1;
5400 if (3 * i + nelt2 < nelt)
5401 sel[3 * i + nelt2] = nelt + j2++;
5403 indices.new_vector (sel, 2, nelt);
5404 if (!can_vec_perm_const_p (mode, mode, indices))
5406 if (dump_enabled_p ())
5407 dump_printf (MSG_MISSED_OPTIMIZATION,
5408 "permutation op not supported by target.\n");
5409 return false;
5412 return true;
5414 else
5416 /* If length is not equal to 3 then only power of 2 is supported. */
5417 gcc_assert (pow2p_hwi (count));
5418 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5420 /* The encoding has 2 interleaved stepped patterns. */
5421 if (!multiple_p (nelt, 2))
5422 return false;
5423 vec_perm_builder sel (nelt, 2, 3);
5424 sel.quick_grow (6);
5425 for (i = 0; i < 3; i++)
5427 sel[i * 2] = i;
5428 sel[i * 2 + 1] = i + nelt;
5430 vec_perm_indices indices (sel, 2, nelt);
5431 if (can_vec_perm_const_p (mode, mode, indices))
5433 for (i = 0; i < 6; i++)
5434 sel[i] += exact_div (nelt, 2);
5435 indices.new_vector (sel, 2, nelt);
5436 if (can_vec_perm_const_p (mode, mode, indices))
5437 return true;
5442 if (dump_enabled_p ())
5443 dump_printf (MSG_MISSED_OPTIMIZATION,
5444 "permutation op not supported by target.\n");
5445 return false;
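/* Illustration (hedged, assuming a vector mode with 4 elements): the two
   constant permutations checked on the power-of-two path above expand to

     { 0, 4, 1, 5 }   and   { 2, 6, 3, 7 }

   i.e. the interleave-high and interleave-low masks that
   vect_permute_store_chain generates for such a mode.  */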
5448 /* Return FN if vec_{,mask_,mask_len_}store_lanes is available for COUNT vectors
5449 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5451 internal_fn
5452 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5453 bool masked_p)
5455 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5456 vec_mask_len_store_lanes_optab, vectype,
5457 count))
5458 return IFN_MASK_LEN_STORE_LANES;
5459 else if (masked_p)
5461 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5462 vec_mask_store_lanes_optab, vectype,
5463 count))
5464 return IFN_MASK_STORE_LANES;
5466 else
5468 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5469 vec_store_lanes_optab, vectype, count))
5470 return IFN_STORE_LANES;
5472 return IFN_LAST;
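/* Usage sketch (illustrative only; GROUP_SIZE and LANES_IFN are hypothetical
   names, actual callers may differ):

     internal_fn lanes_ifn
       = vect_store_lanes_supported (vectype, group_size, masked_p);
     if (lanes_ifn != IFN_LAST)
       ... emit the group as a single store-lanes using LANES_IFN ...

   The len+mask form is preferred whenever the target provides it.  */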
5476 /* Function vect_permute_store_chain.
5478 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5479 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5480 the data correctly for the stores. Return the final references for stores
5481 in RESULT_CHAIN.
5483 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5484 The input is 4 vectors each containing 8 elements. We assign a number to
5485 each element, the input sequence is:
5487 1st vec: 0 1 2 3 4 5 6 7
5488 2nd vec: 8 9 10 11 12 13 14 15
5489 3rd vec: 16 17 18 19 20 21 22 23
5490 4th vec: 24 25 26 27 28 29 30 31
5492 The output sequence should be:
5494 1st vec: 0 8 16 24 1 9 17 25
5495 2nd vec: 2 10 18 26 3 11 19 27
5496 3rd vec: 4 12 20 28 5 13 21 29
5497 4th vec: 6 14 22 30 7 15 23 31
5499 i.e., we interleave the contents of the four vectors in their order.
5501 We use interleave_high/low instructions to create such output. The input of
5502 each interleave_high/low operation is two vectors:
5503 1st vec 2nd vec
5504 0 1 2 3 4 5 6 7
5505 the even elements of the result vector are obtained left-to-right from the
5506 high/low elements of the first vector. The odd elements of the result are
5507 obtained left-to-right from the high/low elements of the second vector.
5508 The output of interleave_high will be: 0 4 1 5
5509 and of interleave_low: 2 6 3 7
5512 The permutation is done in log LENGTH stages. In each stage interleave_high
5513 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5514 where the first argument is taken from the first half of DR_CHAIN and the
5515 second argument from its second half.
5516 In our example,
5518 I1: interleave_high (1st vec, 3rd vec)
5519 I2: interleave_low (1st vec, 3rd vec)
5520 I3: interleave_high (2nd vec, 4th vec)
5521 I4: interleave_low (2nd vec, 4th vec)
5523 The output for the first stage is:
5525 I1: 0 16 1 17 2 18 3 19
5526 I2: 4 20 5 21 6 22 7 23
5527 I3: 8 24 9 25 10 26 11 27
5528 I4: 12 28 13 29 14 30 15 31
5530 The output of the second stage, i.e. the final result is:
5532 I1: 0 8 16 24 1 9 17 25
5533 I2: 2 10 18 26 3 11 19 27
5534 I3: 4 12 20 28 5 13 21 29
5535 I4: 6 14 22 30 7 15 23 31. */
5537 void
5538 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5539 unsigned int length,
5540 stmt_vec_info stmt_info,
5541 gimple_stmt_iterator *gsi,
5542 vec<tree> *result_chain)
5544 tree vect1, vect2, high, low;
5545 gimple *perm_stmt;
5546 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5547 tree perm_mask_low, perm_mask_high;
5548 tree data_ref;
5549 tree perm3_mask_low, perm3_mask_high;
5550 unsigned int i, j, n, log_length = exact_log2 (length);
5552 result_chain->quick_grow (length);
5553 memcpy (result_chain->address (), dr_chain.address (),
5554 length * sizeof (tree));
5556 if (length == 3)
5558 /* vect_grouped_store_supported ensures that this is constant. */
5559 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5560 unsigned int j0 = 0, j1 = 0, j2 = 0;
5562 vec_perm_builder sel (nelt, nelt, 1);
5563 sel.quick_grow (nelt);
5564 vec_perm_indices indices;
5565 for (j = 0; j < 3; j++)
5567 int nelt0 = ((3 - j) * nelt) % 3;
5568 int nelt1 = ((3 - j) * nelt + 1) % 3;
5569 int nelt2 = ((3 - j) * nelt + 2) % 3;
5571 for (i = 0; i < nelt; i++)
5573 if (3 * i + nelt0 < nelt)
5574 sel[3 * i + nelt0] = j0++;
5575 if (3 * i + nelt1 < nelt)
5576 sel[3 * i + nelt1] = nelt + j1++;
5577 if (3 * i + nelt2 < nelt)
5578 sel[3 * i + nelt2] = 0;
5580 indices.new_vector (sel, 2, nelt);
5581 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5583 for (i = 0; i < nelt; i++)
5585 if (3 * i + nelt0 < nelt)
5586 sel[3 * i + nelt0] = 3 * i + nelt0;
5587 if (3 * i + nelt1 < nelt)
5588 sel[3 * i + nelt1] = 3 * i + nelt1;
5589 if (3 * i + nelt2 < nelt)
5590 sel[3 * i + nelt2] = nelt + j2++;
5592 indices.new_vector (sel, 2, nelt);
5593 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5595 vect1 = dr_chain[0];
5596 vect2 = dr_chain[1];
5598 /* Create interleaving stmt:
5599 low = VEC_PERM_EXPR <vect1, vect2,
5600 {j, nelt, *, j + 1, nelt + j + 1, *,
5601 j + 2, nelt + j + 2, *, ...}> */
5602 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5603 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5604 vect2, perm3_mask_low);
5605 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5607 vect1 = data_ref;
5608 vect2 = dr_chain[2];
5609 /* Create interleaving stmt:
5610 low = VEC_PERM_EXPR <vect1, vect2,
5611 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5612 6, 7, nelt + j + 2, ...}> */
5613 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5614 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5615 vect2, perm3_mask_high);
5616 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5617 (*result_chain)[j] = data_ref;
5620 else
5622 /* If length is not equal to 3 then only power of 2 is supported. */
5623 gcc_assert (pow2p_hwi (length));
5625 /* The encoding has 2 interleaved stepped patterns. */
5626 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5627 vec_perm_builder sel (nelt, 2, 3);
5628 sel.quick_grow (6);
5629 for (i = 0; i < 3; i++)
5631 sel[i * 2] = i;
5632 sel[i * 2 + 1] = i + nelt;
5634 vec_perm_indices indices (sel, 2, nelt);
5635 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5637 for (i = 0; i < 6; i++)
5638 sel[i] += exact_div (nelt, 2);
5639 indices.new_vector (sel, 2, nelt);
5640 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5642 for (i = 0, n = log_length; i < n; i++)
5644 for (j = 0; j < length/2; j++)
5646 vect1 = dr_chain[j];
5647 vect2 = dr_chain[j+length/2];
5649 /* Create interleaving stmt:
5650 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5651 ...}> */
5652 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5653 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5654 vect2, perm_mask_high);
5655 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5656 (*result_chain)[2*j] = high;
5658 /* Create interleaving stmt:
5659 low = VEC_PERM_EXPR <vect1, vect2,
5660 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5661 ...}> */
5662 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5663 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5664 vect2, perm_mask_low);
5665 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5666 (*result_chain)[2*j+1] = low;
5668 memcpy (dr_chain.address (), result_chain->address (),
5669 length * sizeof (tree));
5674 /* Function vect_setup_realignment
5676 This function is called when vectorizing an unaligned load using
5677 the dr_explicit_realign[_optimized] scheme.
5678 This function generates the following code at the loop prolog:
5680 p = initial_addr;
5681 x msq_init = *(floor(p)); # prolog load
5682 realignment_token = call target_builtin;
5683 loop:
5684 x msq = phi (msq_init, ---)
5686 The stmts marked with x are generated only for the case of
5687 dr_explicit_realign_optimized.
5689 The code above sets up a new (vector) pointer, pointing to the first
5690 location accessed by STMT_INFO, and a "floor-aligned" load using that
5691 pointer. It also generates code to compute the "realignment-token"
5692 (if the relevant target hook was defined), and creates a phi-node at the
5693 loop-header bb whose arguments are the result of the prolog-load (created
5694 by this function) and the result of a load that takes place in the loop
5695 (to be created by the caller to this function).
5697 For the case of dr_explicit_realign_optimized:
5698 The caller to this function uses the phi-result (msq) to create the
5699 realignment code inside the loop, and sets up the missing phi argument,
5700 as follows:
5701 loop:
5702 msq = phi (msq_init, lsq)
5703 lsq = *(floor(p')); # load in loop
5704 result = realign_load (msq, lsq, realignment_token);
5706 For the case of dr_explicit_realign:
5707 loop:
5708 msq = *(floor(p)); # load in loop
5709 p' = p + (VS-1);
5710 lsq = *(floor(p')); # load in loop
5711 result = realign_load (msq, lsq, realignment_token);
5713 Input:
5714 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5715 a memory location that may be unaligned.
5716 BSI - place where new code is to be inserted.
5717 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5718 is used.
5720 Output:
5721 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5722 target hook, if defined.
5723 Return value - the result of the loop-header phi node. */
5725 tree
5726 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5727 gimple_stmt_iterator *gsi, tree *realignment_token,
5728 enum dr_alignment_support alignment_support_scheme,
5729 tree init_addr,
5730 class loop **at_loop)
5732 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5733 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5734 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5735 struct data_reference *dr = dr_info->dr;
5736 class loop *loop = NULL;
5737 edge pe = NULL;
5738 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5739 tree vec_dest;
5740 gimple *inc;
5741 tree ptr;
5742 tree data_ref;
5743 basic_block new_bb;
5744 tree msq_init = NULL_TREE;
5745 tree new_temp;
5746 gphi *phi_stmt;
5747 tree msq = NULL_TREE;
5748 gimple_seq stmts = NULL;
5749 bool compute_in_loop = false;
5750 bool nested_in_vect_loop = false;
5751 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5752 class loop *loop_for_initial_load = NULL;
5754 if (loop_vinfo)
5756 loop = LOOP_VINFO_LOOP (loop_vinfo);
5757 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5760 gcc_assert (alignment_support_scheme == dr_explicit_realign
5761 || alignment_support_scheme == dr_explicit_realign_optimized);
5763 /* We need to generate three things:
5764 1. the misalignment computation
5765 2. the extra vector load (for the optimized realignment scheme).
5766 3. the phi node for the two vectors from which the realignment is
5767 done (for the optimized realignment scheme). */
5769 /* 1. Determine where to generate the misalignment computation.
5771 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5772 calculation will be generated by this function, outside the loop (in the
5773 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5774 caller, inside the loop.
5776 Background: If the misalignment remains fixed throughout the iterations of
5777 the loop, then both realignment schemes are applicable, and also the
5778 misalignment computation can be done outside LOOP. This is because we are
5779 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5780 are a multiple of VS (the Vector Size), and therefore the misalignment in
5781 different vectorized LOOP iterations is always the same.
5782 The problem arises only if the memory access is in an inner-loop nested
5783 inside LOOP, which is now being vectorized using outer-loop vectorization.
5784 This is the only case when the misalignment of the memory access may not
5785 remain fixed throughout the iterations of the inner-loop (as explained in
5786 detail in vect_supportable_dr_alignment). In this case, not only is the
5787 optimized realignment scheme not applicable, but also the misalignment
5788 computation (and the generation of the realignment token that is passed to
5789 REALIGN_LOAD) has to be done inside the loop.
5791 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5792 or not, which in turn determines if the misalignment is computed inside
5793 the inner-loop, or outside LOOP. */
5795 if (init_addr != NULL_TREE || !loop_vinfo)
5797 compute_in_loop = true;
5798 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5802 /* 2. Determine where to generate the extra vector load.
5804 For the optimized realignment scheme, instead of generating two vector
5805 loads in each iteration, we generate a single extra vector load in the
5806 preheader of the loop, and in each iteration reuse the result of the
5807 vector load from the previous iteration. In case the memory access is in
5808 an inner-loop nested inside LOOP, which is now being vectorized using
5809 outer-loop vectorization, we need to determine whether this initial vector
5810 load should be generated at the preheader of the inner-loop, or can be
5811 generated at the preheader of LOOP. If the memory access has no evolution
5812 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5813 to be generated inside LOOP (in the preheader of the inner-loop). */
5815 if (nested_in_vect_loop)
5817 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5818 bool invariant_in_outerloop =
5819 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5820 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5822 else
5823 loop_for_initial_load = loop;
5824 if (at_loop)
5825 *at_loop = loop_for_initial_load;
5827 tree vuse = NULL_TREE;
5828 if (loop_for_initial_load)
5830 pe = loop_preheader_edge (loop_for_initial_load);
5831 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5832 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5834 if (!vuse)
5835 vuse = gimple_vuse (gsi_stmt (*gsi));
5837 /* 3. For the case of the optimized realignment, create the first vector
5838 load at the loop preheader. */
5840 if (alignment_support_scheme == dr_explicit_realign_optimized)
5842 /* Create msq_init = *(floor(p1)) in the loop preheader */
5843 gassign *new_stmt;
5845 gcc_assert (!compute_in_loop);
5846 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5847 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5848 loop_for_initial_load, NULL_TREE,
5849 &init_addr, NULL, &inc, true);
5850 if (TREE_CODE (ptr) == SSA_NAME)
5851 new_temp = copy_ssa_name (ptr);
5852 else
5853 new_temp = make_ssa_name (TREE_TYPE (ptr));
5854 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5855 tree type = TREE_TYPE (ptr);
5856 new_stmt = gimple_build_assign
5857 (new_temp, BIT_AND_EXPR, ptr,
5858 fold_build2 (MINUS_EXPR, type,
5859 build_int_cst (type, 0),
5860 build_int_cst (type, align)));
5861 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5862 gcc_assert (!new_bb);
5863 data_ref
5864 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5865 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5866 vect_copy_ref_info (data_ref, DR_REF (dr));
5867 new_stmt = gimple_build_assign (vec_dest, data_ref);
5868 new_temp = make_ssa_name (vec_dest, new_stmt);
5869 gimple_assign_set_lhs (new_stmt, new_temp);
5870 gimple_set_vuse (new_stmt, vuse);
5871 if (pe)
5873 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5874 gcc_assert (!new_bb);
5876 else
5877 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5879 msq_init = gimple_assign_lhs (new_stmt);
5882 /* 4. Create realignment token using a target builtin, if available.
5883 It is done either inside the containing loop, or before LOOP (as
5884 determined above). */
5886 if (targetm.vectorize.builtin_mask_for_load)
5888 gcall *new_stmt;
5889 tree builtin_decl;
5891 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5892 if (!init_addr)
5894 /* Generate the INIT_ADDR computation outside LOOP. */
5895 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5896 stmt_info, &stmts,
5897 NULL_TREE);
5898 if (loop)
5900 pe = loop_preheader_edge (loop);
5901 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5902 gcc_assert (!new_bb);
5904 else
5905 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5908 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5909 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5910 vec_dest =
5911 vect_create_destination_var (scalar_dest,
5912 gimple_call_return_type (new_stmt));
5913 new_temp = make_ssa_name (vec_dest, new_stmt);
5914 gimple_call_set_lhs (new_stmt, new_temp);
5916 if (compute_in_loop)
5917 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5918 else
5920 /* Generate the misalignment computation outside LOOP. */
5921 pe = loop_preheader_edge (loop);
5922 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5923 gcc_assert (!new_bb);
5926 *realignment_token = gimple_call_lhs (new_stmt);
5928 /* The result of the CALL_EXPR to this builtin is determined from
5929 the value of the parameter and no global variables are touched,
5930 which makes the builtin a "const" function. Requiring the
5931 builtin to have the "const" attribute makes it unnecessary
5932 to call mark_call_clobbered. */
5933 gcc_assert (TREE_READONLY (builtin_decl));
5936 if (alignment_support_scheme == dr_explicit_realign)
5937 return msq;
5939 gcc_assert (!compute_in_loop);
5940 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5943 /* 5. Create msq = phi <msq_init, lsq> in loop */
5945 pe = loop_preheader_edge (containing_loop);
5946 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5947 msq = make_ssa_name (vec_dest);
5948 phi_stmt = create_phi_node (msq, containing_loop->header);
5949 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5951 return msq;
5955 /* Function vect_grouped_load_supported.
5957 COUNT is the size of the load group (the number of statements plus the
5958 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5959 only one statement, with a gap of COUNT - 1.
5961 Returns true if a suitable permute exists. */
5963 bool
5964 vect_grouped_load_supported (tree vectype, bool single_element_p,
5965 unsigned HOST_WIDE_INT count)
5967 machine_mode mode = TYPE_MODE (vectype);
5969 /* If this is single-element interleaving with an element distance
5970 that leaves unused vector loads around, punt - we at least create
5971 very sub-optimal code in that case (and blow up memory,
5972 see PR65518). */
5973 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5975 if (dump_enabled_p ())
5976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5977 "single-element interleaving not supported "
5978 "for not adjacent vector loads\n");
5979 return false;
5982 /* vect_permute_load_chain requires the group size to be equal to 3 or
5983 be a power of two. */
5984 if (count != 3 && exact_log2 (count) == -1)
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5988 "the size of the group of accesses"
5989 " is not a power of 2 or not equal to 3\n");
5990 return false;
5993 /* Check that the permutation is supported. */
5994 if (VECTOR_MODE_P (mode))
5996 unsigned int i, j;
5997 if (count == 3)
5999 unsigned int nelt;
6000 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6002 if (dump_enabled_p ())
6003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6004 "cannot handle groups of 3 loads for"
6005 " variable-length vectors\n");
6006 return false;
6009 vec_perm_builder sel (nelt, nelt, 1);
6010 sel.quick_grow (nelt);
6011 vec_perm_indices indices;
6012 unsigned int k;
6013 for (k = 0; k < 3; k++)
6015 for (i = 0; i < nelt; i++)
6016 if (3 * i + k < 2 * nelt)
6017 sel[i] = 3 * i + k;
6018 else
6019 sel[i] = 0;
6020 indices.new_vector (sel, 2, nelt);
6021 if (!can_vec_perm_const_p (mode, mode, indices))
6023 if (dump_enabled_p ())
6024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6025 "shuffle of 3 loads is not supported by"
6026 " target\n");
6027 return false;
6029 for (i = 0, j = 0; i < nelt; i++)
6030 if (3 * i + k < 2 * nelt)
6031 sel[i] = i;
6032 else
6033 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6034 indices.new_vector (sel, 2, nelt);
6035 if (!can_vec_perm_const_p (mode, mode, indices))
6037 if (dump_enabled_p ())
6038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6039 "shuffle of 3 loads is not supported by"
6040 " target\n");
6041 return false;
6044 return true;
6046 else
6048 /* If length is not equal to 3 then only power of 2 is supported. */
6049 gcc_assert (pow2p_hwi (count));
6050 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6052 /* The encoding has a single stepped pattern. */
6053 vec_perm_builder sel (nelt, 1, 3);
6054 sel.quick_grow (3);
6055 for (i = 0; i < 3; i++)
6056 sel[i] = i * 2;
6057 vec_perm_indices indices (sel, 2, nelt);
6058 if (can_vec_perm_const_p (mode, mode, indices))
6060 for (i = 0; i < 3; i++)
6061 sel[i] = i * 2 + 1;
6062 indices.new_vector (sel, 2, nelt);
6063 if (can_vec_perm_const_p (mode, mode, indices))
6064 return true;
6069 if (dump_enabled_p ())
6070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6071 "extract even/odd not supported by target\n");
6072 return false;
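/* Illustration (hedged, assuming a vector mode with 4 elements): the
   power-of-two path above requires the extract-even and extract-odd masks

     { 0, 2, 4, 6 }   and   { 1, 3, 5, 7 }

   which are the masks vect_permute_load_chain builds for such a mode.  */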
6075 /* Return FN if vec_{,mask_,mask_len_}load_lanes is available for COUNT vectors
6076 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6078 internal_fn
6079 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6080 bool masked_p)
6082 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6083 vec_mask_len_load_lanes_optab, vectype,
6084 count))
6085 return IFN_MASK_LEN_LOAD_LANES;
6086 else if (masked_p)
6088 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6089 vec_mask_load_lanes_optab, vectype,
6090 count))
6091 return IFN_MASK_LOAD_LANES;
6093 else
6095 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6096 vectype, count))
6097 return IFN_LOAD_LANES;
6099 return IFN_LAST;
6102 /* Function vect_permute_load_chain.
6104 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6105 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6106 the input data correctly. Return the final references for loads in
6107 RESULT_CHAIN.
6109 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6110 The input is 4 vectors each containing 8 elements. We assign a number to each
6111 element, the input sequence is:
6113 1st vec: 0 1 2 3 4 5 6 7
6114 2nd vec: 8 9 10 11 12 13 14 15
6115 3rd vec: 16 17 18 19 20 21 22 23
6116 4th vec: 24 25 26 27 28 29 30 31
6118 The output sequence should be:
6120 1st vec: 0 4 8 12 16 20 24 28
6121 2nd vec: 1 5 9 13 17 21 25 29
6122 3rd vec: 2 6 10 14 18 22 26 30
6123 4th vec: 3 7 11 15 19 23 27 31
6125 i.e., the first output vector should contain the first elements of each
6126 interleaving group, etc.
6128 We use extract_even/odd instructions to create such output. The input of
6129 each extract_even/odd operation is two vectors
6130 1st vec 2nd vec
6131 0 1 2 3 4 5 6 7
6133 and the output is the vector of extracted even/odd elements. The output of
6134 extract_even will be: 0 2 4 6
6135 and of extract_odd: 1 3 5 7
6138 The permutation is done in log LENGTH stages. In each stage extract_even
6139 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6140 their order. In our example,
6142 E1: extract_even (1st vec, 2nd vec)
6143 E2: extract_odd (1st vec, 2nd vec)
6144 E3: extract_even (3rd vec, 4th vec)
6145 E4: extract_odd (3rd vec, 4th vec)
6147 The output for the first stage will be:
6149 E1: 0 2 4 6 8 10 12 14
6150 E2: 1 3 5 7 9 11 13 15
6151 E3: 16 18 20 22 24 26 28 30
6152 E4: 17 19 21 23 25 27 29 31
6154 In order to proceed and create the correct sequence for the next stage (or
6155 for the correct output, if the second stage is the last one, as in our
6156 example), we first put the output of extract_even operation and then the
6157 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6158 The input for the second stage is:
6160 1st vec (E1): 0 2 4 6 8 10 12 14
6161 2nd vec (E3): 16 18 20 22 24 26 28 30
6162 3rd vec (E2): 1 3 5 7 9 11 13 15
6163 4th vec (E4): 17 19 21 23 25 27 29 31
6165 The output of the second stage:
6167 E1: 0 4 8 12 16 20 24 28
6168 E2: 2 6 10 14 18 22 26 30
6169 E3: 1 5 9 13 17 21 25 29
6170 E4: 3 7 11 15 19 23 27 31
6172 And RESULT_CHAIN after reordering:
6174 1st vec (E1): 0 4 8 12 16 20 24 28
6175 2nd vec (E3): 1 5 9 13 17 21 25 29
6176 3rd vec (E2): 2 6 10 14 18 22 26 30
6177 4th vec (E4): 3 7 11 15 19 23 27 31. */
6179 static void
6180 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6181 unsigned int length,
6182 stmt_vec_info stmt_info,
6183 gimple_stmt_iterator *gsi,
6184 vec<tree> *result_chain)
6186 tree data_ref, first_vect, second_vect;
6187 tree perm_mask_even, perm_mask_odd;
6188 tree perm3_mask_low, perm3_mask_high;
6189 gimple *perm_stmt;
6190 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6191 unsigned int i, j, log_length = exact_log2 (length);
6193 result_chain->quick_grow (length);
6194 memcpy (result_chain->address (), dr_chain.address (),
6195 length * sizeof (tree));
6197 if (length == 3)
6199 /* vect_grouped_load_supported ensures that this is constant. */
6200 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6201 unsigned int k;
6203 vec_perm_builder sel (nelt, nelt, 1);
6204 sel.quick_grow (nelt);
6205 vec_perm_indices indices;
6206 for (k = 0; k < 3; k++)
6208 for (i = 0; i < nelt; i++)
6209 if (3 * i + k < 2 * nelt)
6210 sel[i] = 3 * i + k;
6211 else
6212 sel[i] = 0;
6213 indices.new_vector (sel, 2, nelt);
6214 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6216 for (i = 0, j = 0; i < nelt; i++)
6217 if (3 * i + k < 2 * nelt)
6218 sel[i] = i;
6219 else
6220 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6221 indices.new_vector (sel, 2, nelt);
6222 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6224 first_vect = dr_chain[0];
6225 second_vect = dr_chain[1];
6227 /* Create interleaving stmt (low part of):
6228 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6229 ...}> */
6230 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6231 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6232 second_vect, perm3_mask_low);
6233 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6235 /* Create interleaving stmt (high part of):
6236 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6237 ...}> */
6238 first_vect = data_ref;
6239 second_vect = dr_chain[2];
6240 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6241 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6242 second_vect, perm3_mask_high);
6243 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6244 (*result_chain)[k] = data_ref;
6247 else
6249 /* If length is not equal to 3 then only power of 2 is supported. */
6250 gcc_assert (pow2p_hwi (length));
6252 /* The encoding has a single stepped pattern. */
6253 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6254 vec_perm_builder sel (nelt, 1, 3);
6255 sel.quick_grow (3);
6256 for (i = 0; i < 3; ++i)
6257 sel[i] = i * 2;
6258 vec_perm_indices indices (sel, 2, nelt);
6259 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6261 for (i = 0; i < 3; ++i)
6262 sel[i] = i * 2 + 1;
6263 indices.new_vector (sel, 2, nelt);
6264 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6266 for (i = 0; i < log_length; i++)
6268 for (j = 0; j < length; j += 2)
6270 first_vect = dr_chain[j];
6271 second_vect = dr_chain[j+1];
6273 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6274 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6275 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6276 first_vect, second_vect,
6277 perm_mask_even);
6278 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6279 (*result_chain)[j/2] = data_ref;
6281 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6282 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6283 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6284 first_vect, second_vect,
6285 perm_mask_odd);
6286 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6287 (*result_chain)[j/2+length/2] = data_ref;
6289 memcpy (dr_chain.address (), result_chain->address (),
6290 length * sizeof (tree));
6295 /* Function vect_shift_permute_load_chain.
6297 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6298 sequence of stmts to reorder the input data accordingly.
6299 Return the final references for loads in RESULT_CHAIN.
6300 Return true if successful, false otherwise.
6302 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6303 The input is 3 vectors each containing 8 elements. We assign a
6304 number to each element, the input sequence is:
6306 1st vec: 0 1 2 3 4 5 6 7
6307 2nd vec: 8 9 10 11 12 13 14 15
6308 3rd vec: 16 17 18 19 20 21 22 23
6310 The output sequence should be:
6312 1st vec: 0 3 6 9 12 15 18 21
6313 2nd vec: 1 4 7 10 13 16 19 22
6314 3rd vec: 2 5 8 11 14 17 20 23
6316 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6318 First we shuffle all 3 vectors to get correct elements order:
6320 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6321 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6322 3rd vec: (16 19 22) (17 20 23) (18 21)
6324 Next we unite and shift vector 3 times:
6326 1st step:
6327 shift right by 6 the concatenation of:
6328 "1st vec" and "2nd vec"
6329 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6330 "2nd vec" and "3rd vec"
6331 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6332 "3rd vec" and "1st vec"
6333 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6334 | New vectors |
6336 So that now new vectors are:
6338 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6339 2nd vec: (10 13) (16 19 22) (17 20 23)
6340 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6342 2nd step:
6343 shift right by 5 the concatenation of:
6344 "1st vec" and "3rd vec"
6345 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6346 "2nd vec" and "1st vec"
6347 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6348 "3rd vec" and "2nd vec"
6349 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6350 | New vectors |
6352 So that now new vectors are:
6354 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6355 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6356 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6358 3rd step:
6359 shift right by 5 the concatenation of:
6360 "1st vec" and "1st vec"
6361 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6362 shift right by 3 the concatenation of:
6363 "2nd vec" and "2nd vec"
6364 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6365 | New vectors |
6367 So that now all vectors are READY:
6368 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6369 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6370 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6372 This algorithm is faster than the one in vect_permute_load_chain if:
6373 1. "shift of a concatenation" is faster than general permutation.
6374 This is usually so.
6375 2. The TARGET machine can't execute vector instructions in parallel.
6376 This is because each step of the algorithm depends on the previous one.
6377 The algorithm in vect_permute_load_chain is much more parallel.
6379 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6382 static bool
6383 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6384 unsigned int length,
6385 stmt_vec_info stmt_info,
6386 gimple_stmt_iterator *gsi,
6387 vec<tree> *result_chain)
6389 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6390 tree perm2_mask1, perm2_mask2, perm3_mask;
6391 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6392 gimple *perm_stmt;
6394 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6395 machine_mode vmode = TYPE_MODE (vectype);
6396 unsigned int i;
6397 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6399 unsigned HOST_WIDE_INT nelt, vf;
6400 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6401 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6402 /* Not supported for variable-length vectors. */
6403 return false;
6405 vec_perm_builder sel (nelt, nelt, 1);
6406 sel.quick_grow (nelt);
6408 result_chain->quick_grow (length);
6409 memcpy (result_chain->address (), dr_chain.address (),
6410 length * sizeof (tree));
6412 if (pow2p_hwi (length) && vf > 4)
6414 unsigned int j, log_length = exact_log2 (length);
6415 for (i = 0; i < nelt / 2; ++i)
6416 sel[i] = i * 2;
6417 for (i = 0; i < nelt / 2; ++i)
6418 sel[nelt / 2 + i] = i * 2 + 1;
6419 vec_perm_indices indices (sel, 2, nelt);
6420 if (!can_vec_perm_const_p (vmode, vmode, indices))
6422 if (dump_enabled_p ())
6423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6424 "shuffle of 2 fields structure is not \
6425 supported by target\n");
6426 return false;
6428 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6430 for (i = 0; i < nelt / 2; ++i)
6431 sel[i] = i * 2 + 1;
6432 for (i = 0; i < nelt / 2; ++i)
6433 sel[nelt / 2 + i] = i * 2;
6434 indices.new_vector (sel, 2, nelt);
6435 if (!can_vec_perm_const_p (vmode, vmode, indices))
6437 if (dump_enabled_p ())
6438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6439 "shuffle of 2 fields structure is not \
6440 supported by target\n");
6441 return false;
6443 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6445 /* Generating permutation constant to shift all elements.
6446 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6447 for (i = 0; i < nelt; i++)
6448 sel[i] = nelt / 2 + i;
6449 indices.new_vector (sel, 2, nelt);
6450 if (!can_vec_perm_const_p (vmode, vmode, indices))
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6454 "shift permutation is not supported by target\n");
6455 return false;
6457 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6459 /* Generating permutation constant to select vector from 2.
6460 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6461 for (i = 0; i < nelt / 2; i++)
6462 sel[i] = i;
6463 for (i = nelt / 2; i < nelt; i++)
6464 sel[i] = nelt + i;
6465 indices.new_vector (sel, 2, nelt);
6466 if (!can_vec_perm_const_p (vmode, vmode, indices))
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 "select is not supported by target\n");
6471 return false;
6473 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6475 for (i = 0; i < log_length; i++)
6477 for (j = 0; j < length; j += 2)
6479 first_vect = dr_chain[j];
6480 second_vect = dr_chain[j + 1];
6482 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6483 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6484 first_vect, first_vect,
6485 perm2_mask1);
6486 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6487 vect[0] = data_ref;
6489 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6490 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6491 second_vect, second_vect,
6492 perm2_mask2);
6493 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6494 vect[1] = data_ref;
6496 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6497 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6498 vect[0], vect[1], shift1_mask);
6499 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6500 (*result_chain)[j/2 + length/2] = data_ref;
6502 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6503 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6504 vect[0], vect[1], select_mask);
6505 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6506 (*result_chain)[j/2] = data_ref;
6508 memcpy (dr_chain.address (), result_chain->address (),
6509 length * sizeof (tree));
6511 return true;
6513 if (length == 3 && vf > 2)
6515 unsigned int k = 0, l = 0;
6517 /* Generating permutation constant to get all elements in right order.
6518 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6519 for (i = 0; i < nelt; i++)
6521 if (3 * k + (l % 3) >= nelt)
6523 k = 0;
6524 l += (3 - (nelt % 3));
6526 sel[i] = 3 * k + (l % 3);
6527 k++;
6529 vec_perm_indices indices (sel, 2, nelt);
6530 if (!can_vec_perm_const_p (vmode, vmode, indices))
6532 if (dump_enabled_p ())
6533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6534 "shuffle of 3 fields structure is not \
6535 supported by target\n");
6536 return false;
6538 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6540 /* Generating permutation constant to shift all elements.
6541 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6542 for (i = 0; i < nelt; i++)
6543 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6544 indices.new_vector (sel, 2, nelt);
6545 if (!can_vec_perm_const_p (vmode, vmode, indices))
6547 if (dump_enabled_p ())
6548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6549 "shift permutation is not supported by target\n");
6550 return false;
6552 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6554 /* Generating permutation constant to shift all elements.
6555 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6556 for (i = 0; i < nelt; i++)
6557 sel[i] = 2 * (nelt / 3) + 1 + i;
6558 indices.new_vector (sel, 2, nelt);
6559 if (!can_vec_perm_const_p (vmode, vmode, indices))
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6563 "shift permutation is not supported by target\n");
6564 return false;
6566 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6568 /* Generating permutation constant to shift all elements.
6569 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6570 for (i = 0; i < nelt; i++)
6571 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6572 indices.new_vector (sel, 2, nelt);
6573 if (!can_vec_perm_const_p (vmode, vmode, indices))
6575 if (dump_enabled_p ())
6576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6577 "shift permutation is not supported by target\n");
6578 return false;
6580 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6582 /* Generating permutation constant to shift all elements.
6583 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6584 for (i = 0; i < nelt; i++)
6585 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6586 indices.new_vector (sel, 2, nelt);
6587 if (!can_vec_perm_const_p (vmode, vmode, indices))
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6591 "shift permutation is not supported by target\n");
6592 return false;
6594 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6596 for (k = 0; k < 3; k++)
6598 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6599 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6600 dr_chain[k], dr_chain[k],
6601 perm3_mask);
6602 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6603 vect[k] = data_ref;
6606 for (k = 0; k < 3; k++)
6608 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6609 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6610 vect[k % 3], vect[(k + 1) % 3],
6611 shift1_mask);
6612 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6613 vect_shift[k] = data_ref;
6616 for (k = 0; k < 3; k++)
6618 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6619 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6620 vect_shift[(4 - k) % 3],
6621 vect_shift[(3 - k) % 3],
6622 shift2_mask);
6623 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6624 vect[k] = data_ref;
6627 (*result_chain)[3 - (nelt % 3)] = vect[2];
6629 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6630 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6631 vect[0], shift3_mask);
6632 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6633 (*result_chain)[nelt % 3] = data_ref;
6635 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6636 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6637 vect[1], shift4_mask);
6638 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6639 (*result_chain)[0] = data_ref;
6640 return true;
6642 return false;
6645 /* Function vect_transform_grouped_load.
6647 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6648 to perform their permutation and ascribe the resulting vectorized statements
6649 to the scalar statements.
6652 void
6653 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6654 vec<tree> dr_chain,
6655 int size, gimple_stmt_iterator *gsi)
6657 machine_mode mode;
6658 vec<tree> result_chain = vNULL;
6660 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6661 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6662 permuted vectors, which are ready for vector computation. */
6663 result_chain.create (size);
6665 /* If the reassociation width for the vector type is 2 or greater, the target
6666 machine can execute 2 or more vector instructions in parallel. Otherwise
6667 try to get the chain for the load group using vect_shift_permute_load_chain. */
6668 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6669 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6670 || pow2p_hwi (size)
6671 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6672 gsi, &result_chain))
6673 vect_permute_load_chain (vinfo, dr_chain,
6674 size, stmt_info, gsi, &result_chain);
6675 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6676 result_chain.release ();
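/* Illustration (a hedged sketch of the dispatch above): for a group of
   SIZE == 3 loads on a target whose reassociation width for VEC_PERM_EXPR
   is 1, vect_shift_permute_load_chain is tried first and
   vect_permute_load_chain is used only if the shift-based masks are not
   supported; for a power-of-two group size the extract-even/odd scheme is
   used directly.  */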
6679 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6680 generated as part of the vectorization of STMT_INFO. Assign the statement
6681 for each vector to the associated scalar statement. */
6683 void
6684 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6685 vec<tree> result_chain)
6687 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6688 unsigned int i, gap_count;
6689 tree tmp_data_ref;
6691 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6692 Since we scan the chain starting from its first node, their order
6693 corresponds to the order of data-refs in RESULT_CHAIN. */
6694 stmt_vec_info next_stmt_info = first_stmt_info;
6695 gap_count = 1;
6696 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6698 if (!next_stmt_info)
6699 break;
6701 /* Skip the gaps. Loads created for the gaps will be removed by dead
6702 code elimination pass later. No need to check for the first stmt in
6703 the group, since it always exists.
6704 DR_GROUP_GAP is the number of steps in elements from the previous
6705 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6706 correspond to the gaps. */
6707 if (next_stmt_info != first_stmt_info
6708 && gap_count < DR_GROUP_GAP (next_stmt_info))
6710 gap_count++;
6711 continue;
6714 /* ??? The following needs cleanup after the removal of
6715 DR_GROUP_SAME_DR_STMT. */
6716 if (next_stmt_info)
6718 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6719 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6720 copies, and we put the new vector statement last. */
6721 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6723 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6724 gap_count = 1;
6729 /* Function vect_can_force_dr_alignment_p.
6731 Returns whether the alignment of a DECL can be forced to be aligned
6732 on ALIGNMENT bit boundary. */
6734 bool
6735 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6737 if (!VAR_P (decl))
6738 return false;
6740 if (decl_in_symtab_p (decl)
6741 && !symtab_node::get (decl)->can_increase_alignment_p ())
6742 return false;
6744 if (TREE_STATIC (decl))
6745 return (known_le (alignment,
6746 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6747 else
6748 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
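/* Illustration (hedged): for a file-scope "static int a[256];" the answer
   is true as long as ALIGNMENT does not exceed MAX_OFILE_ALIGNMENT, while
   for an automatic variable the limit is MAX_STACK_ALIGNMENT; a decl whose
   symbol-table node reports that its alignment cannot be increased is
   rejected regardless of ALIGNMENT.  */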
6751 /* Return whether the data reference DR_INFO is supported with respect to its
6752 alignment.
6753 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6754 if it is aligned, i.e., check if it is possible to vectorize it with different
6755 alignment. */
6757 enum dr_alignment_support
6758 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6759 tree vectype, int misalignment)
6761 data_reference *dr = dr_info->dr;
6762 stmt_vec_info stmt_info = dr_info->stmt;
6763 machine_mode mode = TYPE_MODE (vectype);
6764 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6765 class loop *vect_loop = NULL;
6766 bool nested_in_vect_loop = false;
6768 if (misalignment == 0)
6769 return dr_aligned;
6771 /* For now assume all conditional loads/stores support unaligned
6772 access without any special code. */
6773 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6774 if (gimple_call_internal_p (stmt)
6775 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6776 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6777 return dr_unaligned_supported;
6779 if (loop_vinfo)
6781 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6782 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6785 /* Possibly unaligned access. */
6787 /* We can choose between using the implicit realignment scheme (generating
6788 a misaligned_move stmt) and the explicit realignment scheme (generating
6789 aligned loads with a REALIGN_LOAD). There are two variants to the
6790 explicit realignment scheme: optimized, and unoptimized.
6791 We can optimize the realignment only if the step between consecutive
6792 vector loads is equal to the vector size. Since the vector memory
6793 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6794 is guaranteed that the misalignment amount remains the same throughout the
6795 execution of the vectorized loop. Therefore, we can create the
6796 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6797 at the loop preheader.
6799 However, in the case of outer-loop vectorization, when vectorizing a
6800 memory access in the inner-loop nested within the LOOP that is now being
6801 vectorized, while it is guaranteed that the misalignment of the
6802 vectorized memory access will remain the same in different outer-loop
6803 iterations, it is *not* guaranteed that it will remain the same throughout
6804 the execution of the inner-loop. This is because the inner-loop advances
6805 with the original scalar step (and not in steps of VS). If the inner-loop
6806 step happens to be a multiple of VS, then the misalignment remains fixed
6807 and we can use the optimized realignment scheme. For example:
6809 for (i=0; i<N; i++)
6810 for (j=0; j<M; j++)
6811 s += a[i+j];
6813 When vectorizing the i-loop in the above example, the step between
6814 consecutive vector loads is 1, and so the misalignment does not remain
6815 fixed across the execution of the inner-loop, and the realignment cannot
6816 be optimized (as illustrated in the following pseudo vectorized loop):
6818 for (i=0; i<N; i+=4)
6819 for (j=0; j<M; j++){
6820 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6821 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6822 // (assuming that we start from an aligned address).
6825 We therefore have to use the unoptimized realignment scheme:
6827 for (i=0; i<N; i+=4)
6828 for (j=k; j<M; j+=4)
6829 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6830 // that the misalignment of the initial address is
6831 // 0).
6833 The loop can then be vectorized as follows:
6835 for (k=0; k<4; k++){
6836 rt = get_realignment_token (&vp[k]);
6837 for (i=0; i<N; i+=4){
6838 v1 = vp[i+k];
6839 for (j=k; j<M; j+=4){
6840 v2 = vp[i+j+VS-1];
6841 va = REALIGN_LOAD <v1,v2,rt>;
6842 vs += va;
6843 v1 = v2;
6846 } */
6848 if (DR_IS_READ (dr))
6850 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6851 && (!targetm.vectorize.builtin_mask_for_load
6852 || targetm.vectorize.builtin_mask_for_load ()))
6854 /* If we are doing SLP then the accesses need not have the
6855 same alignment, instead it depends on the SLP group size. */
6856 if (loop_vinfo
6857 && STMT_SLP_TYPE (stmt_info)
6858 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
6859 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6860 * (DR_GROUP_SIZE
6861 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6862 TYPE_VECTOR_SUBPARTS (vectype))))
6864 else if (!loop_vinfo
6865 || (nested_in_vect_loop
6866 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6867 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6868 return dr_explicit_realign;
6869 else
6870 return dr_explicit_realign_optimized;
6874 bool is_packed = false;
6875 tree type = TREE_TYPE (DR_REF (dr));
6876 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6877 is_packed = not_size_aligned (DR_REF (dr));
6878 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6879 is_packed))
6880 return dr_unaligned_supported;
6882 /* Unsupported. */
6883 return dr_unaligned_unsupported;
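/* Summary sketch of the checks above (illustrative, not normative): an
   access with zero misalignment yields dr_aligned; IFN_MASK_LOAD/STORE
   calls are assumed dr_unaligned_supported; a misaligned read may fall
   back to the explicit realignment scheme (the optimized variant when the
   step equals the vector size); otherwise the target hook
   support_vector_misalignment decides between dr_unaligned_supported and
   dr_unaligned_unsupported.  */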