gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
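
/* As a rough illustration (not tied to any particular target): for
   VECTYPE V4SI and COUNT 2 the target must either provide an array mode
   for two V4SI vectors or an integer mode of 2 * 128 bits, and the
   vec_load_lanes / vec_store_lanes convert_optab must have a handler
   converting between that array mode and V4SImode; otherwise the
   load/store-lanes scheme is not used for the group.  */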
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
 204 /* For stores use the type of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
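
/* For example (assuming 128-bit vectors): in a loop containing only
     int_x = (int) short_y
   the lhs is 4 bytes but the rhs is 2 bytes, so this function returns the
   short type; the VF is then based on 8 shorts per vector rather than
   4 ints.  */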
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* If there is a loop invariant read involved we might vectorize it in
 286 the prologue, breaking scalar order with respect to the in-loop store. */
287 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
288 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
289 return false;
291 /* STMT_A and STMT_B belong to overlapping groups. All loads are
292 emitted at the position of the first scalar load.
293 Stores in a group are emitted at the position of the last scalar store.
294 Compute that position and check whether the resulting order matches
295 the current one. */
296 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
297 if (il_a)
299 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
300 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
301 s = DR_GROUP_NEXT_ELEMENT (s))
302 il_a = get_later_stmt (il_a, s);
303 else /* DR_IS_READ */
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 if (get_later_stmt (il_a, s) == il_a)
307 il_a = s;
309 else
310 il_a = stmtinfo_a;
311 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
312 if (il_b)
314 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
315 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
316 s = DR_GROUP_NEXT_ELEMENT (s))
317 il_b = get_later_stmt (il_b, s);
318 else /* DR_IS_READ */
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 if (get_later_stmt (il_b, s) == il_b)
322 il_b = s;
324 else
325 il_b = stmtinfo_b;
326 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
327 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
330 /* A subroutine of vect_analyze_data_ref_dependence. Handle
331 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
332 distances. These distances are conservatively correct but they don't
333 reflect a guaranteed dependence.
335 Return true if this function does all the work necessary to avoid
336 an alias or false if the caller should use the dependence distances
337 to limit the vectorization factor in the usual way. LOOP_DEPTH is
338 the depth of the loop described by LOOP_VINFO and the other arguments
339 are as for vect_analyze_data_ref_dependence. */
341 static bool
342 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 int loop_depth, unsigned int *max_vf)
346 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
347 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
349 int dist = dist_v[loop_depth];
350 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
352 /* If the user asserted safelen >= DIST consecutive iterations
353 can be executed concurrently, assume independence.
355 ??? An alternative would be to add the alias check even
356 in this case, and vectorize the fallback loop with the
357 maximum VF set to safelen. However, if the user has
358 explicitly given a length, it's less likely that that
359 would be a win. */
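 /* For instance, with
      #pragma omp simd safelen(4)
    a conservative dependence distance of 3 is ignored here (3 <= 4), and
    *max_vf is capped at 4 instead of adding a runtime alias check.  */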
360 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
362 if ((unsigned int) loop->safelen < *max_vf)
363 *max_vf = loop->safelen;
364 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
365 continue;
368 /* For dependence distances of 2 or more, we have the option
369 of limiting VF or checking for an alias at runtime.
370 Prefer to check at runtime if we can, to avoid limiting
371 the VF unnecessarily when the bases are in fact independent.
373 Note that the alias checks will be removed if the VF ends up
374 being small enough. */
375 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
376 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
377 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
378 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
379 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
382 return true;
386 /* Function vect_analyze_data_ref_dependence.
388 FIXME: I needed to change the sense of the returned flag.
390 Return FALSE if there (might) exist a dependence between a memory-reference
391 DRA and a memory-reference DRB. When versioning for alias may check a
392 dependence at run-time, return TRUE. Adjust *MAX_VF according to
393 the data dependence. */
395 static opt_result
396 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
397 loop_vec_info loop_vinfo,
398 unsigned int *max_vf)
400 unsigned int i;
401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
402 struct data_reference *dra = DDR_A (ddr);
403 struct data_reference *drb = DDR_B (ddr);
404 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
405 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
406 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
407 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
408 lambda_vector dist_v;
409 unsigned int loop_depth;
411 /* If user asserted safelen consecutive iterations can be
412 executed concurrently, assume independence. */
413 auto apply_safelen = [&]()
415 if (loop->safelen >= 2)
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return true;
422 return false;
425 /* In loop analysis all data references should be vectorizable. */
426 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
427 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
428 gcc_unreachable ();
430 /* Independent data accesses. */
431 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
432 return opt_result::success ();
434 if (dra == drb
435 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
436 return opt_result::success ();
438 /* We do not have to consider dependences between accesses that belong
439 to the same group, unless the stride could be smaller than the
440 group size. */
441 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
442 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
443 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
444 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
445 return opt_result::success ();
447 /* Even if we have an anti-dependence then, as the vectorized loop covers at
448 least two scalar iterations, there is always also a true dependence.
449 As the vectorizer does not re-order loads and stores we can ignore
450 the anti-dependence if TBAA can disambiguate both DRs similar to the
451 case with known negative distance anti-dependences (positive
452 distance anti-dependences would violate TBAA constraints). */
453 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
454 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
455 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
456 get_alias_set (DR_REF (drb))))
457 return opt_result::success ();
459 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
460 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
462 if (apply_safelen ())
463 return opt_result::success ();
465 return opt_result::failure_at
466 (stmtinfo_a->stmt,
467 "possible alias involving gather/scatter between %T and %T\n",
468 DR_REF (dra), DR_REF (drb));
471 /* Unknown data dependence. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
474 if (apply_safelen ())
475 return opt_result::success ();
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
479 "versioning for alias required: "
480 "can't determine dependence between %T and %T\n",
481 DR_REF (dra), DR_REF (drb));
483 /* Add to list of ddrs that need to be tested at run-time. */
484 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
487 /* Known data dependence. */
488 if (DDR_NUM_DIST_VECTS (ddr) == 0)
490 if (apply_safelen ())
491 return opt_result::success ();
493 if (dump_enabled_p ())
494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 "versioning for alias required: "
496 "bad dist vector for %T and %T\n",
497 DR_REF (dra), DR_REF (drb));
498 /* Add to list of ddrs that need to be tested at run-time. */
499 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
502 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
504 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
505 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
506 loop_depth, max_vf))
507 return opt_result::success ();
509 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
511 int dist = dist_v[loop_depth];
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance = %d.\n", dist);
517 if (dist == 0)
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance == 0 between %T and %T\n",
522 DR_REF (dra), DR_REF (drb));
524 /* When we perform grouped accesses and perform implicit CSE
525 by detecting equal accesses and doing disambiguation with
526 runtime alias tests like for
527 .. = a[i];
528 .. = a[i+1];
529 a[i] = ..;
530 a[i+1] = ..;
531 *p = ..;
532 .. = a[i];
533 .. = a[i+1];
534 where we will end up loading { a[i], a[i+1] } once, make
535 sure that inserting group loads before the first load and
536 stores after the last store will do the right thing.
537 Similar for groups like
538 a[i] = ...;
539 ... = a[i];
540 a[i+1] = ...;
541 where loads from the group interleave with the store. */
542 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
543 return opt_result::failure_at (stmtinfo_a->stmt,
544 "READ_WRITE dependence"
545 " in interleaving.\n");
547 if (loop->safelen < 2)
549 tree indicator = dr_zero_step_indicator (dra);
550 if (!indicator || integer_zerop (indicator))
551 return opt_result::failure_at (stmtinfo_a->stmt,
552 "access also has a zero step\n");
553 else if (TREE_CODE (indicator) != INTEGER_CST)
554 vect_check_nonzero_value (loop_vinfo, indicator);
556 continue;
559 if (dist > 0 && DDR_REVERSED_P (ddr))
561 /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 reversed (to make distance vector positive), and the actual
563 distance is negative. */
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "dependence distance negative.\n");
567 /* When doing outer loop vectorization, we need to check if there is
568 a backward dependence at the inner loop level if the dependence
569 at the outer loop is reversed. See PR81740. */
570 if (nested_in_vect_loop_p (loop, stmtinfo_a)
571 || nested_in_vect_loop_p (loop, stmtinfo_b))
573 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 DDR_LOOP_NEST (ddr));
575 if (dist_v[inner_depth] < 0)
576 return opt_result::failure_at (stmtinfo_a->stmt,
577 "not vectorized, dependence "
578 "between data-refs %T and %T\n",
579 DR_REF (dra), DR_REF (drb));
581 /* Record a negative dependence distance to later limit the
582 amount of stmt copying / unrolling we can perform.
583 Only need to handle read-after-write dependence. */
584 if (DR_IS_READ (drb)
585 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
586 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
587 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
588 continue;
591 unsigned int abs_dist = abs (dist);
592 if (abs_dist >= 2 && abs_dist < *max_vf)
594 /* The dependence distance requires reduction of the maximal
595 vectorization factor. */
596 *max_vf = abs_dist;
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "adjusting maximal vectorization factor to %i\n",
600 *max_vf);
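 /* Example: for
      for (i = 0; i < n; ++i)
        a[i + 3] = a[i] + 1;
    the dependence distance is 3; with VF 4 the vector load of
    a[i .. i+3] would read a[i+3] before the store from iteration i,
    so the maximal VF is reduced to 3.  */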
603 if (abs_dist >= *max_vf)
605 /* Dependence distance does not create dependence, as far as
606 vectorization is concerned, in this case. */
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "dependence distance >= VF.\n");
610 continue;
613 return opt_result::failure_at (stmtinfo_a->stmt,
614 "not vectorized, possible dependence "
615 "between data-refs %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
619 return opt_result::success ();
622 /* Function vect_analyze_early_break_dependences.
624 Examine all the data references in the loop and make sure that if we have
625 multiple exits that we are able to safely move stores such that they become
626 safe for vectorization. The function also calculates the place where to move
627 the instructions to and computes what the new vUSE chain should be.
629 This works in tandem with the CFG that will be produced by
630 slpeel_tree_duplicate_loop_to_edge_cfg later on.
632 This function tries to validate whether an early break vectorization
 633 is possible for the current instruction sequence. Returns true if
 634 possible, otherwise false.
636 Requirements:
637 - Any memory access must be to a fixed size buffer.
638 - There must not be any loads and stores to the same object.
639 - Multiple loads are allowed as long as they don't alias.
641 NOTE:
 642 This implementation is very conservative. Any overlapping loads/stores
 643 that take place before the early break statement get rejected, aside from
 644 WAR dependencies.
646 i.e.:
648 a[i] = 8
649 c = a[i]
650 if (b[i])
653 is not allowed, but
655 c = a[i]
656 a[i] = 8
657 if (b[i])
 660 is, which is the common case. */
662 static opt_result
663 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
665 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
667 /* List of all load data references found during traversal. */
668 auto_vec<data_reference *> bases;
669 basic_block dest_bb = NULL;
671 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
672 class loop *loop_nest = loop_outer (loop);
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_NOTE, vect_location,
676 "loop contains multiple exits, analyzing"
677 " statement dependencies.\n");
679 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "alternate exit has been chosen as main exit.\n");
684 /* Since we don't support general control flow, the location we'll move the
685 side-effects to is always the latch connected exit. When we support
686 general control flow we can do better but for now this is fine. Move
687 side-effects to the in-loop destination of the last early exit. For the PEELED
688 case we move the side-effects to the latch block as this is guaranteed to be the
 689 last block to be executed when a vector iteration finishes. */
690 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
691 dest_bb = loop->latch;
692 else
693 dest_bb = single_pred (loop->latch);
695 /* We start looking from dest_bb, for the non-PEELED case we don't want to
696 move any stores already present, but we do want to read and validate the
697 loads. */
698 basic_block bb = dest_bb;
 700 /* In the peeled case we need to check all the loads in the loop, since to
 701 move the stores we lift them over all loads into the latch. */
702 bool check_deps = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
706 gimple_stmt_iterator gsi = gsi_last_bb (bb);
708 /* Now analyze all the remaining statements and try to determine which
709 instructions are allowed/needed to be moved. */
710 while (!gsi_end_p (gsi))
712 gimple *stmt = gsi_stmt (gsi);
713 gsi_prev (&gsi);
714 if (!gimple_has_ops (stmt)
715 || is_gimple_debug (stmt))
716 continue;
718 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
719 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
720 if (!dr_ref)
721 continue;
 723 /* Check whether vector accesses to the object will be within bounds:
 724 the object size must be a constant, or we must assume the loop will be
 725 versioned or its niters bounded by VF so accesses are within range.
 726 We only need to check the reads, since writes are moved to a safe place
 727 where, if we get there, we know they are safe to perform. */
728 if (DR_IS_READ (dr_ref)
729 && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
731 if (dump_enabled_p ())
732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
733 "early breaks not supported: vectorization "
734 "would %s beyond size of obj.",
735 DR_IS_READ (dr_ref) ? "read" : "write");
736 return opt_result::failure_at (stmt,
737 "can't safely apply code motion to "
738 "dependencies of %G to vectorize "
739 "the early exit.\n", stmt);
742 if (!check_deps)
743 continue;
745 if (DR_IS_READ (dr_ref))
746 bases.safe_push (dr_ref);
747 else if (DR_IS_WRITE (dr_ref))
749 /* We are moving writes down in the CFG. To be sure that this
750 is valid after vectorization we have to check all the loads
751 we are sinking the stores past to see if any of them may
752 alias or are the same object.
754 Same objects will not be an issue because unless the store
755 is marked volatile the value can be forwarded. If the
756 store is marked volatile we don't vectorize the loop
757 anyway.
759 That leaves the check for aliasing. We don't really need
760 to care about the stores aliasing with each other since the
761 stores are moved in order so the effects are still observed
762 correctly. This leaves the check for WAR dependencies
763 which we would be introducing here if the DR can alias.
764 The check is quadratic in loads/stores but I have not found
765 a better API to do this. I believe all loads and stores
766 must be checked. We also must check them when we
767 encountered the store, since we don't care about loads past
768 the store. */
770 for (auto dr_read : bases)
771 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
775 vect_location,
776 "early breaks not supported: "
777 "overlapping loads and stores "
778 "found before the break "
779 "statement.\n");
781 return opt_result::failure_at (stmt,
782 "can't safely apply code motion to dependencies"
783 " to vectorize the early exit. %G may alias with"
784 " %G\n", stmt, dr_read->stmt);
788 if (gimple_vdef (stmt))
790 if (dump_enabled_p ())
791 dump_printf_loc (MSG_NOTE, vect_location,
792 "==> recording stmt %G", stmt);
794 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
796 else if (gimple_vuse (stmt))
798 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
799 if (dump_enabled_p ())
800 dump_printf_loc (MSG_NOTE, vect_location,
801 "marked statement for vUSE update: %G", stmt);
805 if (!single_pred_p (bb))
807 gcc_assert (bb == loop->header);
808 break;
811 /* For the non-PEELED case we don't want to check the loads in the IV exit block
 812 for dependencies with the stores, but any block preceding it we do. */
813 check_deps = true;
814 bb = single_pred (bb);
816 while (1);
818 /* We don't allow outer -> inner loop transitions which should have been
819 trapped already during loop form analysis. */
820 gcc_assert (dest_bb->loop_father == loop);
822 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
824 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
826 /* All uses shall be updated to that of the first load. Entries are
827 stored in reverse order. */
828 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
829 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
831 if (dump_enabled_p ())
832 dump_printf_loc (MSG_NOTE, vect_location,
833 "will update use: %T, mem_ref: %G", vuse, g);
837 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location,
839 "recorded statements to be moved to BB %d\n",
840 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
842 return opt_result::success ();
845 /* Function vect_analyze_data_ref_dependences.
847 Examine all the data references in the loop, and make sure there do not
848 exist any data dependences between them. Set *MAX_VF according to
849 the maximum vectorization factor the data dependences allow. */
851 opt_result
852 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
853 unsigned int *max_vf)
855 unsigned int i;
856 struct data_dependence_relation *ddr;
858 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
860 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
862 LOOP_VINFO_DDRS (loop_vinfo)
863 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
864 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
865 /* We do not need read-read dependences. */
866 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
867 &LOOP_VINFO_DDRS (loop_vinfo),
868 LOOP_VINFO_LOOP_NEST (loop_vinfo),
869 false);
870 gcc_assert (res);
873 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
875 /* For epilogues we either have no aliases or alias versioning
876 was applied to original loop. Therefore we may just get max_vf
877 using VF of original loop. */
878 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
879 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
880 else
881 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
883 opt_result res
884 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
885 if (!res)
886 return res;
889 /* If we have early break statements in the loop, check to see if they
 890 are of a form we can vectorize. */
891 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
892 return vect_analyze_early_break_dependences (loop_vinfo);
894 return opt_result::success ();
898 /* Function vect_slp_analyze_data_ref_dependence.
900 Return TRUE if there (might) exist a dependence between a memory-reference
901 DRA and a memory-reference DRB for VINFO. When versioning for alias
902 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
903 according to the data dependence. */
905 static bool
906 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
907 struct data_dependence_relation *ddr)
909 struct data_reference *dra = DDR_A (ddr);
910 struct data_reference *drb = DDR_B (ddr);
911 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
912 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
914 /* We need to check dependences of statements marked as unvectorizable
915 as well, they still can prohibit vectorization. */
917 /* Independent data accesses. */
918 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
919 return false;
921 if (dra == drb)
922 return false;
924 /* Read-read is OK. */
925 if (DR_IS_READ (dra) && DR_IS_READ (drb))
926 return false;
928 /* If dra and drb are part of the same interleaving chain consider
929 them independent. */
930 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
931 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
932 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
933 return false;
935 /* Unknown data dependence. */
936 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
938 if (dump_enabled_p ())
939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
940 "can't determine dependence between %T and %T\n",
941 DR_REF (dra), DR_REF (drb));
943 else if (dump_enabled_p ())
944 dump_printf_loc (MSG_NOTE, vect_location,
945 "determined dependence between %T and %T\n",
946 DR_REF (dra), DR_REF (drb));
948 return true;
952 /* Analyze dependences involved in the transform of a store SLP NODE. */
954 static bool
955 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
957 /* This walks over all stmts involved in the SLP store done
958 in NODE verifying we can sink them up to the last stmt in the
959 group. */
960 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
961 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
963 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
965 stmt_vec_info access_info
966 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
967 if (access_info == last_access_info)
968 continue;
969 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
970 ao_ref ref;
971 bool ref_initialized_p = false;
972 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
973 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
975 gimple *stmt = gsi_stmt (gsi);
976 if (! gimple_vuse (stmt))
977 continue;
979 /* If we couldn't record a (single) data reference for this
980 stmt we have to resort to the alias oracle. */
981 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
982 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
983 if (!dr_b)
985 /* We are moving a store - this means
986 we cannot use TBAA for disambiguation. */
987 if (!ref_initialized_p)
988 ao_ref_init (&ref, DR_REF (dr_a));
989 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
990 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
991 return false;
992 continue;
995 gcc_assert (!gimple_visited_p (stmt));
997 ddr_p ddr = initialize_data_dependence_relation (dr_a,
998 dr_b, vNULL);
999 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1000 free_dependence_relation (ddr);
1001 if (dependent)
1002 return false;
1005 return true;
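
/* As an illustration: for an SLP store group
     a[0] = x; *p = y; a[1] = z;
   sinking a[0] = x down to the position of a[1] = z is only valid if the
   intervening *p = y cannot alias the a[] accesses, which is what the walk
   above checks (falling back to the alias oracle when no single data
   reference was recorded for a statement).  */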
1008 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1009 contain the vector of scalar stores of this instance if we are
1010 disambiguating the loads. */
1012 static bool
1013 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1014 vec<stmt_vec_info> stores,
1015 stmt_vec_info last_store_info)
1017 /* This walks over all stmts involved in the SLP load done
1018 in NODE verifying we can hoist them up to the first stmt in the
1019 group. */
1020 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1021 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1023 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1025 stmt_vec_info access_info
1026 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1027 if (access_info == first_access_info)
1028 continue;
1029 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1030 ao_ref ref;
1031 bool ref_initialized_p = false;
1032 hash_set<stmt_vec_info> grp_visited;
1033 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1034 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1036 gimple *stmt = gsi_stmt (gsi);
1037 if (! gimple_vdef (stmt))
1038 continue;
1040 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1042 /* If we run into a store of this same instance (we've just
1043 marked those) then delay dependence checking until we run
1044 into the last store because this is where it will have
1045 been sunk to (and we verified that we can do that already). */
1046 if (gimple_visited_p (stmt))
1048 if (stmt_info != last_store_info)
1049 continue;
1051 for (stmt_vec_info &store_info : stores)
1053 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1054 ddr_p ddr = initialize_data_dependence_relation
1055 (dr_a, store_dr, vNULL);
1056 bool dependent
1057 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1058 free_dependence_relation (ddr);
1059 if (dependent)
1060 return false;
1062 continue;
1065 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1067 /* We are hoisting a load - this means we can use TBAA for
1068 disambiguation. */
1069 if (!ref_initialized_p)
1070 ao_ref_init (&ref, DR_REF (dr_a));
1071 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1073 /* If we couldn't record a (single) data reference for this
1074 stmt we have to give up now. */
1075 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1076 if (!dr_b)
1077 return false;
1078 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1079 dr_b, vNULL);
1080 bool dependent
1081 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1082 free_dependence_relation (ddr);
1083 if (dependent)
1084 return false;
1086 /* No dependence. */
1087 return true;
1089 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1091 /* When we run into a store group we have to honor
1092 that earlier stores might be moved here. We don't
1093 know exactly which and where to since we lack a
1094 back-mapping from DR to SLP node, so assume all
1095 earlier stores are sunk here. It's enough to
1096 consider the last stmt of a group for this.
1097 ??? Both this and the fact that we disregard that
1098 the conflicting instance might be removed later
1099 is overly conservative. */
1100 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1101 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1102 store_info != NULL;
1103 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1104 if ((store_info == stmt_info
1105 || get_later_stmt (store_info, stmt_info) == stmt_info)
1106 && !check_hoist (store_info))
1107 return false;
1109 else
1111 if (!check_hoist (stmt_info))
1112 return false;
1116 return true;
1120 /* Function vect_analyze_data_ref_dependences.
1122 Examine all the data references in the basic-block, and make sure there
1123 do not exist any data dependences between them. Set *MAX_VF according to
1124 the maximum vectorization factor the data dependences allow. */
1126 bool
1127 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1129 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1131 /* The stores of this instance are at the root of the SLP tree. */
1132 slp_tree store = NULL;
1133 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1134 store = SLP_INSTANCE_TREE (instance);
1136 /* Verify we can sink stores to the vectorized stmt insert location. */
1137 stmt_vec_info last_store_info = NULL;
1138 if (store)
1140 if (! vect_slp_analyze_store_dependences (vinfo, store))
1141 return false;
1143 /* Mark stores in this instance and remember the last one. */
1144 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1145 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1146 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1149 bool res = true;
1151 /* Verify we can sink loads to the vectorized stmt insert location,
1152 special-casing stores of this instance. */
1153 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1154 if (! vect_slp_analyze_load_dependences (vinfo, load,
1155 store
1156 ? SLP_TREE_SCALAR_STMTS (store)
1157 : vNULL, last_store_info))
1159 res = false;
1160 break;
1163 /* Unset the visited flag. */
1164 if (store)
1165 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1166 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1168 return res;
1171 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1172 applied. */
1175 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1177 HOST_WIDE_INT diff = 0;
1178 /* Alignment is only analyzed for the first element of a DR group,
1179 use that but adjust misalignment by the offset of the access. */
1180 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1182 dr_vec_info *first_dr
1183 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1184 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1185 INTEGER_CSTs and the first element in the group has the lowest
1186 address. */
1187 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1188 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1189 gcc_assert (diff >= 0);
1190 dr_info = first_dr;
1193 int misalign = dr_info->misalignment;
1194 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1195 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1196 return misalign;
1198 /* If the access is only aligned for a vector type with smaller alignment
1199 requirement the access has unknown misalignment. */
1200 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1201 targetm.vectorize.preferred_vector_alignment (vectype)))
1202 return DR_MISALIGNMENT_UNKNOWN;
1204 /* Apply the offset from the DR group start and the externally supplied
1205 offset which can for example result from a negative stride access. */
1206 poly_int64 misalignment = misalign + diff + offset;
1208 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1209 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1210 unsigned HOST_WIDE_INT target_alignment_c
1211 = dr_info->target_alignment.to_constant ();
1212 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1213 return DR_MISALIGNMENT_UNKNOWN;
1214 return misalign;
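
/* Worked example (assuming a 16-byte target alignment): if the first
   element of the group has misalignment 8 and this DR is 4 bytes further
   into the group (DR_INIT difference of 4), then with a zero OFFSET the
   returned misalignment is (8 + 4) % 16 == 12.  */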
1217 /* Record the base alignment guarantee given by DRB, which occurs
1218 in STMT_INFO. */
1220 static void
1221 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1222 innermost_loop_behavior *drb)
1224 bool existed;
1225 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1226 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1227 if (!existed || entry.second->base_alignment < drb->base_alignment)
1229 entry = std::make_pair (stmt_info, drb);
1230 if (dump_enabled_p ())
1231 dump_printf_loc (MSG_NOTE, vect_location,
1232 "recording new base alignment for %T\n"
1233 " alignment: %d\n"
1234 " misalignment: %d\n"
1235 " based on: %G",
1236 drb->base_address,
1237 drb->base_alignment,
1238 drb->base_misalignment,
1239 stmt_info->stmt);
1243 /* If the region we're going to vectorize is reached, all unconditional
1244 data references occur at least once. We can therefore pool the base
1245 alignment guarantees from each unconditional reference. Do this by
1246 going through all the data references in VINFO and checking whether
1247 the containing statement makes the reference unconditionally. If so,
1248 record the alignment of the base address in VINFO so that it can be
1249 used for all other references with the same base. */
1251 void
1252 vect_record_base_alignments (vec_info *vinfo)
1254 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1255 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1256 for (data_reference *dr : vinfo->shared->datarefs)
1258 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1259 stmt_vec_info stmt_info = dr_info->stmt;
1260 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1261 && STMT_VINFO_VECTORIZABLE (stmt_info)
1262 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1264 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1266 /* If DR is nested in the loop that is being vectorized, we can also
1267 record the alignment of the base wrt the outer loop. */
1268 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1269 vect_record_base_alignment
1270 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1275 /* Function vect_compute_data_ref_alignment
1277 Compute the misalignment of the data reference DR_INFO when vectorizing
1278 with VECTYPE.
1280 Output:
1281 1. initialized misalignment info for DR_INFO
1283 FOR NOW: No analysis is actually performed. Misalignment is calculated
1284 only for trivial cases. TODO. */
1286 static void
1287 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1288 tree vectype)
1290 stmt_vec_info stmt_info = dr_info->stmt;
1291 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1292 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1293 class loop *loop = NULL;
1294 tree ref = DR_REF (dr_info->dr);
1296 if (dump_enabled_p ())
1297 dump_printf_loc (MSG_NOTE, vect_location,
1298 "vect_compute_data_ref_alignment:\n");
1300 if (loop_vinfo)
1301 loop = LOOP_VINFO_LOOP (loop_vinfo);
1303 /* Initialize misalignment to unknown. */
1304 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1306 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1307 return;
1309 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1310 bool step_preserves_misalignment_p;
1312 poly_uint64 vector_alignment
1313 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1314 BITS_PER_UNIT);
1315 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1317 /* If the main loop has peeled for alignment we have no way of knowing
1318 whether the data accesses in the epilogues are aligned. We can't at
1319 compile time answer the question whether we have entered the main loop or
1320 not. Fixes PR 92351. */
1321 if (loop_vinfo)
1323 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1324 if (orig_loop_vinfo
1325 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1326 return;
1329 unsigned HOST_WIDE_INT vect_align_c;
1330 if (!vector_alignment.is_constant (&vect_align_c))
1331 return;
1333 /* No step for BB vectorization. */
1334 if (!loop)
1336 gcc_assert (integer_zerop (drb->step));
1337 step_preserves_misalignment_p = true;
1340 /* In case the dataref is in an inner-loop of the loop that is being
1341 vectorized (LOOP), we use the base and misalignment information
1342 relative to the outer-loop (LOOP). This is ok only if the misalignment
1343 stays the same throughout the execution of the inner-loop, which is why
1344 we have to check that the stride of the dataref in the inner-loop evenly
1345 divides by the vector alignment. */
1346 else if (nested_in_vect_loop_p (loop, stmt_info))
1348 step_preserves_misalignment_p
1349 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1351 if (dump_enabled_p ())
1353 if (step_preserves_misalignment_p)
1354 dump_printf_loc (MSG_NOTE, vect_location,
1355 "inner step divides the vector alignment.\n");
1356 else
1357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1358 "inner step doesn't divide the vector"
1359 " alignment.\n");
1363 /* Similarly we can only use base and misalignment information relative to
1364 an innermost loop if the misalignment stays the same throughout the
1365 execution of the loop. As above, this is the case if the stride of
1366 the dataref evenly divides by the alignment. */
1367 else
1369 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1370 step_preserves_misalignment_p
1371 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1373 if (!step_preserves_misalignment_p && dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 "step doesn't divide the vector alignment.\n");
1378 unsigned int base_alignment = drb->base_alignment;
1379 unsigned int base_misalignment = drb->base_misalignment;
1381 /* Calculate the maximum of the pooled base address alignment and the
1382 alignment that we can compute for DR itself. */
1383 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1384 = base_alignments->get (drb->base_address);
1385 if (entry
1386 && base_alignment < (*entry).second->base_alignment
1387 && (loop_vinfo
1388 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1389 gimple_bb (entry->first->stmt))
1390 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1391 || (entry->first->dr_aux.group <= dr_info->group)))))
1393 base_alignment = entry->second->base_alignment;
1394 base_misalignment = entry->second->base_misalignment;
1397 if (drb->offset_alignment < vect_align_c
1398 || !step_preserves_misalignment_p
1399 /* We need to know whether the step wrt the vectorized loop is
1400 negative when computing the starting misalignment below. */
1401 || TREE_CODE (drb->step) != INTEGER_CST)
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1405 "Unknown alignment for access: %T\n", ref);
1406 return;
1409 if (base_alignment < vect_align_c)
1411 unsigned int max_alignment;
1412 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1413 if (max_alignment < vect_align_c
1414 || !vect_can_force_dr_alignment_p (base,
1415 vect_align_c * BITS_PER_UNIT))
1417 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_NOTE, vect_location,
1419 "can't force alignment of ref: %T\n", ref);
1420 return;
1423 /* Force the alignment of the decl.
1424 NOTE: This is the only change to the code we make during
1425 the analysis phase, before deciding to vectorize the loop. */
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "force alignment of %T\n", ref);
1430 dr_info->base_decl = base;
1431 dr_info->base_misaligned = true;
1432 base_misalignment = 0;
1434 poly_int64 misalignment
1435 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1437 unsigned int const_misalignment;
1438 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1440 if (dump_enabled_p ())
1441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442 "Non-constant misalignment for access: %T\n", ref);
1443 return;
1446 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1448 if (dump_enabled_p ())
1449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1450 "misalign = %d bytes of ref %T\n",
1451 const_misalignment, ref);
1453 return;
1456 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1457 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1458 is made aligned via peeling. */
1460 static bool
1461 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1462 dr_vec_info *dr_peel_info)
1464 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1465 DR_TARGET_ALIGNMENT (dr_info)))
1467 poly_offset_int diff
1468 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1469 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1470 if (known_eq (diff, 0)
1471 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1472 return true;
1474 return false;
1477 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1478 aligned via peeling. */
1480 static bool
1481 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1482 dr_vec_info *dr_peel_info)
1484 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1485 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1486 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1487 DR_OFFSET (dr_peel_info->dr), 0)
1488 || !operand_equal_p (DR_STEP (dr_info->dr),
1489 DR_STEP (dr_peel_info->dr), 0))
1490 return false;
1492 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1495 /* Compute the value for dr_info->misalign so that the access appears
1496 aligned. This is used by peeling to compensate for dr_misalignment
1497 applying the offset for negative step. */
1500 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1502 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1503 return 0;
1505 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1506 poly_int64 misalignment
1507 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1508 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1510 unsigned HOST_WIDE_INT target_alignment_c;
1511 int misalign;
1512 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1513 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1514 return DR_MISALIGNMENT_UNKNOWN;
1515 return misalign;
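
/* For instance, for a negative-step access with a V4SI vector type
   (4-byte elements) and a 16-byte target alignment this returns
   (4 - 1) * 4 == 12: the access counts as aligned when its misalignment
   cancels the offset that dr_misalignment applies for the negative
   stride.  */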
1518 /* Function vect_update_misalignment_for_peel.
1519 Sets DR_INFO's misalignment
1520 - to 0 if it has the same alignment as DR_PEEL_INFO,
 1521 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1522 - to -1 (unknown) otherwise.
1524 DR_INFO - the data reference whose misalignment is to be adjusted.
1525 DR_PEEL_INFO - the data reference whose misalignment is being made
1526 zero in the vector loop by the peel.
1527 NPEEL - the number of iterations in the peel loop if the misalignment
1528 of DR_PEEL_INFO is known at compile time. */
1530 static void
1531 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1532 dr_vec_info *dr_peel_info, int npeel)
 1534 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1535 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1537 SET_DR_MISALIGNMENT (dr_info,
1538 vect_dr_misalign_for_aligned_access (dr_peel_info));
1539 return;
1542 unsigned HOST_WIDE_INT alignment;
1543 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1544 && known_alignment_for_access_p (dr_info,
1545 STMT_VINFO_VECTYPE (dr_info->stmt))
1546 && known_alignment_for_access_p (dr_peel_info,
1547 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1549 int misal = dr_info->misalignment;
1550 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1551 misal &= alignment - 1;
1552 set_dr_misalignment (dr_info, misal);
1553 return;
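 /* Numeric example: with a 16-byte alignment, a 4-byte step, a current
    misalignment of 4 and NPEEL == 3, the new misalignment is
    (4 + 3 * 4) & 15 == 0.  */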
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1558 "to unknown (-1).\n");
1559 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1562 /* Return true if alignment is relevant for DR_INFO. */
1564 static bool
1565 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1567 stmt_vec_info stmt_info = dr_info->stmt;
1569 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1570 return false;
1572 /* For interleaving, only the alignment of the first access matters. */
1573 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1574 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1575 return false;
1577 /* Scatter-gather and invariant accesses continue to address individual
1578 scalars, so vector-level alignment is irrelevant. */
1579 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1580 || integer_zerop (DR_STEP (dr_info->dr)))
1581 return false;
1583 /* Strided accesses perform only component accesses, alignment is
1584 irrelevant for them. */
1585 if (STMT_VINFO_STRIDED_P (stmt_info)
1586 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1587 return false;
1589 return true;
 1592 /* Given a memory reference EXP, return whether its alignment is less
 1593 than its size. */
1595 static bool
1596 not_size_aligned (tree exp)
1598 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1599 return true;
1601 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1602 > get_object_alignment (exp));
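
/* E.g. a 4-byte int field of a packed struct known only to be 1-byte
   aligned has TYPE_SIZE 32 bits but object alignment 8 bits, so it is
   considered not size-aligned (i.e. packed) here.  */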
1605 /* Function vector_alignment_reachable_p
1607 Return true if vector alignment for DR_INFO is reachable by peeling
1608 a few loop iterations. Return false otherwise. */
1610 static bool
1611 vector_alignment_reachable_p (dr_vec_info *dr_info)
1613 stmt_vec_info stmt_info = dr_info->stmt;
1614 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1616 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1618 /* For interleaved access we peel only if the number of iterations in
 1619 the prolog loop ({VF - misalignment}) is a multiple of the
 1620 number of interleaved accesses. */
1621 int elem_size, mis_in_elements;
1623 /* FORNOW: handle only known alignment. */
1624 if (!known_alignment_for_access_p (dr_info, vectype))
1625 return false;
1627 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1628 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1629 elem_size = vector_element_size (vector_size, nelements);
1630 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1632 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1633 return false;
1636 /* If misalignment is known at the compile time then allow peeling
1637 only if natural alignment is reachable through peeling. */
1638 if (known_alignment_for_access_p (dr_info, vectype)
1639 && !aligned_access_p (dr_info, vectype))
1641 HOST_WIDE_INT elmsize =
1642 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1643 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_NOTE, vect_location,
1646 "data size = %wd. misalignment = %d.\n", elmsize,
1647 dr_misalignment (dr_info, vectype));
1649 if (dr_misalignment (dr_info, vectype) % elmsize)
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653 "data size does not divide the misalignment.\n");
1654 return false;
1658 if (!known_alignment_for_access_p (dr_info, vectype))
1660 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1661 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664 "Unknown misalignment, %snaturally aligned\n",
1665 is_packed ? "not " : "");
1666 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1669 return true;
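
/* For example, with a 16-byte target alignment, 4-byte elements and a
   unit-stride access, a known misalignment of 8 bytes can be fixed by
   peeling two scalar iterations, whereas a misalignment of 6 bytes is
   rejected above because the element size does not divide it.  */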
1673 /* Calculate the cost of the memory access represented by DR_INFO. */
1675 static void
1676 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1677 dr_alignment_support alignment_support_scheme,
1678 int misalignment,
1679 unsigned int *inside_cost,
1680 unsigned int *outside_cost,
1681 stmt_vector_for_cost *body_cost_vec,
1682 stmt_vector_for_cost *prologue_cost_vec)
1684 stmt_vec_info stmt_info = dr_info->stmt;
1685 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1686 int ncopies;
1688 if (PURE_SLP_STMT (stmt_info))
1689 ncopies = 1;
1690 else
1691 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1693 if (DR_IS_READ (dr_info->dr))
1694 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1695 misalignment, true, inside_cost,
1696 outside_cost, prologue_cost_vec, body_cost_vec, false);
1697 else
1698 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1699 misalignment, inside_cost, body_cost_vec);
1701 if (dump_enabled_p ())
1702 dump_printf_loc (MSG_NOTE, vect_location,
1703 "vect_get_data_access_cost: inside_cost = %d, "
1704 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1708 typedef struct _vect_peel_info
1710 dr_vec_info *dr_info;
1711 int npeel;
1712 unsigned int count;
1713 } *vect_peel_info;
1715 typedef struct _vect_peel_extended_info
1717 vec_info *vinfo;
1718 struct _vect_peel_info peel_info;
1719 unsigned int inside_cost;
1720 unsigned int outside_cost;
1721 } *vect_peel_extended_info;
1724 /* Peeling hashtable helpers. */
1726 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1728 static inline hashval_t hash (const _vect_peel_info *);
1729 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1732 inline hashval_t
1733 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1735 return (hashval_t) peel_info->npeel;
1738 inline bool
1739 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1741 return (a->npeel == b->npeel);
1745 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1747 static void
1748 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1749 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1750 int npeel, bool supportable_if_not_aligned)
1752 struct _vect_peel_info elem, *slot;
1753 _vect_peel_info **new_slot;
1755 elem.npeel = npeel;
1756 slot = peeling_htab->find (&elem);
1757 if (slot)
1758 slot->count++;
1759 else
1761 slot = XNEW (struct _vect_peel_info);
1762 slot->npeel = npeel;
1763 slot->dr_info = dr_info;
1764 slot->count = 1;
1765 new_slot = peeling_htab->find_slot (slot, INSERT);
1766 *new_slot = slot;
1769 /* If this DR is not supported with unknown misalignment then bias
1770 this slot when the cost model is disabled. */
1771 if (!supportable_if_not_aligned
1772 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1773 slot->count += VECT_MAX_COST;
1777 /* Traverse peeling hash table to find peeling option that aligns maximum
1778 number of data accesses. */
1781 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1782 _vect_peel_extended_info *max)
1784 vect_peel_info elem = *slot;
1786 if (elem->count > max->peel_info.count
1787 || (elem->count == max->peel_info.count
1788 && max->peel_info.npeel > elem->npeel))
1790 max->peel_info.npeel = elem->npeel;
1791 max->peel_info.count = elem->count;
1792 max->peel_info.dr_info = elem->dr_info;
1795 return 1;
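/* For example (the numbers are illustrative): if one data reference
   becomes aligned by peeling 1 or 5 scalar iterations and another by
   peeling 1 or 3, the hash table holds the counts {1: 2, 3: 1, 5: 1}
   and the traversal above selects npeel = 1, the option that aligns
   the most accesses; ties are broken in favour of the smaller npeel.  */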
1798 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1799 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1800 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1801 after peeling. */
1803 static void
1804 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1805 dr_vec_info *dr0_info,
1806 unsigned int *inside_cost,
1807 unsigned int *outside_cost,
1808 stmt_vector_for_cost *body_cost_vec,
1809 stmt_vector_for_cost *prologue_cost_vec,
1810 unsigned int npeel)
1812 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1814 bool dr0_alignment_known_p
1815 = (dr0_info
1816 && known_alignment_for_access_p (dr0_info,
1817 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1819 for (data_reference *dr : datarefs)
1821 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1822 if (!vect_relevant_for_alignment_p (dr_info))
1823 continue;
1825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1826 dr_alignment_support alignment_support_scheme;
1827 int misalignment;
1828 unsigned HOST_WIDE_INT alignment;
1830 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1831 size_zero_node) < 0;
1832 poly_int64 off = 0;
1833 if (negative)
1834 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1835 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1837 if (npeel == 0)
1838 misalignment = dr_misalignment (dr_info, vectype, off);
1839 else if (dr_info == dr0_info
1840 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1841 misalignment = 0;
1842 else if (!dr0_alignment_known_p
1843 || !known_alignment_for_access_p (dr_info, vectype)
1844 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1845 misalignment = DR_MISALIGNMENT_UNKNOWN;
1846 else
1848 misalignment = dr_misalignment (dr_info, vectype, off);
1849 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1850 misalignment &= alignment - 1;
1852 alignment_support_scheme
1853 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1854 misalignment);
1856 vect_get_data_access_cost (loop_vinfo, dr_info,
1857 alignment_support_scheme, misalignment,
1858 inside_cost, outside_cost,
1859 body_cost_vec, prologue_cost_vec);
1863 /* Traverse peeling hash table and calculate cost for each peeling option.
1864 Find the one with the lowest cost. */
1867 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1868 _vect_peel_extended_info *min)
1870 vect_peel_info elem = *slot;
1871 int dummy;
1872 unsigned int inside_cost = 0, outside_cost = 0;
1873 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1874 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1875 epilogue_cost_vec;
1877 prologue_cost_vec.create (2);
1878 body_cost_vec.create (2);
1879 epilogue_cost_vec.create (2);
1881 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1882 &outside_cost, &body_cost_vec,
1883 &prologue_cost_vec, elem->npeel);
1885 body_cost_vec.release ();
1887 outside_cost += vect_get_known_peeling_cost
1888 (loop_vinfo, elem->npeel, &dummy,
1889 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1890 &prologue_cost_vec, &epilogue_cost_vec);
1892 /* Prologue and epilogue costs are added to the target model later.
1893 These costs depend only on the scalar iteration cost, the
1894 number of peeling iterations finally chosen, and the number of
1895 misaligned statements. So discard the information found here. */
1896 prologue_cost_vec.release ();
1897 epilogue_cost_vec.release ();
1899 if (inside_cost < min->inside_cost
1900 || (inside_cost == min->inside_cost
1901 && outside_cost < min->outside_cost))
1903 min->inside_cost = inside_cost;
1904 min->outside_cost = outside_cost;
1905 min->peel_info.dr_info = elem->dr_info;
1906 min->peel_info.npeel = elem->npeel;
1907 min->peel_info.count = elem->count;
1910 return 1;
1914 /* Choose best peeling option by traversing peeling hash table and either
1915 choosing an option with the lowest cost (if cost model is enabled) or the
1916 option that aligns as many accesses as possible. */
1918 static struct _vect_peel_extended_info
1919 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1920 loop_vec_info loop_vinfo)
1922 struct _vect_peel_extended_info res;
1924 res.peel_info.dr_info = NULL;
1925 res.vinfo = loop_vinfo;
1927 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1929 res.inside_cost = INT_MAX;
1930 res.outside_cost = INT_MAX;
1931 peeling_htab->traverse <_vect_peel_extended_info *,
1932 vect_peeling_hash_get_lowest_cost> (&res);
1934 else
1936 res.peel_info.count = 0;
1937 peeling_htab->traverse <_vect_peel_extended_info *,
1938 vect_peeling_hash_get_most_frequent> (&res);
1939 res.inside_cost = 0;
1940 res.outside_cost = 0;
1943 return res;
1946 /* Return true if the new peeling NPEEL is supported. */
1948 static bool
1949 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1950 unsigned npeel)
1952 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1953 enum dr_alignment_support supportable_dr_alignment;
1955 bool dr0_alignment_known_p
1956 = known_alignment_for_access_p (dr0_info,
1957 STMT_VINFO_VECTYPE (dr0_info->stmt));
1959 /* Ensure that all data refs can be vectorized after the peel. */
1960 for (data_reference *dr : datarefs)
1962 if (dr == dr0_info->dr)
1963 continue;
1965 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1966 if (!vect_relevant_for_alignment_p (dr_info)
1967 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1968 continue;
1970 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1971 int misalignment;
1972 unsigned HOST_WIDE_INT alignment;
1973 if (!dr0_alignment_known_p
1974 || !known_alignment_for_access_p (dr_info, vectype)
1975 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1976 misalignment = DR_MISALIGNMENT_UNKNOWN;
1977 else
1979 misalignment = dr_misalignment (dr_info, vectype);
1980 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1981 misalignment &= alignment - 1;
1983 supportable_dr_alignment
1984 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1985 misalignment);
1986 if (supportable_dr_alignment == dr_unaligned_unsupported)
1987 return false;
1990 return true;
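/* The update of a known misalignment after peeling, used above and in
   vect_get_peeling_costs_all_drs, boils down to the arithmetic below
   (an illustrative sketch only; the names are not part of the
   vectorizer, and TARGET_ALIGN is assumed to be a power of two).  For
   example a DR with byte misalignment 4 and byte step 4 under a
   16-byte target alignment becomes (4 + 3 * 4) & 15 == 0 after
   peeling 3 iterations.  */

static inline int
misalignment_after_peel_sketch (int misalignment, unsigned int npeel,
                                int step, unsigned int target_align)
{
  return (misalignment + (int) npeel * step) & (int) (target_align - 1);
}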
1993 /* Compare two data-references DRA and DRB to group them into chunks
1994 with related alignment. */
1996 static int
1997 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1999 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2000 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2001 int cmp;
2003 /* Stabilize sort. */
2004 if (dra == drb)
2005 return 0;
2007 /* Ordering of DRs according to base. */
2008 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2009 DR_BASE_ADDRESS (drb));
2010 if (cmp != 0)
2011 return cmp;
2013 /* And according to DR_OFFSET. */
2014 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2015 if (cmp != 0)
2016 return cmp;
2018 /* And after step. */
2019 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2020 if (cmp != 0)
2021 return cmp;
2023 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2024 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2025 if (cmp == 0)
2026 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2027 return cmp;
2030 /* Function vect_enhance_data_refs_alignment
2032 This pass will use loop versioning and loop peeling in order to enhance
2033 the alignment of data references in the loop.
2035 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2036 original loop is to be vectorized. Any other loops that are created by
2037 the transformations performed in this pass are not supposed to be
2038 vectorized. This restriction will be relaxed.
2040 This pass will require a cost model to guide it whether to apply peeling
2041 or versioning or a combination of the two. For example, the scheme that
2042 Intel uses when given a loop with several memory accesses is as follows:
2043 choose one memory access ('p') whose alignment you want to force by doing
2044 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2045 other accesses are not necessarily aligned, or (2) use loop versioning to
2046 generate one loop in which all accesses are aligned, and another loop in
2047 which only 'p' is necessarily aligned.
2049 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2050 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2051 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2053 Devising a cost model is the most critical aspect of this work. It will
2054 guide us on which access to peel for, whether to use loop versioning, how
2055 many versions to create, etc. The cost model will probably consist of
2056 generic considerations as well as target specific considerations (on
2057 powerpc for example, misaligned stores are more painful than misaligned
2058 loads).
2060 Here are the general steps involved in alignment enhancements:
2062 -- original loop, before alignment analysis:
2063 for (i=0; i<N; i++){
2064 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2065 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2068 -- After vect_compute_data_refs_alignment:
2069 for (i=0; i<N; i++){
2070 x = q[i]; # DR_MISALIGNMENT(q) = 3
2071 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2074 -- Possibility 1: we do loop versioning:
2075 if (p is aligned) {
2076 for (i=0; i<N; i++){ # loop 1A
2077 x = q[i]; # DR_MISALIGNMENT(q) = 3
2078 p[i] = y; # DR_MISALIGNMENT(p) = 0
2081 else {
2082 for (i=0; i<N; i++){ # loop 1B
2083 x = q[i]; # DR_MISALIGNMENT(q) = 3
2084 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2088 -- Possibility 2: we do loop peeling:
2089 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2090 x = q[i];
2091 p[i] = y;
2093 for (i = 3; i < N; i++){ # loop 2A
2094 x = q[i]; # DR_MISALIGNMENT(q) = 0
2095 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2098 -- Possibility 3: combination of loop peeling and versioning:
2099 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2100 x = q[i];
2101 p[i] = y;
2103 if (p is aligned) {
2104 for (i = 3; i<N; i++){ # loop 3A
2105 x = q[i]; # DR_MISALIGNMENT(q) = 0
2106 p[i] = y; # DR_MISALIGNMENT(p) = 0
2109 else {
2110 for (i = 3; i<N; i++){ # loop 3B
2111 x = q[i]; # DR_MISALIGNMENT(q) = 0
2112 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2116 These loops are later passed to loop_transform to be vectorized. The
2117 vectorizer will use the alignment information to guide the transformation
2118 (whether to generate regular loads/stores, or with special handling for
2119 misalignment). */
2121 opt_result
2122 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2124 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2125 dr_vec_info *first_store = NULL;
2126 dr_vec_info *dr0_info = NULL;
2127 struct data_reference *dr;
2128 unsigned int i;
2129 bool do_peeling = false;
2130 bool do_versioning = false;
2131 unsigned int npeel = 0;
2132 bool one_misalignment_known = false;
2133 bool one_misalignment_unknown = false;
2134 bool one_dr_unsupportable = false;
2135 dr_vec_info *unsupportable_dr_info = NULL;
2136 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2137 hash_table<peel_info_hasher> peeling_htab (1);
2139 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2141 /* Reset data so we can safely be called multiple times. */
2142 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2143 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2145 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2146 return opt_result::success ();
2148 /* Sort the vector of datarefs so DRs that have the same or dependent
2149 alignment are next to each other. */
2150 auto_vec<data_reference_p> datarefs
2151 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2152 datarefs.qsort (dr_align_group_sort_cmp);
2154 /* Compute the number of DRs that become aligned when we peel
2155 a dataref so it becomes aligned. */
2156 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2157 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2158 unsigned i0;
2159 for (i0 = 0; i0 < datarefs.length (); ++i0)
2160 if (DR_BASE_ADDRESS (datarefs[i0]))
2161 break;
2162 for (i = i0 + 1; i <= datarefs.length (); ++i)
2164 if (i == datarefs.length ()
2165 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2166 DR_BASE_ADDRESS (datarefs[i]), 0)
2167 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2168 DR_OFFSET (datarefs[i]), 0)
2169 || !operand_equal_p (DR_STEP (datarefs[i0]),
2170 DR_STEP (datarefs[i]), 0))
2172 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2173 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2174 will get known misalignment if we align one of the refs
2175 with the largest DR_TARGET_ALIGNMENT. */
2176 for (unsigned j = i0; j < i; ++j)
2178 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2179 for (unsigned k = i0; k < i; ++k)
2181 if (k == j)
2182 continue;
2183 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2184 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2185 dr_infoj))
2186 n_same_align_refs[j]++;
2189 i0 = i;
2193 /* While cost model enhancements are expected in the future, the high level
2194 view of the code at this time is as follows:
2196 A) If there is a misaligned access then see if peeling to align
2197 this access can make all data references satisfy
2198 vect_supportable_dr_alignment. If so, update data structures
2199 as needed and return true.
2201 B) If peeling wasn't possible and there is a data reference with an
2202 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2203 then see if loop versioning checks can be used to make all data
2204 references satisfy vect_supportable_dr_alignment. If so, update
2205 data structures as needed and return true.
2207 C) If neither peeling nor versioning were successful then return false if
2208 any data reference does not satisfy vect_supportable_dr_alignment.
2210 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2212 Note, Possibility 3 above (which is peeling and versioning together) is not
2213 being done at this time. */
2215 /* (1) Peeling to force alignment. */
2217 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2218 Considerations:
2219 + How many accesses will become aligned due to the peeling
2220 - How many accesses will become unaligned due to the peeling,
2221 and the cost of misaligned accesses.
2222 - The cost of peeling (the extra runtime checks, the increase
2223 in code size). */
2225 FOR_EACH_VEC_ELT (datarefs, i, dr)
2227 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2228 if (!vect_relevant_for_alignment_p (dr_info))
2229 continue;
2231 stmt_vec_info stmt_info = dr_info->stmt;
2232 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2233 do_peeling = vector_alignment_reachable_p (dr_info);
2234 if (do_peeling)
2236 if (known_alignment_for_access_p (dr_info, vectype))
2238 unsigned int npeel_tmp = 0;
2239 bool negative = tree_int_cst_compare (DR_STEP (dr),
2240 size_zero_node) < 0;
2242 /* If known_alignment_for_access_p then we have set
2243 DR_MISALIGNMENT, which is only done if we know it at compile
2244 time, so it is safe to assume the target alignment is constant. */
2246 unsigned int target_align =
2247 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2248 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2249 poly_int64 off = 0;
2250 if (negative)
2251 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2252 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2253 mis = negative ? mis : -mis;
2254 if (mis != 0)
2255 npeel_tmp = (mis & (target_align - 1)) / dr_size;
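/* A concrete instance (illustrative numbers): with target_align 16,
   dr_size 4 and a forward step, a byte misalignment of 4 gives
   mis = -4 above, so npeel_tmp = ((-4) & 15) / 4 = 3; peeling three
   scalar iterations advances the access by 12 bytes onto a 16-byte
   boundary.  */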
2257 /* For multiple types, it is possible that the bigger type access
2258 will have more than one peeling option. E.g., a loop with two
2259 types: one of size (vector size / 4), and the other one of
2260 size (vector size / 8). The vectorization factor will be 8. If both
2261 accesses are misaligned by 3, the first one needs one scalar
2262 iteration to be aligned, and the second one needs 5. But the
2263 first one will be aligned also by peeling 5 scalar
2264 iterations, and in that case both accesses will be aligned.
2265 Hence, except for the immediate peeling amount, we also want
2266 to try to add full vector size, while we don't exceed
2267 vectorization factor.
2268 We do this automatically for cost model, since we calculate
2269 cost for every peeling option. */
2270 poly_uint64 nscalars = npeel_tmp;
2271 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2273 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2274 nscalars = (STMT_SLP_TYPE (stmt_info)
2275 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2278 /* Save info about DR in the hash table. Also include peeling
2279 amounts according to the explanation above. Indicate
2280 the alignment status when the ref is not aligned.
2281 ??? Rather than using unknown alignment here we should
2282 prune all entries from the peeling hashtable which cause
2283 DRs to be not supported. */
2284 bool supportable_if_not_aligned
2285 = vect_supportable_dr_alignment
2286 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2287 while (known_le (npeel_tmp, nscalars))
2289 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2290 dr_info, npeel_tmp,
2291 supportable_if_not_aligned);
2292 npeel_tmp += MAX (1, target_align / dr_size);
2295 one_misalignment_known = true;
2297 else
2299 /* If we don't know any misalignment values, we prefer
2300 peeling for the data-ref that has the maximum number of data-refs
2301 with the same alignment, unless the target prefers to align
2302 stores over loads. */
2303 unsigned same_align_drs = n_same_align_refs[i];
2304 if (!dr0_info
2305 || dr0_same_align_drs < same_align_drs)
2307 dr0_same_align_drs = same_align_drs;
2308 dr0_info = dr_info;
2310 /* For data-refs with the same number of related
2311 accesses prefer the one where the misalign
2312 computation will be invariant in the outermost loop. */
2313 else if (dr0_same_align_drs == same_align_drs)
2315 class loop *ivloop0, *ivloop;
2316 ivloop0 = outermost_invariant_loop_for_expr
2317 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2318 ivloop = outermost_invariant_loop_for_expr
2319 (loop, DR_BASE_ADDRESS (dr));
2320 if ((ivloop && !ivloop0)
2321 || (ivloop && ivloop0
2322 && flow_loop_nested_p (ivloop, ivloop0)))
2323 dr0_info = dr_info;
2326 one_misalignment_unknown = true;
2328 /* Check for data refs with unsupportable alignment that
2329 can be peeled. */
2330 enum dr_alignment_support supportable_dr_alignment
2331 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2332 DR_MISALIGNMENT_UNKNOWN);
2333 if (supportable_dr_alignment == dr_unaligned_unsupported)
2335 one_dr_unsupportable = true;
2336 unsupportable_dr_info = dr_info;
2339 if (!first_store && DR_IS_WRITE (dr))
2341 first_store = dr_info;
2342 first_store_same_align_drs = same_align_drs;
2346 else
2348 if (!aligned_access_p (dr_info, vectype))
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "vector alignment may not be reachable\n");
2353 break;
2358 /* Check if we can possibly peel the loop. */
2359 if (!vect_can_advance_ivs_p (loop_vinfo)
2360 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2361 loop_preheader_edge (loop))
2362 || loop->inner
2363 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2364 do_peeling = false;
2366 struct _vect_peel_extended_info peel_for_known_alignment;
2367 struct _vect_peel_extended_info peel_for_unknown_alignment;
2368 struct _vect_peel_extended_info best_peel;
2370 peel_for_unknown_alignment.inside_cost = INT_MAX;
2371 peel_for_unknown_alignment.outside_cost = INT_MAX;
2372 peel_for_unknown_alignment.peel_info.count = 0;
2374 if (do_peeling
2375 && one_misalignment_unknown)
2377 /* Check if the target requires us to prefer stores over loads, i.e., if
2378 misaligned stores are more expensive than misaligned loads (taking
2379 DRs with the same alignment into account). */
2380 unsigned int load_inside_cost = 0;
2381 unsigned int load_outside_cost = 0;
2382 unsigned int store_inside_cost = 0;
2383 unsigned int store_outside_cost = 0;
2384 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2386 stmt_vector_for_cost dummy;
2387 dummy.create (2);
2388 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2389 &load_inside_cost,
2390 &load_outside_cost,
2391 &dummy, &dummy, estimated_npeels);
2392 dummy.release ();
2394 if (first_store)
2396 dummy.create (2);
2397 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2398 &store_inside_cost,
2399 &store_outside_cost,
2400 &dummy, &dummy,
2401 estimated_npeels);
2402 dummy.release ();
2404 else
2406 store_inside_cost = INT_MAX;
2407 store_outside_cost = INT_MAX;
2410 if (load_inside_cost > store_inside_cost
2411 || (load_inside_cost == store_inside_cost
2412 && load_outside_cost > store_outside_cost))
2414 dr0_info = first_store;
2415 dr0_same_align_drs = first_store_same_align_drs;
2416 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2417 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2419 else
2421 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2422 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2425 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2426 prologue_cost_vec.create (2);
2427 epilogue_cost_vec.create (2);
2429 int dummy2;
2430 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2431 (loop_vinfo, estimated_npeels, &dummy2,
2432 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2433 &prologue_cost_vec, &epilogue_cost_vec);
2435 prologue_cost_vec.release ();
2436 epilogue_cost_vec.release ();
2438 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2441 peel_for_unknown_alignment.peel_info.npeel = 0;
2442 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2444 best_peel = peel_for_unknown_alignment;
2446 peel_for_known_alignment.inside_cost = INT_MAX;
2447 peel_for_known_alignment.outside_cost = INT_MAX;
2448 peel_for_known_alignment.peel_info.count = 0;
2449 peel_for_known_alignment.peel_info.dr_info = NULL;
2451 if (do_peeling && one_misalignment_known)
2453 /* Peeling is possible, but there is no data access that is not supported
2454 unless aligned. So we try to choose the best possible peeling from
2455 the hash table. */
2456 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2457 (&peeling_htab, loop_vinfo);
2460 /* Compare costs of peeling for known and unknown alignment. */
2461 if (peel_for_known_alignment.peel_info.dr_info != NULL
2462 && peel_for_unknown_alignment.inside_cost
2463 >= peel_for_known_alignment.inside_cost)
2465 best_peel = peel_for_known_alignment;
2467 /* If the best peeling for known alignment has NPEEL == 0, perform no
2468 peeling at all except if there is an unsupportable dr that we can
2469 align. */
2470 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2471 do_peeling = false;
2474 /* If there is an unsupportable data ref, prefer this over all choices so far
2475 since we'd have to discard a chosen peeling except when it accidentally
2476 aligned the unsupportable data ref. */
2477 if (one_dr_unsupportable)
2478 dr0_info = unsupportable_dr_info;
2479 else if (do_peeling)
2481 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2482 TODO: Use nopeel_outside_cost or get rid of it? */
2483 unsigned nopeel_inside_cost = 0;
2484 unsigned nopeel_outside_cost = 0;
2486 stmt_vector_for_cost dummy;
2487 dummy.create (2);
2488 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2489 &nopeel_outside_cost, &dummy, &dummy, 0);
2490 dummy.release ();
2492 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2493 costs will be recorded. */
2494 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2495 prologue_cost_vec.create (2);
2496 epilogue_cost_vec.create (2);
2498 int dummy2;
2499 nopeel_outside_cost += vect_get_known_peeling_cost
2500 (loop_vinfo, 0, &dummy2,
2501 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2502 &prologue_cost_vec, &epilogue_cost_vec);
2504 prologue_cost_vec.release ();
2505 epilogue_cost_vec.release ();
2507 npeel = best_peel.peel_info.npeel;
2508 dr0_info = best_peel.peel_info.dr_info;
2510 /* If not peeling at all is no more expensive than the best peeling we
2511 have found so far, don't perform any peeling. */
2512 if (nopeel_inside_cost <= best_peel.inside_cost)
2513 do_peeling = false;
2516 if (do_peeling)
2518 stmt_vec_info stmt_info = dr0_info->stmt;
2519 if (known_alignment_for_access_p (dr0_info,
2520 STMT_VINFO_VECTYPE (stmt_info)))
2522 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2523 size_zero_node) < 0;
2524 if (!npeel)
2526 /* Since it's known at compile time, compute the number of
2527 iterations in the peeled loop (the peeling factor) for use in
2528 updating DR_MISALIGNMENT values. The peeling factor is the
2529 vectorization factor minus the misalignment as an element
2530 count. */
2531 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2532 poly_int64 off = 0;
2533 if (negative)
2534 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2535 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2536 unsigned int mis
2537 = dr_misalignment (dr0_info, vectype, off);
2538 mis = negative ? mis : -mis;
2539 /* If known_alignment_for_access_p then we have set
2540 DR_MISALIGNMENT, which is only done if we know it at compile
2541 time, so it is safe to assume the target alignment is constant. */
2543 unsigned int target_align =
2544 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2545 npeel = ((mis & (target_align - 1))
2546 / vect_get_scalar_dr_size (dr0_info));
2549 /* For interleaved data access every iteration accesses all the
2550 members of the group, therefore we divide the number of iterations
2551 by the group size. */
2552 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2553 npeel /= DR_GROUP_SIZE (stmt_info);
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_NOTE, vect_location,
2557 "Try peeling by %d\n", npeel);
2560 /* Ensure that all datarefs can be vectorized after the peel. */
2561 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2562 do_peeling = false;
2564 /* Check if all datarefs are supportable and log. */
2565 if (do_peeling
2566 && npeel == 0
2567 && known_alignment_for_access_p (dr0_info,
2568 STMT_VINFO_VECTYPE (stmt_info)))
2569 return opt_result::success ();
2571 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2572 if (do_peeling)
2574 unsigned max_allowed_peel
2575 = param_vect_max_peeling_for_alignment;
2576 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2577 max_allowed_peel = 0;
2578 if (max_allowed_peel != (unsigned)-1)
2580 unsigned max_peel = npeel;
2581 if (max_peel == 0)
2583 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2584 unsigned HOST_WIDE_INT target_align_c;
2585 if (target_align.is_constant (&target_align_c))
2586 max_peel =
2587 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2588 else
2590 do_peeling = false;
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "Disable peeling, max peels set and vector"
2594 " alignment unknown\n");
2597 if (max_peel > max_allowed_peel)
2599 do_peeling = false;
2600 if (dump_enabled_p ())
2601 dump_printf_loc (MSG_NOTE, vect_location,
2602 "Disable peeling, max peels reached: %d\n", max_peel);
2607 /* Cost model #2 - if peeling may result in a remaining loop not
2608 iterating enough to be vectorized then do not peel. Since this
2609 is a cost heuristic rather than a correctness decision, use the
2610 most likely runtime value for variable vectorization factors. */
2611 if (do_peeling
2612 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2614 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2615 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2616 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2617 < assumed_vf + max_peel)
2618 do_peeling = false;
2621 if (do_peeling)
2623 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2624 If the misalignment of DR_i is identical to that of dr0 then set
2625 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2626 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2627 by the peeling factor times the element size of DR_i (MOD the
2628 vectorization factor times the size). Otherwise, the
2629 misalignment of DR_i must be set to unknown. */
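/* Worked example (illustrative numbers): if dr0 has byte misalignment
   12 with element size 4 and a 16-byte target alignment, npeel is 1;
   a DR_i with the same step and known misalignment 8 then becomes
   8 + 1 * 4 = 12 (reduced modulo 16), while a DR_i whose misalignment
   was unknown simply stays unknown.  */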
2630 FOR_EACH_VEC_ELT (datarefs, i, dr)
2631 if (dr != dr0_info->dr)
2633 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2634 if (!vect_relevant_for_alignment_p (dr_info))
2635 continue;
2637 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2640 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2641 if (npeel)
2642 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2643 else
2644 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2645 SET_DR_MISALIGNMENT (dr0_info,
2646 vect_dr_misalign_for_aligned_access (dr0_info));
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Alignment of access forced using peeling.\n");
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "Peeling for alignment will be applied.\n");
2655 /* The inside-loop cost will be accounted for in vectorizable_load
2656 and vectorizable_store correctly with adjusted alignments.
2657 Drop the body_cst_vec on the floor here. */
2658 return opt_result::success ();
2662 /* (2) Versioning to force alignment. */
2664 /* Try versioning if:
2665 1) optimize loop for speed and the cost-model is not cheap
2666 2) there is at least one unsupported misaligned data ref with an unknown
2667 misalignment, and
2668 3) all misaligned data refs with a known misalignment are supported, and
2669 4) the number of runtime alignment checks is within reason. */
2671 do_versioning
2672 = (optimize_loop_nest_for_speed_p (loop)
2673 && !loop->inner /* FORNOW */
2674 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2676 if (do_versioning)
2678 FOR_EACH_VEC_ELT (datarefs, i, dr)
2680 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2681 if (!vect_relevant_for_alignment_p (dr_info))
2682 continue;
2684 stmt_vec_info stmt_info = dr_info->stmt;
2685 if (STMT_VINFO_STRIDED_P (stmt_info))
2687 do_versioning = false;
2688 break;
2691 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2692 bool negative = tree_int_cst_compare (DR_STEP (dr),
2693 size_zero_node) < 0;
2694 poly_int64 off = 0;
2695 if (negative)
2696 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2697 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2698 int misalignment;
2699 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2700 continue;
2702 enum dr_alignment_support supportable_dr_alignment
2703 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2704 misalignment);
2705 if (supportable_dr_alignment == dr_unaligned_unsupported)
2707 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2708 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2709 >= (unsigned) param_vect_max_version_for_alignment_checks))
2711 do_versioning = false;
2712 break;
2715 /* At present we don't support versioning for alignment
2716 with variable VF, since there's no guarantee that the
2717 VF is a power of two. We could relax this if we added
2718 a way of enforcing a power-of-two size. */
2719 unsigned HOST_WIDE_INT size;
2720 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2722 do_versioning = false;
2723 break;
2726 /* Forcing alignment in the first iteration is no good if
2727 we don't keep it across iterations. For now, just disable
2728 versioning in this case.
2729 ?? We could actually unroll the loop to achieve the required
2730 overall step alignment, and forcing the alignment could be
2731 done by doing some iterations of the non-vectorized loop. */
2732 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2733 * DR_STEP_ALIGNMENT (dr),
2734 DR_TARGET_ALIGNMENT (dr_info)))
2736 do_versioning = false;
2737 break;
2740 /* The rightmost bits of an aligned address must be zeros.
2741 Construct the mask needed for this test. For example,
2742 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2743 mask must be 15 = 0xf. */
2744 int mask = size - 1;
2746 /* FORNOW: use the same mask to test all potentially unaligned
2747 references in the loop. */
2748 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2749 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2751 do_versioning = false;
2752 break;
2755 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2756 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
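/* The statements recorded here later give rise to a runtime test of
   roughly the following shape (see vect_create_cond_for_align_checks),
   shown for two potentially misaligned pointers p and q and the V4SI
   mask 0xf from the example above:

     if ((((uintptr_t) p | (uintptr_t) q) & 0xf) == 0)
       ... vectorized loop with all accesses assumed aligned ...
     else
       ... unvectorized fallback loop ...  */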
2760 /* Versioning requires at least one misaligned data reference. */
2761 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2762 do_versioning = false;
2763 else if (!do_versioning)
2764 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2767 if (do_versioning)
2769 const vec<stmt_vec_info> &may_misalign_stmts
2770 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2771 stmt_vec_info stmt_info;
2773 /* It can now be assumed that the data references in the statements
2774 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2775 of the loop being vectorized. */
2776 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2778 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2779 SET_DR_MISALIGNMENT (dr_info,
2780 vect_dr_misalign_for_aligned_access (dr_info));
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_NOTE, vect_location,
2783 "Alignment of access forced using versioning.\n");
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_NOTE, vect_location,
2788 "Versioning for alignment will be applied.\n");
2790 /* Peeling and versioning can't be done together at this time. */
2791 gcc_assert (! (do_peeling && do_versioning));
2793 return opt_result::success ();
2796 /* This point is reached if neither peeling nor versioning is being done. */
2797 gcc_assert (! (do_peeling || do_versioning));
2799 return opt_result::success ();
2803 /* Function vect_analyze_data_refs_alignment
2805 Analyze the alignment of the data-references in the loop.
2806 Return FALSE if a data reference is found that cannot be vectorized. */
2808 opt_result
2809 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2811 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2813 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2814 struct data_reference *dr;
2815 unsigned int i;
2817 vect_record_base_alignments (loop_vinfo);
2818 FOR_EACH_VEC_ELT (datarefs, i, dr)
2820 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2821 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2823 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2824 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2825 continue;
2826 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2827 STMT_VINFO_VECTYPE (dr_info->stmt));
2831 return opt_result::success ();
2835 /* Analyze alignment of DRs of stmts in NODE. */
2837 static bool
2838 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2840 /* Alignment is maintained in the first element of the group. */
2841 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2842 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2843 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2844 tree vectype = SLP_TREE_VECTYPE (node);
2845 poly_uint64 vector_alignment
2846 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2847 BITS_PER_UNIT);
2848 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2849 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2850 /* Re-analyze alignment when we're facing a vectorization with a bigger
2851 alignment requirement. */
2852 else if (known_lt (dr_info->target_alignment, vector_alignment))
2854 poly_uint64 old_target_alignment = dr_info->target_alignment;
2855 int old_misalignment = dr_info->misalignment;
2856 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2857 /* But keep knowledge about a smaller alignment. */
2858 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2859 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2861 dr_info->target_alignment = old_target_alignment;
2862 dr_info->misalignment = old_misalignment;
2865 /* If we ever face unordered target alignments, the first one wins in terms
2866 of analysis and the other will become unknown in dr_misalignment. */
2867 return true;
2870 /* Function vect_slp_analyze_instance_alignment
2872 Analyze the alignment of the data-references in the SLP instance.
2873 Return FALSE if a data reference is found that cannot be vectorized. */
2875 bool
2876 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2877 slp_instance instance)
2879 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2881 slp_tree node;
2882 unsigned i;
2883 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2884 if (! vect_slp_analyze_node_alignment (vinfo, node))
2885 return false;
2887 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2888 && ! vect_slp_analyze_node_alignment
2889 (vinfo, SLP_INSTANCE_TREE (instance)))
2890 return false;
2892 return true;
2896 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2897 accesses of legal size, step, etc. Detect gaps, single element
2898 interleaving, and other special cases. Set grouped access info.
2899 Collect groups of strided stores for further use in SLP analysis.
2900 Worker for vect_analyze_group_access. */
2902 static bool
2903 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2905 data_reference *dr = dr_info->dr;
2906 tree step = DR_STEP (dr);
2907 tree scalar_type = TREE_TYPE (DR_REF (dr));
2908 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2909 stmt_vec_info stmt_info = dr_info->stmt;
2910 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2911 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2912 HOST_WIDE_INT dr_step = -1;
2913 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2914 bool slp_impossible = false;
2916 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2917 size of the interleaving group (including gaps). */
2918 if (tree_fits_shwi_p (step))
2920 dr_step = tree_to_shwi (step);
2921 /* Check that STEP is a multiple of type size. Otherwise there is
2922 a non-element-sized gap at the end of the group which we
2923 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2924 ??? As we can handle non-constant step fine here we should
2925 simply remove uses of DR_GROUP_GAP between the last and first
2926 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2927 simply not include that gap. */
2928 if ((dr_step % type_size) != 0)
2930 if (dump_enabled_p ())
2931 dump_printf_loc (MSG_NOTE, vect_location,
2932 "Step %T is not a multiple of the element size"
2933 " for %T\n",
2934 step, DR_REF (dr));
2935 return false;
2937 groupsize = absu_hwi (dr_step) / type_size;
2939 else
2940 groupsize = 0;
2942 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2943 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2945 /* Check if this DR is a part of interleaving, and is a single
2946 element of the group that is accessed in the loop. */
2948 /* Gaps are supported only for loads. STEP must be a multiple of the type
2949 size. */
2950 if (DR_IS_READ (dr)
2951 && (dr_step % type_size) == 0
2952 && groupsize > 0
2953 /* This could be UINT_MAX but as we are generating code in a very
2954 inefficient way we have to cap earlier.
2955 See PR91403 for example. */
2956 && groupsize <= 4096)
2958 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2959 DR_GROUP_SIZE (stmt_info) = groupsize;
2960 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2961 if (dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "Detected single element interleaving %T"
2964 " step %T\n",
2965 DR_REF (dr), step);
2967 return true;
2970 if (dump_enabled_p ())
2971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972 "not consecutive access %G", stmt_info->stmt);
2974 if (bb_vinfo)
2976 /* Mark the statement as unvectorizable. */
2977 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2978 return true;
2981 if (dump_enabled_p ())
2982 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2983 STMT_VINFO_STRIDED_P (stmt_info) = true;
2984 return true;
2987 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2989 /* First stmt in the interleaving chain. Check the chain. */
2990 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2991 struct data_reference *data_ref = dr;
2992 unsigned int count = 1;
2993 tree prev_init = DR_INIT (data_ref);
2994 HOST_WIDE_INT diff, gaps = 0;
2996 /* By construction, all group members have INTEGER_CST DR_INITs. */
2997 while (next)
2999 /* We never have the same DR multiple times. */
3000 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3001 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3003 data_ref = STMT_VINFO_DATA_REF (next);
3005 /* All group members have the same STEP by construction. */
3006 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3008 /* Check that the distance between two accesses is equal to the type
3009 size. Otherwise, we have gaps. */
3010 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3011 - TREE_INT_CST_LOW (prev_init)) / type_size;
3012 if (diff < 1 || diff > UINT_MAX)
3014 /* For artificial testcases with array accesses with large
3015 constant indices we can run into overflow issues which
3016 can end up fooling the groupsize constraint below so
3017 check the individual gaps (which are represented as
3018 unsigned int) as well. */
3019 if (dump_enabled_p ())
3020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3021 "interleaved access with gap larger "
3022 "than representable\n");
3023 return false;
3025 if (diff != 1)
3027 /* FORNOW: SLP of accesses with gaps is not supported. */
3028 slp_impossible = true;
3029 if (DR_IS_WRITE (data_ref))
3031 if (dump_enabled_p ())
3032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3033 "interleaved store with gaps\n");
3034 return false;
3037 gaps += diff - 1;
3040 last_accessed_element += diff;
3042 /* Store the gap from the previous member of the group. If there is no
3043 gap in the access, DR_GROUP_GAP is always 1. */
3044 DR_GROUP_GAP (next) = diff;
3046 prev_init = DR_INIT (data_ref);
3047 next = DR_GROUP_NEXT_ELEMENT (next);
3048 /* Count the number of data-refs in the chain. */
3049 count++;
3052 if (groupsize == 0)
3053 groupsize = count + gaps;
3055 /* This could be UINT_MAX but as we are generating code in a very
3056 inefficient way we have to cap earlier. See PR78699 for example. */
3057 if (groupsize > 4096)
3059 if (dump_enabled_p ())
3060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3061 "group is too large\n");
3062 return false;
3065 /* Check that the size of the interleaving is equal to count for stores,
3066 i.e., that there are no gaps. */
3067 if (groupsize != count
3068 && !DR_IS_READ (dr))
3070 groupsize = count;
3071 STMT_VINFO_STRIDED_P (stmt_info) = true;
3074 /* If there is a gap after the last load in the group it is the
3075 difference between the groupsize and the last accessed
3076 element.
3077 When there is no gap, this difference should be 0. */
3078 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3080 DR_GROUP_SIZE (stmt_info) = groupsize;
3081 if (dump_enabled_p ())
3083 dump_printf_loc (MSG_NOTE, vect_location,
3084 "Detected interleaving ");
3085 if (DR_IS_READ (dr))
3086 dump_printf (MSG_NOTE, "load ");
3087 else if (STMT_VINFO_STRIDED_P (stmt_info))
3088 dump_printf (MSG_NOTE, "strided store ");
3089 else
3090 dump_printf (MSG_NOTE, "store ");
3091 dump_printf (MSG_NOTE, "of size %u\n",
3092 (unsigned)groupsize);
3093 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3094 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3095 while (next)
3097 if (DR_GROUP_GAP (next) != 1)
3098 dump_printf_loc (MSG_NOTE, vect_location,
3099 "\t<gap of %d elements>\n",
3100 DR_GROUP_GAP (next) - 1);
3101 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3102 next = DR_GROUP_NEXT_ELEMENT (next);
3104 if (DR_GROUP_GAP (stmt_info) != 0)
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 "\t<gap of %d elements>\n",
3107 DR_GROUP_GAP (stmt_info));
3110 /* SLP: create an SLP data structure for every interleaving group of
3111 stores for further analysis in vect_analyze_slp. */
3112 if (DR_IS_WRITE (dr) && !slp_impossible)
3114 if (loop_vinfo)
3115 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3116 if (bb_vinfo)
3117 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3121 return true;
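/* For example (illustrative only), in

     for (i = 0; i < n; i++)
       {
         sum += a[3*i];
         sum += a[3*i + 2];
       }

   both loads have step 3 * sizeof (*a), so GROUPSIZE is 3 while only
   two elements are accessed: the second load gets DR_GROUP_GAP 2,
   i.e. a gap of one element (a[3*i + 1] is never read), and the first
   statement gets DR_GROUP_SIZE 3 with DR_GROUP_GAP 0 since the last
   element of the group is accessed.  The same pattern written as
   stores would be rejected above as an interleaved store with gaps.  */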
3124 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3125 accesses of legal size, step, etc. Detect gaps, single element
3126 interleaving, and other special cases. Set grouped access info.
3127 Collect groups of strided stores for further use in SLP analysis. */
3129 static bool
3130 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3132 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3134 /* Dissolve the group if present. */
3135 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3136 while (stmt_info)
3138 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3139 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3140 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3141 stmt_info = next;
3143 return false;
3145 return true;
3148 /* Analyze the access pattern of the data-reference DR_INFO.
3149 In case of non-consecutive accesses call vect_analyze_group_access() to
3150 analyze groups of accesses. */
3152 static bool
3153 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3155 data_reference *dr = dr_info->dr;
3156 tree step = DR_STEP (dr);
3157 tree scalar_type = TREE_TYPE (DR_REF (dr));
3158 stmt_vec_info stmt_info = dr_info->stmt;
3159 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3160 class loop *loop = NULL;
3162 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3163 return true;
3165 if (loop_vinfo)
3166 loop = LOOP_VINFO_LOOP (loop_vinfo);
3168 if (loop_vinfo && !step)
3170 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3172 "bad data-ref access in loop\n");
3173 return false;
3176 /* Allow loads with zero step in inner-loop vectorization. */
3177 if (loop_vinfo && integer_zerop (step))
3179 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3180 if (!nested_in_vect_loop_p (loop, stmt_info))
3181 return DR_IS_READ (dr);
3182 /* Allow references with zero step for outer loops marked
3183 with pragma omp simd only - it guarantees absence of
3184 loop-carried dependencies between inner loop iterations. */
3185 if (loop->safelen < 2)
3187 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE, vect_location,
3189 "zero step in inner loop of nest\n");
3190 return false;
3194 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3196 /* Interleaved accesses are not yet supported within outer-loop
3197 vectorization for references in the inner-loop. */
3198 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3200 /* For the rest of the analysis we use the outer-loop step. */
3201 step = STMT_VINFO_DR_STEP (stmt_info);
3202 if (integer_zerop (step))
3204 if (dump_enabled_p ())
3205 dump_printf_loc (MSG_NOTE, vect_location,
3206 "zero step in outer loop.\n");
3207 return DR_IS_READ (dr);
3211 /* Consecutive? */
3212 if (TREE_CODE (step) == INTEGER_CST)
3214 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3215 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3216 || (dr_step < 0
3217 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3219 /* Mark that it is not interleaving. */
3220 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3221 return true;
3225 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3227 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_NOTE, vect_location,
3229 "grouped access in outer loop.\n");
3230 return false;
3234 /* Assume this is a DR handled by the non-constant strided load case. */
3235 if (TREE_CODE (step) != INTEGER_CST)
3236 return (STMT_VINFO_STRIDED_P (stmt_info)
3237 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3238 || vect_analyze_group_access (vinfo, dr_info)));
3240 /* Not a consecutive access - check if it's part of an interleaving group. */
3241 return vect_analyze_group_access (vinfo, dr_info);
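/* A few illustrative cases for the classification above, assuming a
   loop over i and plain int arrays:

     a[i] = b[i];            step == element size: consecutive access.
     s += c[0];              zero step: invariant load, allowed for reads.
     d[i * stride] = 0;      non-constant step: only handled when the
                             access was marked STMT_VINFO_STRIDED_P.
     e[2*i] = e[2*i+1] = 1;  constant step of twice the element size:
                             analyzed as an interleaved group of size 2.  */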
3244 /* Compare two data-references DRA and DRB to group them into chunks
3245 suitable for building interleaving groups. */
3247 static int
3248 dr_group_sort_cmp (const void *dra_, const void *drb_)
3250 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3251 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3252 data_reference_p dra = dra_info->dr;
3253 data_reference_p drb = drb_info->dr;
3254 int cmp;
3256 /* Stabilize sort. */
3257 if (dra == drb)
3258 return 0;
3260 /* DRs with different group IDs never belong to the same group. */
3261 if (dra_info->group != drb_info->group)
3262 return dra_info->group < drb_info->group ? -1 : 1;
3264 /* Ordering of DRs according to base. */
3265 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3266 DR_BASE_ADDRESS (drb));
3267 if (cmp != 0)
3268 return cmp;
3270 /* And according to DR_OFFSET. */
3271 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3272 if (cmp != 0)
3273 return cmp;
3275 /* Put reads before writes. */
3276 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3277 return DR_IS_READ (dra) ? -1 : 1;
3279 /* Then sort after access size. */
3280 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3281 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3282 if (cmp != 0)
3283 return cmp;
3285 /* And after step. */
3286 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3287 if (cmp != 0)
3288 return cmp;
3290 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3291 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3292 if (cmp == 0)
3293 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3294 return cmp;
3297 /* If OP is the result of a conversion, return the unconverted value,
3298 otherwise return null. */
3300 static tree
3301 strip_conversion (tree op)
3303 if (TREE_CODE (op) != SSA_NAME)
3304 return NULL_TREE;
3305 gimple *stmt = SSA_NAME_DEF_STMT (op);
3306 if (!is_gimple_assign (stmt)
3307 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3308 return NULL_TREE;
3309 return gimple_assign_rhs1 (stmt);
3312 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3313 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3314 be grouped in SLP mode. */
3316 static bool
3317 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3318 bool allow_slp_p)
3320 if (gimple_assign_single_p (stmt1_info->stmt))
3321 return gimple_assign_single_p (stmt2_info->stmt);
3323 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3324 if (call1 && gimple_call_internal_p (call1))
3326 /* Check for two masked loads or two masked stores. */
3327 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3328 if (!call2 || !gimple_call_internal_p (call2))
3329 return false;
3330 internal_fn ifn = gimple_call_internal_fn (call1);
3331 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3332 return false;
3333 if (ifn != gimple_call_internal_fn (call2))
3334 return false;
3336 /* Check that the masks are the same. Cope with casts of masks,
3337 like those created by build_mask_conversion. */
3338 tree mask1 = gimple_call_arg (call1, 2);
3339 tree mask2 = gimple_call_arg (call2, 2);
3340 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3342 mask1 = strip_conversion (mask1);
3343 if (!mask1)
3344 return false;
3345 mask2 = strip_conversion (mask2);
3346 if (!mask2)
3347 return false;
3348 if (!operand_equal_p (mask1, mask2, 0))
3349 return false;
3351 return true;
3354 return false;
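/* For instance (a sketch in GIMPLE-like notation, details elided):

     _1 = .MASK_LOAD (p_4, 32B, mask_7);
     _2 = .MASK_LOAD (p_5, 32B, mask_8);

   can be grouped if mask_7 and mask_8 are operand_equal_p, if they
   only differ by conversions of the same mask, or, when ALLOW_SLP_P,
   even when the masks differ; a masked access never groups with a
   plain load or store.  */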
3357 /* Function vect_analyze_data_ref_accesses.
3359 Analyze the access pattern of all the data references in the loop.
3361 FORNOW: the only access pattern that is considered vectorizable is a
3362 simple step 1 (consecutive) access.
3364 FORNOW: handle only arrays and pointer accesses. */
3366 opt_result
3367 vect_analyze_data_ref_accesses (vec_info *vinfo,
3368 vec<int> *dataref_groups)
3370 unsigned int i;
3371 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3373 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3375 if (datarefs.is_empty ())
3376 return opt_result::success ();
3378 /* Sort the array of datarefs to make building the interleaving chains
3379 linear. Don't modify the original vector's order, it is needed for
3380 determining what dependencies are reversed. */
3381 vec<dr_vec_info *> datarefs_copy;
3382 datarefs_copy.create (datarefs.length ());
3383 for (unsigned i = 0; i < datarefs.length (); i++)
3385 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3386 /* If the caller computed DR grouping use that, otherwise group by
3387 basic blocks. */
3388 if (dataref_groups)
3389 dr_info->group = (*dataref_groups)[i];
3390 else
3391 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3392 datarefs_copy.quick_push (dr_info);
3394 datarefs_copy.qsort (dr_group_sort_cmp);
3395 hash_set<stmt_vec_info> to_fixup;
3397 /* Build the interleaving chains. */
3398 for (i = 0; i < datarefs_copy.length () - 1;)
3400 dr_vec_info *dr_info_a = datarefs_copy[i];
3401 data_reference_p dra = dr_info_a->dr;
3402 int dra_group_id = dr_info_a->group;
3403 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3404 stmt_vec_info lastinfo = NULL;
3405 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3406 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3408 ++i;
3409 continue;
3411 for (i = i + 1; i < datarefs_copy.length (); ++i)
3413 dr_vec_info *dr_info_b = datarefs_copy[i];
3414 data_reference_p drb = dr_info_b->dr;
3415 int drb_group_id = dr_info_b->group;
3416 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3417 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3418 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3419 break;
3421 /* ??? Imperfect sorting (non-compatible types, non-modulo
3422 accesses, same accesses) can lead to a group being artificially
3423 split here as we don't just skip over those. If it really
3424 matters we can push those to a worklist and re-iterate
3425 over them. Then we can just skip ahead to the next DR here. */
3427 /* DRs in a different DR group should not be put into the same
3428 interleaving group. */
3429 if (dra_group_id != drb_group_id)
3430 break;
3432 /* Check that the data-refs have the same first location (except init)
3433 and that they are both either stores or loads (not a load and a
3434 store, and not masked loads or stores). */
3435 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3436 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3437 DR_BASE_ADDRESS (drb)) != 0
3438 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3439 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3440 break;
3442 /* Check that the data-refs have the same constant size. */
3443 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3444 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3445 if (!tree_fits_uhwi_p (sza)
3446 || !tree_fits_uhwi_p (szb)
3447 || !tree_int_cst_equal (sza, szb))
3448 break;
3450 /* Check that the data-refs have the same step. */
3451 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3452 break;
3454 /* Check the types are compatible.
3455 ??? We don't distinguish this during sorting. */
3456 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3457 TREE_TYPE (DR_REF (drb))))
3458 break;
3460 /* Check that the DR_INITs are compile-time constants. */
3461 if (!tree_fits_shwi_p (DR_INIT (dra))
3462 || !tree_fits_shwi_p (DR_INIT (drb)))
3463 break;
3465 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3466 just hold extra information. */
3467 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3468 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3469 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3470 break;
3472 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3473 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3474 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3475 HOST_WIDE_INT init_prev
3476 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3477 gcc_assert (init_a <= init_b
3478 && init_a <= init_prev
3479 && init_prev <= init_b);
3481 /* Do not place the same access in the interleaving chain twice. */
3482 if (init_b == init_prev)
3484 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3485 < gimple_uid (DR_STMT (drb)));
3486 /* Simply link in duplicates and fix up the chain below. */
3488 else
3490 /* If init_b == init_a + the size of the type * k, we have an
3491 interleaving, and DRA is accessed before DRB. */
3492 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3493 if (type_size_a == 0
3494 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3495 % type_size_a != 0))
3496 break;
3498 /* If we have a store, the accesses are adjacent. This splits
3499 groups into chunks we support (we don't support vectorization
3500 of stores with gaps). */
3501 if (!DR_IS_READ (dra)
3502 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3503 != type_size_a))
3504 break;
3506 /* If the step (when it is nonzero and constant) is smaller than the
3507 difference between the data-refs' inits, this splits groups into
3508 suitable sizes. */
3509 if (tree_fits_shwi_p (DR_STEP (dra)))
3511 unsigned HOST_WIDE_INT step
3512 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3513 if (step != 0
3514 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3515 break;
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 DR_IS_READ (dra)
3522 ? "Detected interleaving load %T and %T\n"
3523 : "Detected interleaving store %T and %T\n",
3524 DR_REF (dra), DR_REF (drb));
3526 /* Link the found element into the group list. */
3527 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3529 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3530 lastinfo = stmtinfo_a;
3532 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3533 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3534 lastinfo = stmtinfo_b;
3536 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3537 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3539 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3540 dump_printf_loc (MSG_NOTE, vect_location,
3541 "Load suitable for SLP vectorization only.\n");
3543 if (init_b == init_prev
3544 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3545 && dump_enabled_p ())
3546 dump_printf_loc (MSG_NOTE, vect_location,
3547 "Queuing group with duplicate access for fixup\n");
3551 /* Fix up groups with duplicate entries by splitting them. */
3552 while (1)
3554 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3555 if (!(it != to_fixup.end ()))
3556 break;
3557 stmt_vec_info grp = *it;
3558 to_fixup.remove (grp);
3560 /* Find the earliest duplicate group member. */
3561 unsigned first_duplicate = -1u;
3562 stmt_vec_info next, g = grp;
3563 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3565 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3566 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3567 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3568 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3569 g = next;
3571 if (first_duplicate == -1U)
3572 continue;
3574 /* Then move all stmts after the first duplicate to a new group.
3575 Note this is a heuristic, but one with the property that the
3576 group GRP itself is fixed up completely. */
3577 g = grp;
3578 stmt_vec_info newgroup = NULL, ng = grp;
3579 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3581 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3583 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3584 if (!newgroup)
3585 newgroup = next;
3586 else
3587 DR_GROUP_NEXT_ELEMENT (ng) = next;
3588 ng = next;
3589 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3591 else
3592 g = DR_GROUP_NEXT_ELEMENT (g);
3594 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3596 /* Fix up the new group, which may still contain duplicates. */
3597 to_fixup.add (newgroup);
3600 dr_vec_info *dr_info;
3601 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3603 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3604 && !vect_analyze_data_ref_access (vinfo, dr_info))
3606 if (dump_enabled_p ())
3607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3608 "not vectorized: complicated access pattern.\n");
3610 if (is_a <bb_vec_info> (vinfo))
3612 /* Mark the statement as not vectorizable. */
3613 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3614 continue;
3616 else
3618 datarefs_copy.release ();
3619 return opt_result::failure_at (dr_info->stmt->stmt,
3620 "not vectorized:"
3621 " complicated access pattern.\n");
3626 datarefs_copy.release ();
3627 return opt_result::success ();
3630 /* Function vect_vfa_segment_size.
3632 Input:
3633 DR_INFO: The data reference.
3634 LENGTH_FACTOR: segment length to consider.
3636 Return a value suitable for the dr_with_seg_len::seg_len field.
3637 This is the "distance travelled" by the pointer from the first
3638 iteration in the segment to the last. Note that it does not include
3639 the size of the access; in effect it only describes the first byte. */
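/* Editorial worked example (values assumed for illustration): for a
   unit-stride access to 4-byte elements DR_STEP is 4, so with
   LENGTH_FACTOR equal to a vectorization factor of 8 the result is
   (8 - 1) * 4 = 28 bytes -- the distance from the address used by the
   first iteration in the segment to the address used by the last.
   Adding the access size (see vect_vfa_access_size) then gives the full
   32 bytes the segment may touch.  */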
3641 static tree
3642 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3644 length_factor = size_binop (MINUS_EXPR,
3645 fold_convert (sizetype, length_factor),
3646 size_one_node);
3647 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3648 length_factor);
3651 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3652 gives the worst-case number of bytes covered by the segment. */
3654 static unsigned HOST_WIDE_INT
3655 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3657 stmt_vec_info stmt_vinfo = dr_info->stmt;
3658 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3659 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3660 unsigned HOST_WIDE_INT access_size = ref_size;
3661 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3663 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3664 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3666 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3667 int misalignment;
3668 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3669 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3670 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3671 == dr_explicit_realign_optimized))
3673 /* We might access a full vector's worth. */
3674 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3676 return access_size;
3679 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3680 describes. */
3682 static unsigned int
3683 vect_vfa_align (dr_vec_info *dr_info)
3685 return dr_alignment (dr_info->dr);
3688 /* Function vect_no_alias_p.
3690 Given data references A and B with equal base and offset, see whether
3691 the alias relation can be decided at compilation time. Return 1 if
3692 it can and the references alias, 0 if it can and the references do
3693 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3694 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3695 of dr_with_seg_len::{seg_len,access_size} for A and B. */
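/* Editorial worked example (assuming positive steps, segment lengths of
   28 bytes and access sizes of 4): if DR_INIT (A) is 0 then A covers
   bytes [0, 32); if DR_INIT (B) is 32 then B covers [32, 64), the
   ranges cannot overlap and the function returns 0.  If DR_INIT (B)
   were 16 instead, the ranges would be known to overlap and the
   function would return 1.  */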
3697 static int
3698 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3699 tree segment_length_a, tree segment_length_b,
3700 unsigned HOST_WIDE_INT access_size_a,
3701 unsigned HOST_WIDE_INT access_size_b)
3703 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3704 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3705 poly_uint64 const_length_a;
3706 poly_uint64 const_length_b;
3708 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3709 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3710 [a, a+12) */
3711 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3713 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3714 offset_a -= const_length_a;
3716 else
3717 const_length_a = tree_to_poly_uint64 (segment_length_a);
3718 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3720 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3721 offset_b -= const_length_b;
3723 else
3724 const_length_b = tree_to_poly_uint64 (segment_length_b);
3726 const_length_a += access_size_a;
3727 const_length_b += access_size_b;
3729 if (ranges_known_overlap_p (offset_a, const_length_a,
3730 offset_b, const_length_b))
3731 return 1;
3733 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3734 offset_b, const_length_b))
3735 return 0;
3737 return -1;
3740 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3741 in DDR is >= VF. */
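/* Editorial example: for accesses a[i] and a[i + 16] in the same loop,
   the recorded dependence distance at this depth is 16, so for any VF
   of at most 16 the vectorized code cannot violate the dependence and
   the corresponding runtime alias check can be dropped.  */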
3743 static bool
3744 dependence_distance_ge_vf (data_dependence_relation *ddr,
3745 unsigned int loop_depth, poly_uint64 vf)
3747 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3748 || DDR_NUM_DIST_VECTS (ddr) == 0)
3749 return false;
3751 /* If the dependence is exact, we should have limited the VF instead. */
3752 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3754 unsigned int i;
3755 lambda_vector dist_v;
3756 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3758 HOST_WIDE_INT dist = dist_v[loop_depth];
3759 if (dist != 0
3760 && !(dist > 0 && DDR_REVERSED_P (ddr))
3761 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3762 return false;
3765 if (dump_enabled_p ())
3766 dump_printf_loc (MSG_NOTE, vect_location,
3767 "dependence distance between %T and %T is >= VF\n",
3768 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3770 return true;
3773 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3775 static void
3776 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3778 dump_printf (dump_kind, "%s (%T) >= ",
3779 lower_bound.unsigned_p ? "unsigned" : "abs",
3780 lower_bound.expr);
3781 dump_dec (dump_kind, lower_bound.min_value);
3784 /* Record that the vectorized loop requires the vec_lower_bound described
3785 by EXPR, UNSIGNED_P and MIN_VALUE. */
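/* Editorial example: if one alias pair requires abs (step) >= 8 and a
   later pair requires abs (step) >= 16 for the same STEP expression,
   the loop below merges the two entries into a single runtime test
   abs (step) >= 16 rather than recording two separate checks.  */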
3787 static void
3788 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3789 poly_uint64 min_value)
3791 vec<vec_lower_bound> &lower_bounds
3792 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3793 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3794 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3796 unsigned_p &= lower_bounds[i].unsigned_p;
3797 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3798 if (lower_bounds[i].unsigned_p != unsigned_p
3799 || maybe_lt (lower_bounds[i].min_value, min_value))
3801 lower_bounds[i].unsigned_p = unsigned_p;
3802 lower_bounds[i].min_value = min_value;
3803 if (dump_enabled_p ())
3805 dump_printf_loc (MSG_NOTE, vect_location,
3806 "updating run-time check to ");
3807 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3808 dump_printf (MSG_NOTE, "\n");
3811 return;
3814 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3815 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3818 dump_lower_bound (MSG_NOTE, lower_bound);
3819 dump_printf (MSG_NOTE, "\n");
3821 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3824 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3825 will span fewer than GAP bytes. */
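/* Editorial example (numbers assumed): with an estimated vectorization
   factor of 4, a group of 2 contiguous 4-byte scalar accesses gives
   count * size = (4 * 2) * 4 = 32, so any GAP of up to 32 bytes is one
   the vectorized step is unlikely to fall short of and the function
   returns true.  */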
3827 static bool
3828 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3829 poly_int64 gap)
3831 stmt_vec_info stmt_info = dr_info->stmt;
3832 HOST_WIDE_INT count
3833 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3834 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3835 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3836 return (estimated_poly_value (gap)
3837 <= count * vect_get_scalar_dr_size (dr_info));
3840 /* Return true if we know that there is no alias between DR_INFO_A and
3841 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3842 When returning true, set *LOWER_BOUND_OUT to this N. */
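/* Editorial example: for two 4-byte accesses off the same base and step
   with DR_INITs 0 and 12, the pair together spans bytes [0, 16), so
   *LOWER_BOUND_OUT is set to 16 and there is no alias between the two
   accesses whenever abs (DR_STEP) >= 16.  */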
3844 static bool
3845 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3846 poly_uint64 *lower_bound_out)
3848 /* Check that there is a constant gap of known sign between DR_A
3849 and DR_B. */
3850 data_reference *dr_a = dr_info_a->dr;
3851 data_reference *dr_b = dr_info_b->dr;
3852 poly_int64 init_a, init_b;
3853 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3854 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3855 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3856 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3857 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3858 || !ordered_p (init_a, init_b))
3859 return false;
3861 /* Sort DR_A and DR_B by the address they access. */
3862 if (maybe_lt (init_b, init_a))
3864 std::swap (init_a, init_b);
3865 std::swap (dr_info_a, dr_info_b);
3866 std::swap (dr_a, dr_b);
3869 /* If the two accesses could be dependent within a scalar iteration,
3870 make sure that we'd retain their order. */
3871 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3872 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3873 return false;
3875 /* There is no alias if abs (DR_STEP) is greater than or equal to
3876 the bytes spanned by the combination of the two accesses. */
3877 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3878 return true;
3881 /* Function vect_prune_runtime_alias_test_list.
3883 Prune a list of ddrs to be tested at run-time by versioning for alias.
3884 Merge several alias checks into one if possible.
3885 Return FALSE if the resulting list of ddrs is longer than allowed by
3886 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
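/* Editorial example: if a loop stores to a[i] and reads both b[i] and
   b[i + 1] through possibly aliasing pointers, the two DDRs against the
   store describe overlapping segments of the same two base objects, so
   prune_runtime_alias_test_list can merge them into one a-versus-b
   range check instead of emitting two versioning tests.  */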
3888 opt_result
3889 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3891 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3892 hash_set <tree_pair_hash> compared_objects;
3894 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3895 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3896 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3897 const vec<vec_object_pair> &check_unequal_addrs
3898 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3899 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3900 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3902 ddr_p ddr;
3903 unsigned int i;
3904 tree length_factor;
3906 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3908 /* Step values are irrelevant for aliasing if the number of vector
3909 iterations is equal to the number of scalar iterations (which can
3910 happen for fully-SLP loops). */
3911 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3913 if (!vf_one_p)
3915 /* Convert the checks for nonzero steps into bound tests. */
3916 tree value;
3917 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3918 vect_check_lower_bound (loop_vinfo, value, true, 1);
3921 if (may_alias_ddrs.is_empty ())
3922 return opt_result::success ();
3924 comp_alias_ddrs.create (may_alias_ddrs.length ());
3926 unsigned int loop_depth
3927 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3928 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3930 /* First, we collect all data ref pairs for aliasing checks. */
3931 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3933 poly_uint64 lower_bound;
3934 tree segment_length_a, segment_length_b;
3935 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3936 unsigned int align_a, align_b;
3938 /* Ignore the alias if the VF we chose ended up being no greater
3939 than the dependence distance. */
3940 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3941 continue;
3943 if (DDR_OBJECT_A (ddr))
3945 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3946 if (!compared_objects.add (new_pair))
3948 if (dump_enabled_p ())
3949 dump_printf_loc (MSG_NOTE, vect_location,
3950 "checking that %T and %T"
3951 " have different addresses\n",
3952 new_pair.first, new_pair.second);
3953 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3955 continue;
3958 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3959 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3961 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3962 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3964 bool preserves_scalar_order_p
3965 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3966 bool ignore_step_p
3967 = (vf_one_p
3968 && (preserves_scalar_order_p
3969 || operand_equal_p (DR_STEP (dr_info_a->dr),
3970 DR_STEP (dr_info_b->dr))));
3972 /* Skip the pair if inter-iteration dependencies are irrelevant
3973 and intra-iteration dependencies are guaranteed to be honored. */
3974 if (ignore_step_p
3975 && (preserves_scalar_order_p
3976 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3977 &lower_bound)))
3979 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_NOTE, vect_location,
3981 "no need for alias check between "
3982 "%T and %T when VF is 1\n",
3983 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3984 continue;
3987 /* See whether we can handle the alias using a bounds check on
3988 the step, and whether that's likely to be the best approach.
3989 (It might not be, for example, if the minimum step is much larger
3990 than the number of bytes handled by one vector iteration.) */
3991 if (!ignore_step_p
3992 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3993 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3994 &lower_bound)
3995 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3996 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3998 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3999 if (dump_enabled_p ())
4001 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4002 "%T and %T when the step %T is outside ",
4003 DR_REF (dr_info_a->dr),
4004 DR_REF (dr_info_b->dr),
4005 DR_STEP (dr_info_a->dr));
4006 if (unsigned_p)
4007 dump_printf (MSG_NOTE, "[0");
4008 else
4010 dump_printf (MSG_NOTE, "(");
4011 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4013 dump_printf (MSG_NOTE, ", ");
4014 dump_dec (MSG_NOTE, lower_bound);
4015 dump_printf (MSG_NOTE, ")\n");
4017 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4018 unsigned_p, lower_bound);
4019 continue;
4022 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4023 if (dr_group_first_a)
4025 stmt_info_a = dr_group_first_a;
4026 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4029 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4030 if (dr_group_first_b)
4032 stmt_info_b = dr_group_first_b;
4033 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4036 if (ignore_step_p)
4038 segment_length_a = size_zero_node;
4039 segment_length_b = size_zero_node;
4041 else
4043 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4044 DR_STEP (dr_info_b->dr), 0))
4045 length_factor = scalar_loop_iters;
4046 else
4047 length_factor = size_int (vect_factor);
4048 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4049 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4051 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4052 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4053 align_a = vect_vfa_align (dr_info_a);
4054 align_b = vect_vfa_align (dr_info_b);
4056 /* See whether the alias is known at compilation time. */
4057 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4058 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4059 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4060 DR_OFFSET (dr_info_b->dr), 0)
4061 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4062 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4063 && poly_int_tree_p (segment_length_a)
4064 && poly_int_tree_p (segment_length_b))
4066 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4067 segment_length_a,
4068 segment_length_b,
4069 access_size_a,
4070 access_size_b);
4071 if (res >= 0 && dump_enabled_p ())
4073 dump_printf_loc (MSG_NOTE, vect_location,
4074 "can tell at compile time that %T and %T",
4075 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4076 if (res == 0)
4077 dump_printf (MSG_NOTE, " do not alias\n");
4078 else
4079 dump_printf (MSG_NOTE, " alias\n");
4082 if (res == 0)
4083 continue;
4085 if (res == 1)
4086 return opt_result::failure_at (stmt_info_b->stmt,
4087 "not vectorized:"
4088 " compilation time alias: %G%G",
4089 stmt_info_a->stmt,
4090 stmt_info_b->stmt);
4093 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4094 access_size_a, align_a);
4095 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4096 access_size_b, align_b);
4097 /* Canonicalize the order to be the one that's needed for accurate
4098 RAW, WAR and WAW flags, in cases where the data references are
4099 well-ordered. The order doesn't really matter otherwise,
4100 but we might as well be consistent. */
4101 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4102 std::swap (dr_a, dr_b);
4104 dr_with_seg_len_pair_t dr_with_seg_len_pair
4105 (dr_a, dr_b, (preserves_scalar_order_p
4106 ? dr_with_seg_len_pair_t::WELL_ORDERED
4107 : dr_with_seg_len_pair_t::REORDERED));
4109 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4112 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4114 unsigned int count = (comp_alias_ddrs.length ()
4115 + check_unequal_addrs.length ());
4117 if (count
4118 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4119 == VECT_COST_MODEL_VERY_CHEAP))
4120 return opt_result::failure_at
4121 (vect_location, "would need a runtime alias check\n");
4123 if (dump_enabled_p ())
4124 dump_printf_loc (MSG_NOTE, vect_location,
4125 "improved number of alias checks from %d to %d\n",
4126 may_alias_ddrs.length (), count);
4127 unsigned limit = param_vect_max_version_for_alias_checks;
4128 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4129 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4130 if (count > limit)
4131 return opt_result::failure_at
4132 (vect_location,
4133 "number of versioning for alias run-time tests exceeds %d "
4134 "(--param vect-max-version-for-alias-checks)\n", limit);
4136 return opt_result::success ();
4139 /* Check whether we can use an internal function for a gather load
4140 or scatter store. READ_P is true for loads and false for stores.
4141 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4142 the type of the memory elements being loaded or stored. OFFSET_TYPE
4143 is the type of the offset that is being applied to the invariant
4144 base address. SCALE is the amount by which the offset should
4145 be multiplied *after* it has been converted to address width.
4147 Return true if the function is supported, storing the function id in
4148 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
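/* Editorial sketch of the retry loop below: if the original offset is a
   16-bit integer but the target only provides gathers or scatters with
   32-bit or 64-bit offset vectors, the first attempt fails, OFFSET_TYPE
   is widened to an integer type of twice the precision and the check is
   repeated; the search stops once the offset type is at least
   pointer-sized and at least as wide as the vector elements.  */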
4150 bool
4151 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4152 tree vectype, tree memory_type, tree offset_type,
4153 int scale, internal_fn *ifn_out,
4154 tree *offset_vectype_out)
4156 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4157 unsigned int element_bits = vector_element_bits (vectype);
4158 if (element_bits != memory_bits)
4159 /* For now the vector elements must be the same width as the
4160 memory elements. */
4161 return false;
4163 /* Work out which function we need. */
4164 internal_fn ifn, alt_ifn, alt_ifn2;
4165 if (read_p)
4167 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4168 alt_ifn = IFN_MASK_GATHER_LOAD;
4169 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4170 use MASK_LEN_GATHER_LOAD regardless of whether len and
4171 mask are valid or not. */
4172 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4174 else
4176 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4177 alt_ifn = IFN_MASK_SCATTER_STORE;
4178 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4179 use MASK_LEN_SCATTER_STORE regardless of whether len and
4180 mask are valid or not. */
4181 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4184 for (;;)
4186 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4187 if (!offset_vectype)
4188 return false;
4190 /* Test whether the target supports this combination. */
4191 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4192 offset_vectype, scale))
4194 *ifn_out = ifn;
4195 *offset_vectype_out = offset_vectype;
4196 return true;
4198 else if (!masked_p
4199 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4200 memory_type,
4201 offset_vectype,
4202 scale))
4204 *ifn_out = alt_ifn;
4205 *offset_vectype_out = offset_vectype;
4206 return true;
4208 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4209 memory_type,
4210 offset_vectype, scale))
4212 *ifn_out = alt_ifn2;
4213 *offset_vectype_out = offset_vectype;
4214 return true;
4217 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4218 && TYPE_PRECISION (offset_type) >= element_bits)
4219 return false;
4221 offset_type = build_nonstandard_integer_type
4222 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4226 /* STMT_INFO is a call to an internal gather load or scatter store function.
4227 Describe the operation in INFO. */
4229 static void
4230 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4231 gather_scatter_info *info)
4233 gcall *call = as_a <gcall *> (stmt_info->stmt);
4234 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4235 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4237 info->ifn = gimple_call_internal_fn (call);
4238 info->decl = NULL_TREE;
4239 info->base = gimple_call_arg (call, 0);
4240 info->offset = gimple_call_arg (call, 1);
4241 info->offset_dt = vect_unknown_def_type;
4242 info->offset_vectype = NULL_TREE;
4243 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4244 info->element_type = TREE_TYPE (vectype);
4245 info->memory_type = TREE_TYPE (DR_REF (dr));
4248 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4249 gather load or scatter store. Describe the operation in *INFO if so. */
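/* Editorial example of the decomposition performed below: for a gather
   like

     for (int i = 0; i < n; i++)
       sum += data[idx[i]];

   the loop-invariant base becomes the address of data, SCALE becomes
   sizeof (*data), and the offset is the SSA name holding the value of
   idx[i] (possibly after a widening conversion); everything loop
   invariant is peeled off the offset into BASE so that only the
   per-lane index remains to be vectorized.  */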
4251 bool
4252 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4253 gather_scatter_info *info)
4255 HOST_WIDE_INT scale = 1;
4256 poly_int64 pbitpos, pbitsize;
4257 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4258 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4259 tree offtype = NULL_TREE;
4260 tree decl = NULL_TREE, base, off;
4261 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4262 tree memory_type = TREE_TYPE (DR_REF (dr));
4263 machine_mode pmode;
4264 int punsignedp, reversep, pvolatilep = 0;
4265 internal_fn ifn;
4266 tree offset_vectype;
4267 bool masked_p = false;
4269 /* See whether this is already a call to a gather/scatter internal function.
4270 If not, see whether it's a masked load or store. */
4271 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4272 if (call && gimple_call_internal_p (call))
4274 ifn = gimple_call_internal_fn (call);
4275 if (internal_gather_scatter_fn_p (ifn))
4277 vect_describe_gather_scatter_call (stmt_info, info);
4278 return true;
4280 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4283 /* True if we should aim to use internal functions rather than
4284 built-in functions. */
4285 bool use_ifn_p = (DR_IS_READ (dr)
4286 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4287 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4289 base = DR_REF (dr);
4290 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
4291 see if we can use the def stmt of the address. */
4292 if (masked_p
4293 && TREE_CODE (base) == MEM_REF
4294 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4295 && integer_zerop (TREE_OPERAND (base, 1))
4296 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4298 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4299 if (is_gimple_assign (def_stmt)
4300 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4301 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4304 /* The gather and scatter builtins need address of the form
4305 loop_invariant + vector * {1, 2, 4, 8}
4307 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4308 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4309 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4310 multiplications and additions in it. To get a vector, we need
4311 a single SSA_NAME that will be defined in the loop and will
4312 contain everything that is not loop invariant and that can be
4313 vectorized. The following code attempts to find such a preexisting
4314 SSA_NAME OFF and put the loop invariants into a tree BASE
4315 that can be gimplified before the loop. */
4316 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4317 &punsignedp, &reversep, &pvolatilep);
4318 if (reversep)
4319 return false;
4321 /* PR 107346. Packed structs can have fields at offsets that are not
4322 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4323 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4324 return false;
4326 /* We need to be able to form an address to the base which for example
4327 isn't possible for hard registers. */
4328 if (may_be_nonaddressable_p (base))
4329 return false;
4331 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4333 if (TREE_CODE (base) == MEM_REF)
4335 if (!integer_zerop (TREE_OPERAND (base, 1)))
4337 if (off == NULL_TREE)
4338 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4339 else
4340 off = size_binop (PLUS_EXPR, off,
4341 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4343 base = TREE_OPERAND (base, 0);
4345 else
4346 base = build_fold_addr_expr (base);
4348 if (off == NULL_TREE)
4349 off = size_zero_node;
4351 /* If base is not loop invariant then, when off is 0, we start with just
4352 the constant offset in the loop invariant BASE and continue with base
4353 as OFF; otherwise give up.
4354 We could handle that case by gimplifying the addition of base + off
4355 into some SSA_NAME and using that as off, but for now punt. */
4356 if (!expr_invariant_in_loop_p (loop, base))
4358 if (!integer_zerop (off))
4359 return false;
4360 off = base;
4361 base = size_int (pbytepos);
4363 /* Otherwise put base + constant offset into the loop invariant BASE
4364 and continue with OFF. */
4365 else
4367 base = fold_convert (sizetype, base);
4368 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4371 /* OFF at this point may be either a SSA_NAME or some tree expression
4372 from get_inner_reference. Try to peel off loop invariants from it
4373 into BASE as long as possible. */
4374 STRIP_NOPS (off);
4375 while (offtype == NULL_TREE)
4377 enum tree_code code;
4378 tree op0, op1, add = NULL_TREE;
4380 if (TREE_CODE (off) == SSA_NAME)
4382 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4384 if (expr_invariant_in_loop_p (loop, off))
4385 return false;
4387 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4388 break;
4390 op0 = gimple_assign_rhs1 (def_stmt);
4391 code = gimple_assign_rhs_code (def_stmt);
4392 op1 = gimple_assign_rhs2 (def_stmt);
4394 else
4396 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4397 return false;
4398 code = TREE_CODE (off);
4399 extract_ops_from_tree (off, &code, &op0, &op1);
4401 switch (code)
4403 case POINTER_PLUS_EXPR:
4404 case PLUS_EXPR:
4405 if (expr_invariant_in_loop_p (loop, op0))
4407 add = op0;
4408 off = op1;
4409 do_add:
4410 add = fold_convert (sizetype, add);
4411 if (scale != 1)
4412 add = size_binop (MULT_EXPR, add, size_int (scale));
4413 base = size_binop (PLUS_EXPR, base, add);
4414 continue;
4416 if (expr_invariant_in_loop_p (loop, op1))
4418 add = op1;
4419 off = op0;
4420 goto do_add;
4422 break;
4423 case MINUS_EXPR:
4424 if (expr_invariant_in_loop_p (loop, op1))
4426 add = fold_convert (sizetype, op1);
4427 add = size_binop (MINUS_EXPR, size_zero_node, add);
4428 off = op0;
4429 goto do_add;
4431 break;
4432 case MULT_EXPR:
4433 if (scale == 1 && tree_fits_shwi_p (op1))
4435 int new_scale = tree_to_shwi (op1);
4436 /* Only treat this as a scaling operation if the target
4437 supports it for at least some offset type. */
4438 if (use_ifn_p
4439 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4440 masked_p, vectype, memory_type,
4441 signed_char_type_node,
4442 new_scale, &ifn,
4443 &offset_vectype)
4444 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4445 masked_p, vectype, memory_type,
4446 unsigned_char_type_node,
4447 new_scale, &ifn,
4448 &offset_vectype))
4449 break;
4450 scale = new_scale;
4451 off = op0;
4452 continue;
4454 break;
4455 case SSA_NAME:
4456 off = op0;
4457 continue;
4458 CASE_CONVERT:
4459 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4460 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4461 break;
4463 /* Don't include the conversion if the target is happy with
4464 the current offset type. */
4465 if (use_ifn_p
4466 && TREE_CODE (off) == SSA_NAME
4467 && !POINTER_TYPE_P (TREE_TYPE (off))
4468 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4469 masked_p, vectype, memory_type,
4470 TREE_TYPE (off), scale, &ifn,
4471 &offset_vectype))
4472 break;
4474 if (TYPE_PRECISION (TREE_TYPE (op0))
4475 == TYPE_PRECISION (TREE_TYPE (off)))
4477 off = op0;
4478 continue;
4481 /* Include the conversion if it is widening and either we're using
4482 the IFN path, the target can handle the converted-from
4483 offset, or the current size is not already the same as the
4484 data vector element size. */
4485 if ((TYPE_PRECISION (TREE_TYPE (op0))
4486 < TYPE_PRECISION (TREE_TYPE (off)))
4487 && (use_ifn_p
4488 || (DR_IS_READ (dr)
4489 ? (targetm.vectorize.builtin_gather
4490 && targetm.vectorize.builtin_gather (vectype,
4491 TREE_TYPE (op0),
4492 scale))
4493 : (targetm.vectorize.builtin_scatter
4494 && targetm.vectorize.builtin_scatter (vectype,
4495 TREE_TYPE (op0),
4496 scale)))
4497 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4498 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4500 off = op0;
4501 offtype = TREE_TYPE (off);
4502 STRIP_NOPS (off);
4503 continue;
4505 break;
4506 default:
4507 break;
4509 break;
4512 /* If at the end OFF still isn't a SSA_NAME or isn't
4513 defined in the loop, punt. */
4514 if (TREE_CODE (off) != SSA_NAME
4515 || expr_invariant_in_loop_p (loop, off))
4516 return false;
4518 if (offtype == NULL_TREE)
4519 offtype = TREE_TYPE (off);
4521 if (use_ifn_p)
4523 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4524 vectype, memory_type, offtype, scale,
4525 &ifn, &offset_vectype))
4526 ifn = IFN_LAST;
4527 decl = NULL_TREE;
4529 else
4531 if (DR_IS_READ (dr))
4533 if (targetm.vectorize.builtin_gather)
4534 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4536 else
4538 if (targetm.vectorize.builtin_scatter)
4539 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4541 ifn = IFN_LAST;
4542 /* The offset vector type will be read from DECL when needed. */
4543 offset_vectype = NULL_TREE;
4546 info->ifn = ifn;
4547 info->decl = decl;
4548 info->base = base;
4549 info->offset = off;
4550 info->offset_dt = vect_unknown_def_type;
4551 info->offset_vectype = offset_vectype;
4552 info->scale = scale;
4553 info->element_type = TREE_TYPE (vectype);
4554 info->memory_type = memory_type;
4555 return true;
4558 /* Find the data references in STMT, analyze them with respect to LOOP and
4559 append them to DATAREFS. Return false if datarefs in this stmt cannot
4560 be handled. */
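/* Editorial example: an aggregate copy such as *p = *q contains two data
   references in one statement and is rejected below, whereas a masked
   load lowered to a .MASK_LOAD internal call contributes its single
   (artificial) data reference to DATAREFS.  */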
4562 opt_result
4563 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4564 vec<data_reference_p> *datarefs,
4565 vec<int> *dataref_groups, int group_id)
4567 /* We can ignore clobbers for dataref analysis - they are removed during
4568 loop vectorization and BB vectorization checks dependences with a
4569 stmt walk. */
4570 if (gimple_clobber_p (stmt))
4571 return opt_result::success ();
4573 if (gimple_has_volatile_ops (stmt))
4574 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4575 stmt);
4577 if (stmt_can_throw_internal (cfun, stmt))
4578 return opt_result::failure_at (stmt,
4579 "not vectorized:"
4580 " statement can throw an exception: %G",
4581 stmt);
4583 auto_vec<data_reference_p, 2> refs;
4584 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4585 if (!res)
4586 return res;
4588 if (refs.is_empty ())
4589 return opt_result::success ();
4591 if (refs.length () > 1)
4593 while (!refs.is_empty ())
4594 free_data_ref (refs.pop ());
4595 return opt_result::failure_at (stmt,
4596 "not vectorized: more than one "
4597 "data ref in stmt: %G", stmt);
4600 data_reference_p dr = refs.pop ();
4601 if (gcall *call = dyn_cast <gcall *> (stmt))
4602 if (!gimple_call_internal_p (call)
4603 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4604 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4606 free_data_ref (dr);
4607 return opt_result::failure_at (stmt,
4608 "not vectorized: dr in a call %G", stmt);
4611 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4612 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4614 free_data_ref (dr);
4615 return opt_result::failure_at (stmt,
4616 "not vectorized:"
4617 " statement is an unsupported"
4618 " bitfield access %G", stmt);
4621 if (DR_BASE_ADDRESS (dr)
4622 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4624 free_data_ref (dr);
4625 return opt_result::failure_at (stmt,
4626 "not vectorized:"
4627 " base addr of dr is a constant\n");
4630 /* Check whether this may be a SIMD lane access and adjust the
4631 DR to make it easier for us to handle it. */
4632 if (loop
4633 && loop->simduid
4634 && (!DR_BASE_ADDRESS (dr)
4635 || !DR_OFFSET (dr)
4636 || !DR_INIT (dr)
4637 || !DR_STEP (dr)))
4639 struct data_reference *newdr
4640 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4641 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4642 if (DR_BASE_ADDRESS (newdr)
4643 && DR_OFFSET (newdr)
4644 && DR_INIT (newdr)
4645 && DR_STEP (newdr)
4646 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4647 && integer_zerop (DR_STEP (newdr)))
4649 tree base_address = DR_BASE_ADDRESS (newdr);
4650 tree off = DR_OFFSET (newdr);
4651 tree step = ssize_int (1);
4652 if (integer_zerop (off)
4653 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4655 off = TREE_OPERAND (base_address, 1);
4656 base_address = TREE_OPERAND (base_address, 0);
4658 STRIP_NOPS (off);
4659 if (TREE_CODE (off) == MULT_EXPR
4660 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4662 step = TREE_OPERAND (off, 1);
4663 off = TREE_OPERAND (off, 0);
4664 STRIP_NOPS (off);
4666 if (CONVERT_EXPR_P (off)
4667 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4668 < TYPE_PRECISION (TREE_TYPE (off))))
4669 off = TREE_OPERAND (off, 0);
4670 if (TREE_CODE (off) == SSA_NAME)
4672 gimple *def = SSA_NAME_DEF_STMT (off);
4673 /* Look through widening conversion. */
4674 if (is_gimple_assign (def)
4675 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4677 tree rhs1 = gimple_assign_rhs1 (def);
4678 if (TREE_CODE (rhs1) == SSA_NAME
4679 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4680 && (TYPE_PRECISION (TREE_TYPE (off))
4681 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4682 def = SSA_NAME_DEF_STMT (rhs1);
4684 if (is_gimple_call (def)
4685 && gimple_call_internal_p (def)
4686 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4688 tree arg = gimple_call_arg (def, 0);
4689 tree reft = TREE_TYPE (DR_REF (newdr));
4690 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4691 arg = SSA_NAME_VAR (arg);
4692 if (arg == loop->simduid
4693 /* For now. */
4694 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4696 DR_BASE_ADDRESS (newdr) = base_address;
4697 DR_OFFSET (newdr) = ssize_int (0);
4698 DR_STEP (newdr) = step;
4699 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4700 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4701 /* Mark as simd-lane access. */
4702 tree arg2 = gimple_call_arg (def, 1);
4703 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4704 free_data_ref (dr);
4705 datarefs->safe_push (newdr);
4706 if (dataref_groups)
4707 dataref_groups->safe_push (group_id);
4708 return opt_result::success ();
4713 free_data_ref (newdr);
4716 datarefs->safe_push (dr);
4717 if (dataref_groups)
4718 dataref_groups->safe_push (group_id);
4719 return opt_result::success ();
4722 /* Function vect_analyze_data_refs.
4724 Find all the data references in the loop or basic block.
4726 The general structure of the analysis of data refs in the vectorizer is as
4727 follows:
4728 1- vect_analyze_data_refs(loop/bb): call
4729 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4730 in the loop/bb and their dependences.
4731 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4732 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4733 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. */
4737 opt_result
4738 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4740 class loop *loop = NULL;
4741 unsigned int i;
4742 struct data_reference *dr;
4743 tree scalar_type;
4745 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4747 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4748 loop = LOOP_VINFO_LOOP (loop_vinfo);
4750 /* Go through the data-refs, check that the analysis succeeded. Update
4751 pointer from stmt_vec_info struct to DR and vectype. */
4753 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4754 FOR_EACH_VEC_ELT (datarefs, i, dr)
4756 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4757 poly_uint64 vf;
4759 gcc_assert (DR_REF (dr));
4760 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4761 gcc_assert (!stmt_info->dr_aux.dr);
4762 stmt_info->dr_aux.dr = dr;
4763 stmt_info->dr_aux.stmt = stmt_info;
4765 /* Check that analysis of the data-ref succeeded. */
4766 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4767 || !DR_STEP (dr))
4769 bool maybe_gather
4770 = DR_IS_READ (dr)
4771 && !TREE_THIS_VOLATILE (DR_REF (dr));
4772 bool maybe_scatter
4773 = DR_IS_WRITE (dr)
4774 && !TREE_THIS_VOLATILE (DR_REF (dr));
4776 /* If the target supports vector gather loads or scatter stores,
4777 see whether they can be used. */
4778 if (is_a <loop_vec_info> (vinfo)
4779 && !nested_in_vect_loop_p (loop, stmt_info))
4781 if (maybe_gather || maybe_scatter)
4783 if (maybe_gather)
4784 gatherscatter = GATHER;
4785 else
4786 gatherscatter = SCATTER;
4790 if (gatherscatter == SG_NONE)
4792 if (dump_enabled_p ())
4793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4794 "not vectorized: data ref analysis "
4795 "failed %G", stmt_info->stmt);
4796 if (is_a <bb_vec_info> (vinfo))
4798 /* In BB vectorization the ref can still participate
4799 in dependence analysis, we just can't vectorize it. */
4800 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4801 continue;
4803 return opt_result::failure_at (stmt_info->stmt,
4804 "not vectorized:"
4805 " data ref analysis failed: %G",
4806 stmt_info->stmt);
4810 /* See if this was detected as SIMD lane access. */
4811 if (dr->aux == (void *)-1
4812 || dr->aux == (void *)-2
4813 || dr->aux == (void *)-3
4814 || dr->aux == (void *)-4)
4816 if (nested_in_vect_loop_p (loop, stmt_info))
4817 return opt_result::failure_at (stmt_info->stmt,
4818 "not vectorized:"
4819 " data ref analysis failed: %G",
4820 stmt_info->stmt);
4821 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4822 = -(uintptr_t) dr->aux;
4825 tree base = get_base_address (DR_REF (dr));
4826 if (base && VAR_P (base) && DECL_NONALIASED (base))
4828 if (dump_enabled_p ())
4829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4830 "not vectorized: base object not addressable "
4831 "for stmt: %G", stmt_info->stmt);
4832 if (is_a <bb_vec_info> (vinfo))
4834 /* In BB vectorization the ref can still participate
4835 in dependence analysis, we just can't vectorize it. */
4836 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4837 continue;
4839 return opt_result::failure_at (stmt_info->stmt,
4840 "not vectorized: base object not"
4841 " addressable for stmt: %G",
4842 stmt_info->stmt);
4845 if (is_a <loop_vec_info> (vinfo)
4846 && DR_STEP (dr)
4847 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4849 if (nested_in_vect_loop_p (loop, stmt_info))
4850 return opt_result::failure_at (stmt_info->stmt,
4851 "not vectorized: "
4852 "not suitable for strided load %G",
4853 stmt_info->stmt);
4854 STMT_VINFO_STRIDED_P (stmt_info) = true;
4857 /* Update DR field in stmt_vec_info struct. */
4859 /* If the dataref is in an inner-loop of the loop that is considered
4860 for vectorization, we also want to analyze the access relative to
4861 the outer-loop (DR contains information only relative to the
4862 inner-most enclosing loop). We do that by building a reference to the
4863 first location accessed by the inner-loop, and analyze it relative to
4864 the outer-loop. */
4865 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4867 /* Build a reference to the first location accessed by the
4868 inner loop: *(BASE + INIT + OFFSET). By construction,
4869 this address must be invariant in the inner loop, so we
4870 can consider it as being used in the outer loop. */
4871 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4872 tree offset = unshare_expr (DR_OFFSET (dr));
4873 tree init = unshare_expr (DR_INIT (dr));
4874 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4875 init, offset);
4876 tree init_addr = fold_build_pointer_plus (base, init_offset);
4877 tree init_ref = build_fold_indirect_ref (init_addr);
4879 if (dump_enabled_p ())
4880 dump_printf_loc (MSG_NOTE, vect_location,
4881 "analyze in outer loop: %T\n", init_ref);
4883 opt_result res
4884 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4885 init_ref, loop, stmt_info->stmt);
4886 if (!res)
4887 /* dr_analyze_innermost already explained the failure. */
4888 return res;
4890 if (dump_enabled_p ())
4891 dump_printf_loc (MSG_NOTE, vect_location,
4892 "\touter base_address: %T\n"
4893 "\touter offset from base address: %T\n"
4894 "\touter constant offset from base address: %T\n"
4895 "\touter step: %T\n"
4896 "\touter base alignment: %d\n\n"
4897 "\touter base misalignment: %d\n"
4898 "\touter offset alignment: %d\n"
4899 "\touter step alignment: %d\n",
4900 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4901 STMT_VINFO_DR_OFFSET (stmt_info),
4902 STMT_VINFO_DR_INIT (stmt_info),
4903 STMT_VINFO_DR_STEP (stmt_info),
4904 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4905 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4906 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4907 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4910 /* Set vectype for STMT. */
4911 scalar_type = TREE_TYPE (DR_REF (dr));
4912 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4913 if (!vectype)
4915 if (dump_enabled_p ())
4917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4918 "not vectorized: no vectype for stmt: %G",
4919 stmt_info->stmt);
4920 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4921 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4922 scalar_type);
4923 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4926 if (is_a <bb_vec_info> (vinfo))
4928 /* No vector type is fine, the ref can still participate
4929 in dependence analysis, we just can't vectorize it. */
4930 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4931 continue;
4933 if (fatal)
4934 *fatal = false;
4935 return opt_result::failure_at (stmt_info->stmt,
4936 "not vectorized:"
4937 " no vectype for stmt: %G"
4938 " scalar_type: %T\n",
4939 stmt_info->stmt, scalar_type);
4941 else
4943 if (dump_enabled_p ())
4944 dump_printf_loc (MSG_NOTE, vect_location,
4945 "got vectype for stmt: %G%T\n",
4946 stmt_info->stmt, vectype);
4949 /* Adjust the minimal vectorization factor according to the
4950 vector type. */
4951 vf = TYPE_VECTOR_SUBPARTS (vectype);
4952 *min_vf = upper_bound (*min_vf, vf);
4954 /* Leave the BB vectorizer to pick the vector type later, based on
4955 the final dataref group size and SLP node size. */
4956 if (is_a <loop_vec_info> (vinfo))
4957 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4959 if (gatherscatter != SG_NONE)
4961 gather_scatter_info gs_info;
4962 if (!vect_check_gather_scatter (stmt_info,
4963 as_a <loop_vec_info> (vinfo),
4964 &gs_info)
4965 || !get_vectype_for_scalar_type (vinfo,
4966 TREE_TYPE (gs_info.offset)))
4968 if (fatal)
4969 *fatal = false;
4970 return opt_result::failure_at
4971 (stmt_info->stmt,
4972 (gatherscatter == GATHER)
4973 ? "not vectorized: not suitable for gather load %G"
4974 : "not vectorized: not suitable for scatter store %G",
4975 stmt_info->stmt);
4977 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4981 /* We used to stop processing and prune the list here. Verify we no
4982 longer need to. */
4983 gcc_assert (i == datarefs.length ());
4985 return opt_result::success ();
4989 /* Function vect_get_new_vect_var.
4991 Returns a name for a new variable. The current naming scheme uses a
4992 prefix such as "vect_", "vectp_" or "stmp_" (depending on the value of
4993 VAR_KIND) for vectorizer-generated variables, prepending it to NAME
4994 when NAME is provided. */
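/* Editorial usage sketch (names invented for illustration):
   vect_get_new_vect_var (ptr_type, vect_pointer_var, "a") yields a
   temporary named along the lines of "vectp_a", while passing a NULL
   NAME falls back to the bare prefix.  */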
4996 tree
4997 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4999 const char *prefix;
5000 tree new_vect_var;
5002 switch (var_kind)
5004 case vect_simple_var:
5005 prefix = "vect";
5006 break;
5007 case vect_scalar_var:
5008 prefix = "stmp";
5009 break;
5010 case vect_mask_var:
5011 prefix = "mask";
5012 break;
5013 case vect_pointer_var:
5014 prefix = "vectp";
5015 break;
5016 default:
5017 gcc_unreachable ();
5020 if (name)
5022 char* tmp = concat (prefix, "_", name, NULL);
5023 new_vect_var = create_tmp_reg (type, tmp);
5024 free (tmp);
5026 else
5027 new_vect_var = create_tmp_reg (type, prefix);
5029 return new_vect_var;
5032 /* Like vect_get_new_vect_var but return an SSA name. */
5034 tree
5035 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5037 const char *prefix;
5038 tree new_vect_var;
5040 switch (var_kind)
5042 case vect_simple_var:
5043 prefix = "vect";
5044 break;
5045 case vect_scalar_var:
5046 prefix = "stmp";
5047 break;
5048 case vect_pointer_var:
5049 prefix = "vectp";
5050 break;
5051 default:
5052 gcc_unreachable ();
5055 if (name)
5057 char* tmp = concat (prefix, "_", name, NULL);
5058 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5059 free (tmp);
5061 else
5062 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5064 return new_vect_var;
5067 /* Duplicate points-to info on NAME from DR_INFO. */
5069 static void
5070 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5072 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5073 /* DR_PTR_INFO is for a base SSA name, not including constant or
5074 variable offsets in the ref so its alignment info does not apply. */
5075 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5078 /* Function vect_create_addr_base_for_vector_ref.
5080 Create an expression that computes the address of the first memory location
5081 that will be accessed for a data reference.
5083 Input:
5084 STMT_INFO: The statement containing the data reference.
5085 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5086 OFFSET: Optional. If supplied, it is added to the initial address.
5087 LOOP: Specifies relative to which loop-nest the address should be computed.
5088 For example, when the dataref is in an inner-loop nested in an
5089 outer-loop that is now being vectorized, LOOP can be either the
5090 outer-loop, or the inner-loop. The first memory location accessed
5091 by the following dataref ('in' points to short):
5093 for (i=0; i<N; i++)
5094 for (j=0; j<M; j++)
5095 s += in[i+j]
5097 is as follows:
5098 if LOOP=i_loop: &in (relative to i_loop)
5099 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5101 Output:
5102 1. Return an SSA_NAME whose value is the address of the memory location of
5103 the first vector of the data reference.
5104 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5105 these statement(s) which define the returned SSA_NAME.
5107 FORNOW: We are only handling array accesses with step 1. */
5109 tree
5110 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5111 gimple_seq *new_stmt_list,
5112 tree offset)
5114 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5115 struct data_reference *dr = dr_info->dr;
5116 const char *base_name;
5117 tree addr_base;
5118 tree dest;
5119 gimple_seq seq = NULL;
5120 tree vect_ptr_type;
5121 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5122 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5124 tree data_ref_base = unshare_expr (drb->base_address);
5125 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5126 tree init = unshare_expr (drb->init);
5128 if (loop_vinfo)
5129 base_name = get_name (data_ref_base);
5130 else
5132 base_offset = ssize_int (0);
5133 init = ssize_int (0);
5134 base_name = get_name (DR_REF (dr));
5137 /* Create base_offset */
5138 base_offset = size_binop (PLUS_EXPR,
5139 fold_convert (sizetype, base_offset),
5140 fold_convert (sizetype, init));
5142 if (offset)
5144 offset = fold_convert (sizetype, offset);
5145 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5146 base_offset, offset);
5149 /* base + base_offset */
5150 if (loop_vinfo)
5151 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5152 else
5153 addr_base = build1 (ADDR_EXPR,
5154 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5155 /* Strip zero offset components since we don't need
5156 them and they can confuse late diagnostics if
5157 we CSE them wrongly. See PR106904 for example. */
5158 unshare_expr (strip_zero_offset_components
5159 (DR_REF (dr))));
5161 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5162 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5163 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5164 gimple_seq_add_seq (new_stmt_list, seq);
5166 if (DR_PTR_INFO (dr)
5167 && TREE_CODE (addr_base) == SSA_NAME
5168 /* We should only duplicate pointer info to newly created SSA names. */
5169 && SSA_NAME_VAR (addr_base) == dest)
5171 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5172 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5175 if (dump_enabled_p ())
5176 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5178 return addr_base;
5182 /* Function vect_create_data_ref_ptr.
5184 Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first
5185 location accessed in the loop by STMT_INFO, along with the def-use update
5186 chain to appropriately advance the pointer through the loop iterations.
5187 Also set aliasing information for the pointer. This pointer is used by
5188 the callers to this function to create a memory reference expression for
5189 vector load/store access.
5191 Input:
5192 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5193 GIMPLE_ASSIGN <name, data-ref> or
5194 GIMPLE_ASSIGN <data-ref, name>.
5195 2. AGGR_TYPE: the type of the reference, which should be either a vector
5196 or an array.
5197 3. AT_LOOP: the loop where the vector memref is to be created.
5198 4. OFFSET (optional): a byte offset to be added to the initial address
5199 accessed by the data-ref in STMT_INFO.
5200 5. BSI: location where the new stmts are to be placed if there is no loop
5201 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5202 pointing to the initial address.
5203    7. IV_STEP (optional, defaults to NULL): the amount that should be added
5204 to the IV during each iteration of the loop. NULL says to move
5205 by one copy of AGGR_TYPE up or down, depending on the step of the
5206 data reference.
5208 Output:
5209 1. Declare a new ptr to vector_type, and have it point to the base of the
5210       data reference (initial address accessed by the data reference).
5211 For example, for vector of type V8HI, the following code is generated:
5213 v8hi *ap;
5214 ap = (v8hi *)initial_address;
5216 if OFFSET is not supplied:
5217 initial_address = &a[init];
5218 if OFFSET is supplied:
5219 initial_address = &a[init] + OFFSET;
5223 Return the initial_address in INITIAL_ADDRESS.
5225 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5226 update the pointer in each iteration of the loop.
5228 Return the increment stmt that updates the pointer in PTR_INCR.
5230 3. Return the pointer. */
5232 tree
5233 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5234 tree aggr_type, class loop *at_loop, tree offset,
5235 tree *initial_address, gimple_stmt_iterator *gsi,
5236 gimple **ptr_incr, bool only_init,
5237 tree iv_step)
5239 const char *base_name;
5240 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5241 class loop *loop = NULL;
5242 bool nested_in_vect_loop = false;
5243 class loop *containing_loop = NULL;
5244 tree aggr_ptr_type;
5245 tree aggr_ptr;
5246 tree new_temp;
5247 gimple_seq new_stmt_list = NULL;
5248 edge pe = NULL;
5249 basic_block new_bb;
5250 tree aggr_ptr_init;
5251 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5252 struct data_reference *dr = dr_info->dr;
5253 tree aptr;
5254 gimple_stmt_iterator incr_gsi;
5255 bool insert_after;
5256 tree indx_before_incr, indx_after_incr;
5257 gimple *incr;
5258 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5260 gcc_assert (iv_step != NULL_TREE
5261 || TREE_CODE (aggr_type) == ARRAY_TYPE
5262 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5264 if (loop_vinfo)
5266 loop = LOOP_VINFO_LOOP (loop_vinfo);
5267 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5268 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5269 pe = loop_preheader_edge (loop);
5271 else
5273 gcc_assert (bb_vinfo);
5274 only_init = true;
5275 *ptr_incr = NULL;
5278 /* Create an expression for the first address accessed by this load
5279 in LOOP. */
5280 base_name = get_name (DR_BASE_ADDRESS (dr));
5282 if (dump_enabled_p ())
5284 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5285 dump_printf_loc (MSG_NOTE, vect_location,
5286 "create %s-pointer variable to type: %T",
5287 get_tree_code_name (TREE_CODE (aggr_type)),
5288 aggr_type);
5289 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5290 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5291 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5292 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5293 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5294 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5295 else
5296 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5297 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5300   /* (1) Create the new aggregate-pointer variable.
5301      Vector and array types inherit the alias set of their component
5302      type by default, so we need to use a ref-all pointer if the data
5303      reference does not conflict with the created aggregate data
5304      reference because it is not addressable.  */
5305 bool need_ref_all = false;
5306 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5307 get_alias_set (DR_REF (dr))))
5308 need_ref_all = true;
5309 /* Likewise for any of the data references in the stmt group. */
5310 else if (DR_GROUP_SIZE (stmt_info) > 1)
5312 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5315 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5316 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5317 get_alias_set (DR_REF (sdr))))
5319 need_ref_all = true;
5320 break;
5322 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5324 while (sinfo);
5326 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5327 need_ref_all);
5328 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5331 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5332 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5333 def-use update cycles for the pointer: one relative to the outer-loop
5334 (LOOP), which is what steps (3) and (4) below do. The other is relative
5335 to the inner-loop (which is the inner-most loop containing the dataref),
5336      and this is done by step (5) below.
5338 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5339 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5340 redundant. Steps (3),(4) create the following:
5342 vp0 = &base_addr;
5343 LOOP: vp1 = phi(vp0,vp2)
5346 vp2 = vp1 + step
5347 goto LOOP
5349 If there is an inner-loop nested in loop, then step (5) will also be
5350 applied, and an additional update in the inner-loop will be created:
5352 vp0 = &base_addr;
5353 LOOP: vp1 = phi(vp0,vp2)
5355 inner: vp3 = phi(vp1,vp4)
5356 vp4 = vp3 + inner_step
5357 if () goto inner
5359 vp2 = vp1 + step
5360 if () goto LOOP */
5362 /* (2) Calculate the initial address of the aggregate-pointer, and set
5363 the aggregate-pointer to point to it before the loop. */
5365   /* Create &(base[init_val] + offset) in the loop preheader.  */
5367 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5368 stmt_info, &new_stmt_list,
5369 offset);
5370 if (new_stmt_list)
5372 if (pe)
5374 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5375 gcc_assert (!new_bb);
5377 else
5378 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5381 *initial_address = new_temp;
5382 aggr_ptr_init = new_temp;
5384 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5385 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5386 inner-loop nested in LOOP (during outer-loop vectorization). */
5388 /* No update in loop is required. */
5389 if (only_init && (!loop_vinfo || at_loop == loop))
5390 aptr = aggr_ptr_init;
5391 else
5393 /* Accesses to invariant addresses should be handled specially
5394 by the caller. */
5395 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5396 gcc_assert (!integer_zerop (step));
5398 if (iv_step == NULL_TREE)
5400 /* The step of the aggregate pointer is the type size,
5401 negated for downward accesses. */
5402 iv_step = TYPE_SIZE_UNIT (aggr_type);
5403 if (tree_int_cst_sgn (step) == -1)
5404 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5407 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5409 create_iv (aggr_ptr_init, PLUS_EXPR,
5410 fold_convert (aggr_ptr_type, iv_step),
5411 aggr_ptr, loop, &incr_gsi, insert_after,
5412 &indx_before_incr, &indx_after_incr);
5413 incr = gsi_stmt (incr_gsi);
5415 /* Copy the points-to information if it exists. */
5416 if (DR_PTR_INFO (dr))
5418 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5419 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5421 if (ptr_incr)
5422 *ptr_incr = incr;
5424 aptr = indx_before_incr;
5427 if (!nested_in_vect_loop || only_init)
5428 return aptr;
5431 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5432      nested in LOOP, if it exists.  */
5434 gcc_assert (nested_in_vect_loop);
5435 if (!only_init)
5437 standard_iv_increment_position (containing_loop, &incr_gsi,
5438 &insert_after);
5439 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5440 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5441 &indx_before_incr, &indx_after_incr);
5442 incr = gsi_stmt (incr_gsi);
5444 /* Copy the points-to information if it exists. */
5445 if (DR_PTR_INFO (dr))
5447 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5448 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5450 if (ptr_incr)
5451 *ptr_incr = incr;
5453 return indx_before_incr;
5455 else
5456 gcc_unreachable ();
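/* A minimal sketch of what the vp0/vp1/vp2 update cycle above corresponds to
   in scalar terms, assuming 4-element int vectors and a unit-stride array;
   this is an illustration, not the emitted GIMPLE:

     #include <stdio.h>

     int main (void)
     {
       int a[16];
       for (int i = 0; i < 16; i++)
         a[i] = i;
       int *vp = &a[0];                  // vp0 = &base_addr
       int sum = 0;
       for (int i = 0; i < 16; i += 4)   // one vector per iteration
         {
           for (int k = 0; k < 4; k++)   // stands in for one vector load
             sum += vp[k];
           vp += 4;                      // vp2 = vp1 + step
         }
       printf ("%d\n", sum);             // prints 120
       return 0;
     }
*/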
5460 /* Function bump_vector_ptr.
5462 Increment a pointer (to a vector type) by vector-size. If requested,
5463 i.e. if PTR-INCR is given, then also connect the new increment stmt
5464 to the existing def-use update-chain of the pointer, by modifying
5465 the PTR_INCR as illustrated below:
5467 The pointer def-use update-chain before this function:
5468 DATAREF_PTR = phi (p_0, p_2)
5469 ....
5470 PTR_INCR: p_2 = DATAREF_PTR + step
5472 The pointer def-use update-chain after this function:
5473 DATAREF_PTR = phi (p_0, p_2)
5474 ....
5475 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5476 ....
5477 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5479 Input:
5480 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5481 in the loop.
5482 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5483 the loop. The increment amount across iterations is expected
5484 to be vector_size.
5485 BSI - location where the new update stmt is to be placed.
5486 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5487 BUMP - optional. The offset by which to bump the pointer. If not given,
5488 the offset is assumed to be vector_size.
5490 Output: Return NEW_DATAREF_PTR as illustrated above.
5494 tree
5495 bump_vector_ptr (vec_info *vinfo,
5496 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5497 stmt_vec_info stmt_info, tree bump)
5499 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5500 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5501 tree update = TYPE_SIZE_UNIT (vectype);
5502 gimple *incr_stmt;
5503 ssa_op_iter iter;
5504 use_operand_p use_p;
5505 tree new_dataref_ptr;
5507 if (bump)
5508 update = bump;
5510 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5511 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5512 else if (is_gimple_min_invariant (dataref_ptr))
5513     /* When possible avoid emitting a separate increment stmt that would
5514        force the addressed object to become addressable.  */
5515 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5516 fold_build2 (MEM_REF,
5517 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5518 dataref_ptr,
5519 fold_convert (ptr_type_node, update)));
5520 else
5521 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5522 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5523 dataref_ptr, update);
5524 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5525   /* Fold the increment, avoiding excessive use-def chains that would
5526      lead to compile-time issues for passes until the next forwprop
5527      pass, which would do this as well.  */
5528 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5529 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5531 incr_stmt = gsi_stmt (fold_gsi);
5532 update_stmt (incr_stmt);
5535 /* Copy the points-to information if it exists. */
5536 if (DR_PTR_INFO (dr))
5538 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5539 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5542 if (!ptr_incr)
5543 return new_dataref_ptr;
5545 /* Update the vector-pointer's cross-iteration increment. */
5546 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5548 tree use = USE_FROM_PTR (use_p);
5550 if (use == dataref_ptr)
5551 SET_USE (use_p, new_dataref_ptr);
5552 else
5553 gcc_assert (operand_equal_p (use, update, 0));
5556 return new_dataref_ptr;
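/* Illustrative sketch only (the vector size of 16 bytes is an assumption):
   in scalar terms the bump adds BUMP bytes to the running pointer, and the
   pre-existing increment then continues from the new pointer:

     #include <stdio.h>

     int main (void)
     {
       short buf[32];
       const long bump = 16;                        // assumed vector size in bytes
       char *dataref_ptr = (char *) buf;
       char *new_dataref_ptr = dataref_ptr + bump;  // NEW_DATAREF_PTR = DATAREF_PTR + BUMP
       char *p_2 = new_dataref_ptr + bump;          // PTR_INCR now steps from the new pointer
       printf ("%td %td\n", new_dataref_ptr - (char *) buf,
               p_2 - (char *) buf);                 // prints "16 32"
       return 0;
     }
*/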
5560 /* Copy memory reference info such as base/clique from the SRC reference
5561 to the DEST MEM_REF. */
5563 void
5564 vect_copy_ref_info (tree dest, tree src)
5566 if (TREE_CODE (dest) != MEM_REF)
5567 return;
5569 tree src_base = src;
5570 while (handled_component_p (src_base))
5571 src_base = TREE_OPERAND (src_base, 0);
5572 if (TREE_CODE (src_base) != MEM_REF
5573 && TREE_CODE (src_base) != TARGET_MEM_REF)
5574 return;
5576 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5577 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5581 /* Function vect_create_destination_var.
5583 Create a new temporary of type VECTYPE. */
5585 tree
5586 vect_create_destination_var (tree scalar_dest, tree vectype)
5588 tree vec_dest;
5589 const char *name;
5590 char *new_name;
5591 tree type;
5592 enum vect_var_kind kind;
5594 kind = vectype
5595 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5596 ? vect_mask_var
5597 : vect_simple_var
5598 : vect_scalar_var;
5599 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5601 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5603 name = get_name (scalar_dest);
5604 if (name)
5605 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5606 else
5607 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5608 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5609 free (new_name);
5611 return vec_dest;
5614 /* Function vect_grouped_store_supported.
5616 Returns TRUE if interleave high and interleave low permutations
5617 are supported, and FALSE otherwise. */
5619 bool
5620 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5622 machine_mode mode = TYPE_MODE (vectype);
5624 /* vect_permute_store_chain requires the group size to be equal to 3 or
5625 be a power of two. */
5626 if (count != 3 && exact_log2 (count) == -1)
5628 if (dump_enabled_p ())
5629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5630 "the size of the group of accesses"
5631 " is not a power of 2 or not eqaul to 3\n");
5632 return false;
5635 /* Check that the permutation is supported. */
5636 if (VECTOR_MODE_P (mode))
5638 unsigned int i;
5639 if (count == 3)
5641 unsigned int j0 = 0, j1 = 0, j2 = 0;
5642 unsigned int i, j;
5644 unsigned int nelt;
5645 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "cannot handle groups of 3 stores for"
5650 " variable-length vectors\n");
5651 return false;
5654 vec_perm_builder sel (nelt, nelt, 1);
5655 sel.quick_grow (nelt);
5656 vec_perm_indices indices;
5657 for (j = 0; j < 3; j++)
5659 int nelt0 = ((3 - j) * nelt) % 3;
5660 int nelt1 = ((3 - j) * nelt + 1) % 3;
5661 int nelt2 = ((3 - j) * nelt + 2) % 3;
5662 for (i = 0; i < nelt; i++)
5664 if (3 * i + nelt0 < nelt)
5665 sel[3 * i + nelt0] = j0++;
5666 if (3 * i + nelt1 < nelt)
5667 sel[3 * i + nelt1] = nelt + j1++;
5668 if (3 * i + nelt2 < nelt)
5669 sel[3 * i + nelt2] = 0;
5671 indices.new_vector (sel, 2, nelt);
5672 if (!can_vec_perm_const_p (mode, mode, indices))
5674 if (dump_enabled_p ())
5675 dump_printf (MSG_MISSED_OPTIMIZATION,
5676 "permutation op not supported by target.\n");
5677 return false;
5680 for (i = 0; i < nelt; i++)
5682 if (3 * i + nelt0 < nelt)
5683 sel[3 * i + nelt0] = 3 * i + nelt0;
5684 if (3 * i + nelt1 < nelt)
5685 sel[3 * i + nelt1] = 3 * i + nelt1;
5686 if (3 * i + nelt2 < nelt)
5687 sel[3 * i + nelt2] = nelt + j2++;
5689 indices.new_vector (sel, 2, nelt);
5690 if (!can_vec_perm_const_p (mode, mode, indices))
5692 if (dump_enabled_p ())
5693 dump_printf (MSG_MISSED_OPTIMIZATION,
5694 "permutation op not supported by target.\n");
5695 return false;
5698 return true;
5700 else
5702       /* If length is not equal to 3 then only a power of 2 is supported.  */
5703 gcc_assert (pow2p_hwi (count));
5704 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5706 /* The encoding has 2 interleaved stepped patterns. */
5707       if (!multiple_p (nelt, 2))
5708 return false;
5709 vec_perm_builder sel (nelt, 2, 3);
5710 sel.quick_grow (6);
5711 for (i = 0; i < 3; i++)
5713 sel[i * 2] = i;
5714 sel[i * 2 + 1] = i + nelt;
5716 vec_perm_indices indices (sel, 2, nelt);
5717 if (can_vec_perm_const_p (mode, mode, indices))
5719 for (i = 0; i < 6; i++)
5720 sel[i] += exact_div (nelt, 2);
5721 indices.new_vector (sel, 2, nelt);
5722 if (can_vec_perm_const_p (mode, mode, indices))
5723 return true;
5728 if (dump_enabled_p ())
5729 dump_printf (MSG_MISSED_OPTIMIZATION,
5730 "permutation op not supported by target.\n");
5731 return false;
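/* A standalone sketch of the interleave masks checked by the power-of-two
   branch above, assuming 8 elements per vector; indices refer to the
   2*nelt-element concatenation of the two permute inputs:

     #include <stdio.h>

     int main (void)
     {
       const unsigned nelt = 8;          // assumed number of vector elements
       printf ("interleave_high mask:");
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", i, nelt + i);                       // 0 8 1 9 2 10 3 11
       printf ("\ninterleave_low mask:");
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", nelt / 2 + i, nelt + nelt / 2 + i); // 4 12 5 13 6 14 7 15
       printf ("\n");
       return 0;
     }
*/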
5734 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5735 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5737 internal_fn
5738 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5739 bool masked_p)
5741 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5742 vec_mask_len_store_lanes_optab, vectype,
5743 count))
5744 return IFN_MASK_LEN_STORE_LANES;
5745 else if (masked_p)
5747 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5748 vec_mask_store_lanes_optab, vectype,
5749 count))
5750 return IFN_MASK_STORE_LANES;
5752 else
5754 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5755 vec_store_lanes_optab, vectype, count))
5756 return IFN_STORE_LANES;
5758 return IFN_LAST;
5762 /* Function vect_permute_store_chain.
5764 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5765 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5766 the data correctly for the stores. Return the final references for stores
5767 in RESULT_CHAIN.
5769 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5770 The input is 4 vectors each containing 8 elements. We assign a number to
5771 each element, the input sequence is:
5773 1st vec: 0 1 2 3 4 5 6 7
5774 2nd vec: 8 9 10 11 12 13 14 15
5775 3rd vec: 16 17 18 19 20 21 22 23
5776 4th vec: 24 25 26 27 28 29 30 31
5778 The output sequence should be:
5780 1st vec: 0 8 16 24 1 9 17 25
5781 2nd vec: 2 10 18 26 3 11 19 27
5782    3rd vec:   4 12 20 28  5 13 21 29
5783 4th vec: 6 14 22 30 7 15 23 31
5785 i.e., we interleave the contents of the four vectors in their order.
5787 We use interleave_high/low instructions to create such output. The input of
5788 each interleave_high/low operation is two vectors:
5789 1st vec 2nd vec
5790 0 1 2 3 4 5 6 7
5791 the even elements of the result vector are obtained left-to-right from the
5792 high/low elements of the first vector. The odd elements of the result are
5793 obtained left-to-right from the high/low elements of the second vector.
5794 The output of interleave_high will be: 0 4 1 5
5795 and of interleave_low: 2 6 3 7
5798 The permutation is done in log LENGTH stages. In each stage interleave_high
5799 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5800 where the first argument is taken from the first half of DR_CHAIN and the
5801    second argument from its second half.
5802 In our example,
5804 I1: interleave_high (1st vec, 3rd vec)
5805 I2: interleave_low (1st vec, 3rd vec)
5806 I3: interleave_high (2nd vec, 4th vec)
5807 I4: interleave_low (2nd vec, 4th vec)
5809 The output for the first stage is:
5811 I1: 0 16 1 17 2 18 3 19
5812 I2: 4 20 5 21 6 22 7 23
5813 I3: 8 24 9 25 10 26 11 27
5814 I4: 12 28 13 29 14 30 15 31
5816 The output of the second stage, i.e. the final result is:
5818 I1: 0 8 16 24 1 9 17 25
5819 I2: 2 10 18 26 3 11 19 27
5820    I3:  4 12 20 28  5 13 21 29
5821 I4: 6 14 22 30 7 15 23 31. */
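/* The example above can be checked with a scalar model of the full
   interleave; this standalone sketch reproduces the documented output
   sequence (it is not the vectorized code itself):

     #include <stdio.h>

     int main (void)
     {
       int in[4][8], out[4][8];
       for (int v = 0; v < 4; v++)          // 4 input vectors of 8 elements
         for (int e = 0; e < 8; e++)
           in[v][e] = v * 8 + e;
       // Full interleave: element e of input vector v moves to position
       // e * 4 + v of the concatenated output.
       for (int v = 0; v < 4; v++)
         for (int e = 0; e < 8; e++)
           {
             int pos = e * 4 + v;
             out[pos / 8][pos % 8] = in[v][e];
           }
       for (int v = 0; v < 4; v++)          // prints the I1..I4 rows above
         {
           for (int e = 0; e < 8; e++)
             printf (" %d", out[v][e]);
           printf ("\n");
         }
       return 0;
     }
*/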
5823 void
5824 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5825 unsigned int length,
5826 stmt_vec_info stmt_info,
5827 gimple_stmt_iterator *gsi,
5828 vec<tree> *result_chain)
5830 tree vect1, vect2, high, low;
5831 gimple *perm_stmt;
5832 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5833 tree perm_mask_low, perm_mask_high;
5834 tree data_ref;
5835 tree perm3_mask_low, perm3_mask_high;
5836 unsigned int i, j, n, log_length = exact_log2 (length);
5838 result_chain->quick_grow (length);
5839 memcpy (result_chain->address (), dr_chain.address (),
5840 length * sizeof (tree));
5842 if (length == 3)
5844 /* vect_grouped_store_supported ensures that this is constant. */
5845 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5846 unsigned int j0 = 0, j1 = 0, j2 = 0;
5848 vec_perm_builder sel (nelt, nelt, 1);
5849 sel.quick_grow (nelt);
5850 vec_perm_indices indices;
5851 for (j = 0; j < 3; j++)
5853 int nelt0 = ((3 - j) * nelt) % 3;
5854 int nelt1 = ((3 - j) * nelt + 1) % 3;
5855 int nelt2 = ((3 - j) * nelt + 2) % 3;
5857 for (i = 0; i < nelt; i++)
5859 if (3 * i + nelt0 < nelt)
5860 sel[3 * i + nelt0] = j0++;
5861 if (3 * i + nelt1 < nelt)
5862 sel[3 * i + nelt1] = nelt + j1++;
5863 if (3 * i + nelt2 < nelt)
5864 sel[3 * i + nelt2] = 0;
5866 indices.new_vector (sel, 2, nelt);
5867 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5869 for (i = 0; i < nelt; i++)
5871 if (3 * i + nelt0 < nelt)
5872 sel[3 * i + nelt0] = 3 * i + nelt0;
5873 if (3 * i + nelt1 < nelt)
5874 sel[3 * i + nelt1] = 3 * i + nelt1;
5875 if (3 * i + nelt2 < nelt)
5876 sel[3 * i + nelt2] = nelt + j2++;
5878 indices.new_vector (sel, 2, nelt);
5879 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5881 vect1 = dr_chain[0];
5882 vect2 = dr_chain[1];
5884 /* Create interleaving stmt:
5885 low = VEC_PERM_EXPR <vect1, vect2,
5886 {j, nelt, *, j + 1, nelt + j + 1, *,
5887 j + 2, nelt + j + 2, *, ...}> */
5888 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5889 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5890 vect2, perm3_mask_low);
5891 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5893 vect1 = data_ref;
5894 vect2 = dr_chain[2];
5895 /* Create interleaving stmt:
5896          high = VEC_PERM_EXPR <vect1, vect2,
5897 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5898 6, 7, nelt + j + 2, ...}> */
5899 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5900 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5901 vect2, perm3_mask_high);
5902 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5903 (*result_chain)[j] = data_ref;
5906 else
5908       /* If length is not equal to 3 then only a power of 2 is supported.  */
5909 gcc_assert (pow2p_hwi (length));
5911 /* The encoding has 2 interleaved stepped patterns. */
5912 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5913 vec_perm_builder sel (nelt, 2, 3);
5914 sel.quick_grow (6);
5915 for (i = 0; i < 3; i++)
5917 sel[i * 2] = i;
5918 sel[i * 2 + 1] = i + nelt;
5920 vec_perm_indices indices (sel, 2, nelt);
5921 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5923 for (i = 0; i < 6; i++)
5924 sel[i] += exact_div (nelt, 2);
5925 indices.new_vector (sel, 2, nelt);
5926 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0, n = log_length; i < n; i++)
5930 for (j = 0; j < length/2; j++)
5932 vect1 = dr_chain[j];
5933 vect2 = dr_chain[j+length/2];
5935 /* Create interleaving stmt:
5936 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5937 ...}> */
5938 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5939 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5940 vect2, perm_mask_high);
5941 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5942 (*result_chain)[2*j] = high;
5944 /* Create interleaving stmt:
5945 low = VEC_PERM_EXPR <vect1, vect2,
5946 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5947 ...}> */
5948 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5949 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5950 vect2, perm_mask_low);
5951 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5952 (*result_chain)[2*j+1] = low;
5954 memcpy (dr_chain.address (), result_chain->address (),
5955 length * sizeof (tree));
5960 /* Function vect_setup_realignment.
5962 This function is called when vectorizing an unaligned load using
5963 the dr_explicit_realign[_optimized] scheme.
5964 This function generates the following code at the loop prolog:
5966 p = initial_addr;
5967 x msq_init = *(floor(p)); # prolog load
5968 realignment_token = call target_builtin;
5969 loop:
5970 x msq = phi (msq_init, ---)
5972 The stmts marked with x are generated only for the case of
5973 dr_explicit_realign_optimized.
5975 The code above sets up a new (vector) pointer, pointing to the first
5976 location accessed by STMT_INFO, and a "floor-aligned" load using that
5977 pointer. It also generates code to compute the "realignment-token"
5978 (if the relevant target hook was defined), and creates a phi-node at the
5979 loop-header bb whose arguments are the result of the prolog-load (created
5980 by this function) and the result of a load that takes place in the loop
5981 (to be created by the caller to this function).
5983 For the case of dr_explicit_realign_optimized:
5984 The caller to this function uses the phi-result (msq) to create the
5985 realignment code inside the loop, and sets up the missing phi argument,
5986 as follows:
5987 loop:
5988 msq = phi (msq_init, lsq)
5989 lsq = *(floor(p')); # load in loop
5990 result = realign_load (msq, lsq, realignment_token);
5992 For the case of dr_explicit_realign:
5993 loop:
5994 msq = *(floor(p)); # load in loop
5995 p' = p + (VS-1);
5996 lsq = *(floor(p')); # load in loop
5997 result = realign_load (msq, lsq, realignment_token);
5999 Input:
6000 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6001 a memory location that may be unaligned.
6002 BSI - place where new code is to be inserted.
6003 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6004 is used.
6006 Output:
6007 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6008 target hook, if defined.
6009 Return value - the result of the loop-header phi node. */
6011 tree
6012 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6013 gimple_stmt_iterator *gsi, tree *realignment_token,
6014 enum dr_alignment_support alignment_support_scheme,
6015 tree init_addr,
6016 class loop **at_loop)
6018 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6019 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6020 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6021 struct data_reference *dr = dr_info->dr;
6022 class loop *loop = NULL;
6023 edge pe = NULL;
6024 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6025 tree vec_dest;
6026 gimple *inc;
6027 tree ptr;
6028 tree data_ref;
6029 basic_block new_bb;
6030 tree msq_init = NULL_TREE;
6031 tree new_temp;
6032 gphi *phi_stmt;
6033 tree msq = NULL_TREE;
6034 gimple_seq stmts = NULL;
6035 bool compute_in_loop = false;
6036 bool nested_in_vect_loop = false;
6037 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6038 class loop *loop_for_initial_load = NULL;
6040 if (loop_vinfo)
6042 loop = LOOP_VINFO_LOOP (loop_vinfo);
6043 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6046 gcc_assert (alignment_support_scheme == dr_explicit_realign
6047 || alignment_support_scheme == dr_explicit_realign_optimized);
6049 /* We need to generate three things:
6050 1. the misalignment computation
6051 2. the extra vector load (for the optimized realignment scheme).
6052 3. the phi node for the two vectors from which the realignment is
6053 done (for the optimized realignment scheme). */
6055 /* 1. Determine where to generate the misalignment computation.
6057 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6058 calculation will be generated by this function, outside the loop (in the
6059 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6060 caller, inside the loop.
6062 Background: If the misalignment remains fixed throughout the iterations of
6063 the loop, then both realignment schemes are applicable, and also the
6064 misalignment computation can be done outside LOOP. This is because we are
6065 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6066 are a multiple of VS (the Vector Size), and therefore the misalignment in
6067 different vectorized LOOP iterations is always the same.
6068 The problem arises only if the memory access is in an inner-loop nested
6069 inside LOOP, which is now being vectorized using outer-loop vectorization.
6070 This is the only case when the misalignment of the memory access may not
6071 remain fixed throughout the iterations of the inner-loop (as explained in
6072 detail in vect_supportable_dr_alignment). In this case, not only is the
6073 optimized realignment scheme not applicable, but also the misalignment
6074 computation (and generation of the realignment token that is passed to
6075    REALIGN_LOAD) has to be done inside the loop.
6077 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6078 or not, which in turn determines if the misalignment is computed inside
6079 the inner-loop, or outside LOOP. */
6081 if (init_addr != NULL_TREE || !loop_vinfo)
6083 compute_in_loop = true;
6084 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6088 /* 2. Determine where to generate the extra vector load.
6090 For the optimized realignment scheme, instead of generating two vector
6091 loads in each iteration, we generate a single extra vector load in the
6092 preheader of the loop, and in each iteration reuse the result of the
6093 vector load from the previous iteration. In case the memory access is in
6094 an inner-loop nested inside LOOP, which is now being vectorized using
6095 outer-loop vectorization, we need to determine whether this initial vector
6096 load should be generated at the preheader of the inner-loop, or can be
6097 generated at the preheader of LOOP. If the memory access has no evolution
6098 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6099 to be generated inside LOOP (in the preheader of the inner-loop). */
6101 if (nested_in_vect_loop)
6103 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6104 bool invariant_in_outerloop =
6105 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6106 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6108 else
6109 loop_for_initial_load = loop;
6110 if (at_loop)
6111 *at_loop = loop_for_initial_load;
6113 tree vuse = NULL_TREE;
6114 if (loop_for_initial_load)
6116 pe = loop_preheader_edge (loop_for_initial_load);
6117 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6118 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6120 if (!vuse)
6121 vuse = gimple_vuse (gsi_stmt (*gsi));
6123 /* 3. For the case of the optimized realignment, create the first vector
6124 load at the loop preheader. */
6126 if (alignment_support_scheme == dr_explicit_realign_optimized)
6128 /* Create msq_init = *(floor(p1)) in the loop preheader */
6129 gassign *new_stmt;
6131 gcc_assert (!compute_in_loop);
6132 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6133 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6134 loop_for_initial_load, NULL_TREE,
6135 &init_addr, NULL, &inc, true);
6136 if (TREE_CODE (ptr) == SSA_NAME)
6137 new_temp = copy_ssa_name (ptr);
6138 else
6139 new_temp = make_ssa_name (TREE_TYPE (ptr));
6140 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6141 tree type = TREE_TYPE (ptr);
6142 new_stmt = gimple_build_assign
6143 (new_temp, BIT_AND_EXPR, ptr,
6144 fold_build2 (MINUS_EXPR, type,
6145 build_int_cst (type, 0),
6146 build_int_cst (type, align)));
6147 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6148 gcc_assert (!new_bb);
6149 data_ref
6150 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6151 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6152 vect_copy_ref_info (data_ref, DR_REF (dr));
6153 new_stmt = gimple_build_assign (vec_dest, data_ref);
6154 new_temp = make_ssa_name (vec_dest, new_stmt);
6155 gimple_assign_set_lhs (new_stmt, new_temp);
6156 gimple_set_vuse (new_stmt, vuse);
6157 if (pe)
6159 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6160 gcc_assert (!new_bb);
6162 else
6163 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6165 msq_init = gimple_assign_lhs (new_stmt);
6168 /* 4. Create realignment token using a target builtin, if available.
6169 It is done either inside the containing loop, or before LOOP (as
6170 determined above). */
6172 if (targetm.vectorize.builtin_mask_for_load)
6174 gcall *new_stmt;
6175 tree builtin_decl;
6177       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
6178 if (!init_addr)
6180 /* Generate the INIT_ADDR computation outside LOOP. */
6181 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6182 stmt_info, &stmts,
6183 NULL_TREE);
6184 if (loop)
6186 pe = loop_preheader_edge (loop);
6187 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6188 gcc_assert (!new_bb);
6190 else
6191 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6194 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6195 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6196 vec_dest =
6197 vect_create_destination_var (scalar_dest,
6198 gimple_call_return_type (new_stmt));
6199 new_temp = make_ssa_name (vec_dest, new_stmt);
6200 gimple_call_set_lhs (new_stmt, new_temp);
6202 if (compute_in_loop)
6203 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6204 else
6206 /* Generate the misalignment computation outside LOOP. */
6207 pe = loop_preheader_edge (loop);
6208 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6209 gcc_assert (!new_bb);
6212 *realignment_token = gimple_call_lhs (new_stmt);
6214 /* The result of the CALL_EXPR to this builtin is determined from
6215 the value of the parameter and no global variables are touched
6216 which makes the builtin a "const" function. Requiring the
6217 builtin to have the "const" attribute makes it unnecessary
6218 to call mark_call_clobbered. */
6219 gcc_assert (TREE_READONLY (builtin_decl));
6222 if (alignment_support_scheme == dr_explicit_realign)
6223 return msq;
6225 gcc_assert (!compute_in_loop);
6226 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6229 /* 5. Create msq = phi <msq_init, lsq> in loop */
6231 pe = loop_preheader_edge (containing_loop);
6232 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6233 msq = make_ssa_name (vec_dest);
6234 phi_stmt = create_phi_node (msq, containing_loop->header);
6235 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6237 return msq;
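/* A small sketch of the floor(p) computation and misalignment byte count
   that both realignment schemes rely on; the 16-byte vector alignment is an
   assumption and the target-specific realign_load combine itself is not
   modeled here:

     #include <stdint.h>
     #include <stdio.h>

     int main (void)
     {
       int a[32];
       int *p = &a[3];                             // possibly unaligned element
       const uintptr_t align = 16;                 // assumed vector alignment
       uintptr_t floor_p = (uintptr_t) p & -align; // floor (p): aligned load address
       unsigned misalign = (unsigned) ((uintptr_t) p & (align - 1));
       // floor_p % align is always 0; misalign depends on where `a' lands.
       printf ("floor(p) %% align = %lu, misalignment = %u bytes\n",
               (unsigned long) (floor_p % align), misalign);
       return 0;
     }
*/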
6241 /* Function vect_grouped_load_supported.
6243 COUNT is the size of the load group (the number of statements plus the
6244 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6245 only one statement, with a gap of COUNT - 1.
6247 Returns true if a suitable permute exists. */
6249 bool
6250 vect_grouped_load_supported (tree vectype, bool single_element_p,
6251 unsigned HOST_WIDE_INT count)
6253 machine_mode mode = TYPE_MODE (vectype);
6255   /* If this is single-element interleaving with an element distance
6256      that leaves unused vector loads around, punt - we would at least
6257      create very sub-optimal code in that case (and blow up memory;
6258      see PR65518).  */
6259 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "single-element interleaving not supported "
6264 "for not adjacent vector loads\n");
6265 return false;
6268 /* vect_permute_load_chain requires the group size to be equal to 3 or
6269 be a power of two. */
6270 if (count != 3 && exact_log2 (count) == -1)
6272 if (dump_enabled_p ())
6273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6274 "the size of the group of accesses"
6275 " is not a power of 2 or not equal to 3\n");
6276 return false;
6279 /* Check that the permutation is supported. */
6280 if (VECTOR_MODE_P (mode))
6282 unsigned int i, j;
6283 if (count == 3)
6285 unsigned int nelt;
6286 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6288 if (dump_enabled_p ())
6289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6290 "cannot handle groups of 3 loads for"
6291 " variable-length vectors\n");
6292 return false;
6295 vec_perm_builder sel (nelt, nelt, 1);
6296 sel.quick_grow (nelt);
6297 vec_perm_indices indices;
6298 unsigned int k;
6299 for (k = 0; k < 3; k++)
6301 for (i = 0; i < nelt; i++)
6302 if (3 * i + k < 2 * nelt)
6303 sel[i] = 3 * i + k;
6304 else
6305 sel[i] = 0;
6306 indices.new_vector (sel, 2, nelt);
6307 if (!can_vec_perm_const_p (mode, mode, indices))
6309 if (dump_enabled_p ())
6310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6311 "shuffle of 3 loads is not supported by"
6312 " target\n");
6313 return false;
6315 for (i = 0, j = 0; i < nelt; i++)
6316 if (3 * i + k < 2 * nelt)
6317 sel[i] = i;
6318 else
6319 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6320 indices.new_vector (sel, 2, nelt);
6321 if (!can_vec_perm_const_p (mode, mode, indices))
6323 if (dump_enabled_p ())
6324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6325 "shuffle of 3 loads is not supported by"
6326 " target\n");
6327 return false;
6330 return true;
6332 else
6334       /* If length is not equal to 3 then only a power of 2 is supported.  */
6335 gcc_assert (pow2p_hwi (count));
6336 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6338 /* The encoding has a single stepped pattern. */
6339 vec_perm_builder sel (nelt, 1, 3);
6340 sel.quick_grow (3);
6341 for (i = 0; i < 3; i++)
6342 sel[i] = i * 2;
6343 vec_perm_indices indices (sel, 2, nelt);
6344 if (can_vec_perm_const_p (mode, mode, indices))
6346 for (i = 0; i < 3; i++)
6347 sel[i] = i * 2 + 1;
6348 indices.new_vector (sel, 2, nelt);
6349 if (can_vec_perm_const_p (mode, mode, indices))
6350 return true;
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "extract even/odd not supported by target\n");
6358 return false;
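/* Standalone sketch of the extract-even/odd masks tested by the power-of-two
   branch above, assuming 8 elements per vector; indices refer to the
   2*nelt-element concatenation of the two inputs:

     #include <stdio.h>

     int main (void)
     {
       const unsigned nelt = 8;   // assumed number of vector elements
       printf ("extract_even mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", 2 * i);         // 0 2 4 6 8 10 12 14
       printf ("\nextract_odd mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", 2 * i + 1);     // 1 3 5 7 9 11 13 15
       printf ("\n");
       return 0;
     }
*/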
6361 /* Return FN if vec_{mask_,mask_len_}load_lanes is available for COUNT vectors
6362 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6364 internal_fn
6365 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6366 bool masked_p)
6368 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6369 vec_mask_len_load_lanes_optab, vectype,
6370 count))
6371 return IFN_MASK_LEN_LOAD_LANES;
6372 else if (masked_p)
6374 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6375 vec_mask_load_lanes_optab, vectype,
6376 count))
6377 return IFN_MASK_LOAD_LANES;
6379 else
6381 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6382 vectype, count))
6383 return IFN_LOAD_LANES;
6385 return IFN_LAST;
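/* A scalar model of the de-interleave that a load-lanes operation performs
   (in the spirit of AArch64 LD3); COUNT = 3 and 4 elements per vector are
   assumptions chosen for illustration:

     #include <stdio.h>

     int main (void)
     {
       int a[12], lanes[3][4];
       for (int i = 0; i < 12; i++)
         a[i] = i;
       for (int v = 0; v < 3; v++)        // vector v receives a[v], a[v+3], ...
         for (int e = 0; e < 4; e++)
           lanes[v][e] = a[e * 3 + v];
       for (int v = 0; v < 3; v++)        // prints 0 3 6 9 / 1 4 7 10 / 2 5 8 11
         {
           for (int e = 0; e < 4; e++)
             printf (" %d", lanes[v][e]);
           printf ("\n");
         }
       return 0;
     }
*/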
6388 /* Function vect_permute_load_chain.
6390 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6391 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6392 the input data correctly. Return the final references for loads in
6393 RESULT_CHAIN.
6395 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6396 The input is 4 vectors each containing 8 elements. We assign a number to each
6397 element, the input sequence is:
6399 1st vec: 0 1 2 3 4 5 6 7
6400 2nd vec: 8 9 10 11 12 13 14 15
6401 3rd vec: 16 17 18 19 20 21 22 23
6402 4th vec: 24 25 26 27 28 29 30 31
6404 The output sequence should be:
6406 1st vec: 0 4 8 12 16 20 24 28
6407 2nd vec: 1 5 9 13 17 21 25 29
6408 3rd vec: 2 6 10 14 18 22 26 30
6409 4th vec: 3 7 11 15 19 23 27 31
6411 i.e., the first output vector should contain the first elements of each
6412 interleaving group, etc.
6414 We use extract_even/odd instructions to create such output. The input of
6415 each extract_even/odd operation is two vectors
6416 1st vec 2nd vec
6417 0 1 2 3 4 5 6 7
6419 and the output is the vector of extracted even/odd elements. The output of
6420 extract_even will be: 0 2 4 6
6421 and of extract_odd: 1 3 5 7
6424 The permutation is done in log LENGTH stages. In each stage extract_even
6425 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6426 their order. In our example,
6428 E1: extract_even (1st vec, 2nd vec)
6429 E2: extract_odd (1st vec, 2nd vec)
6430 E3: extract_even (3rd vec, 4th vec)
6431 E4: extract_odd (3rd vec, 4th vec)
6433 The output for the first stage will be:
6435 E1: 0 2 4 6 8 10 12 14
6436 E2: 1 3 5 7 9 11 13 15
6437 E3: 16 18 20 22 24 26 28 30
6438 E4: 17 19 21 23 25 27 29 31
6440 In order to proceed and create the correct sequence for the next stage (or
6441 for the correct output, if the second stage is the last one, as in our
6442 example), we first put the output of extract_even operation and then the
6443 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6444 The input for the second stage is:
6446 1st vec (E1): 0 2 4 6 8 10 12 14
6447 2nd vec (E3): 16 18 20 22 24 26 28 30
6448 3rd vec (E2): 1 3 5 7 9 11 13 15
6449 4th vec (E4): 17 19 21 23 25 27 29 31
6451 The output of the second stage:
6453 E1: 0 4 8 12 16 20 24 28
6454 E2: 2 6 10 14 18 22 26 30
6455 E3: 1 5 9 13 17 21 25 29
6456 E4: 3 7 11 15 19 23 27 31
6458 And RESULT_CHAIN after reordering:
6460 1st vec (E1): 0 4 8 12 16 20 24 28
6461 2nd vec (E3): 1 5 9 13 17 21 25 29
6462 3rd vec (E2): 2 6 10 14 18 22 26 30
6463 4th vec (E4): 3 7 11 15 19 23 27 31. */
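/* The two-stage example above can be replayed with a scalar model; this
   standalone sketch reproduces the documented RESULT_CHAIN (it is only an
   illustration of the data movement, not the emitted permutes):

     #include <stdio.h>
     #include <string.h>

     int main (void)
     {
       int chain[4][8], next[4][8];
       for (int v = 0; v < 4; v++)
         for (int e = 0; e < 8; e++)
           chain[v][e] = v * 8 + e;
       for (int stage = 0; stage < 2; stage++)   // log2 (LENGTH) stages
         {
           for (int j = 0; j < 4; j += 2)
             for (int e = 0; e < 8; e++)
               {
                 int even = 2 * e, odd = 2 * e + 1;   // indices into the concat
                 next[j / 2][e] = even < 8 ? chain[j][even]
                                           : chain[j + 1][even - 8];
                 next[j / 2 + 2][e] = odd < 8 ? chain[j][odd]
                                              : chain[j + 1][odd - 8];
               }
           memcpy (chain, next, sizeof (chain));
         }
       for (int v = 0; v < 4; v++)   // prints 0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ...
         {
           for (int e = 0; e < 8; e++)
             printf (" %d", chain[v][e]);
           printf ("\n");
         }
       return 0;
     }
*/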
6465 static void
6466 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6467 unsigned int length,
6468 stmt_vec_info stmt_info,
6469 gimple_stmt_iterator *gsi,
6470 vec<tree> *result_chain)
6472 tree data_ref, first_vect, second_vect;
6473 tree perm_mask_even, perm_mask_odd;
6474 tree perm3_mask_low, perm3_mask_high;
6475 gimple *perm_stmt;
6476 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6477 unsigned int i, j, log_length = exact_log2 (length);
6479 result_chain->quick_grow (length);
6480 memcpy (result_chain->address (), dr_chain.address (),
6481 length * sizeof (tree));
6483 if (length == 3)
6485 /* vect_grouped_load_supported ensures that this is constant. */
6486 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6487 unsigned int k;
6489 vec_perm_builder sel (nelt, nelt, 1);
6490 sel.quick_grow (nelt);
6491 vec_perm_indices indices;
6492 for (k = 0; k < 3; k++)
6494 for (i = 0; i < nelt; i++)
6495 if (3 * i + k < 2 * nelt)
6496 sel[i] = 3 * i + k;
6497 else
6498 sel[i] = 0;
6499 indices.new_vector (sel, 2, nelt);
6500 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6502 for (i = 0, j = 0; i < nelt; i++)
6503 if (3 * i + k < 2 * nelt)
6504 sel[i] = i;
6505 else
6506 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6507 indices.new_vector (sel, 2, nelt);
6508 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6510 first_vect = dr_chain[0];
6511 second_vect = dr_chain[1];
6513 /* Create interleaving stmt (low part of):
6514          low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6515 ...}> */
6516 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6517 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6518 second_vect, perm3_mask_low);
6519 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6521 /* Create interleaving stmt (high part of):
6522          high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6523 ...}> */
6524 first_vect = data_ref;
6525 second_vect = dr_chain[2];
6526 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6527 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6528 second_vect, perm3_mask_high);
6529 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6530 (*result_chain)[k] = data_ref;
6533 else
6535       /* If length is not equal to 3 then only a power of 2 is supported.  */
6536 gcc_assert (pow2p_hwi (length));
6538 /* The encoding has a single stepped pattern. */
6539 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6540 vec_perm_builder sel (nelt, 1, 3);
6541 sel.quick_grow (3);
6542 for (i = 0; i < 3; ++i)
6543 sel[i] = i * 2;
6544 vec_perm_indices indices (sel, 2, nelt);
6545 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6547 for (i = 0; i < 3; ++i)
6548 sel[i] = i * 2 + 1;
6549 indices.new_vector (sel, 2, nelt);
6550 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6552 for (i = 0; i < log_length; i++)
6554 for (j = 0; j < length; j += 2)
6556 first_vect = dr_chain[j];
6557 second_vect = dr_chain[j+1];
6559 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6560 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6561 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6562 first_vect, second_vect,
6563 perm_mask_even);
6564 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6565 (*result_chain)[j/2] = data_ref;
6567 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6568 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6569 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6570 first_vect, second_vect,
6571 perm_mask_odd);
6572 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6573 (*result_chain)[j/2+length/2] = data_ref;
6575 memcpy (dr_chain.address (), result_chain->address (),
6576 length * sizeof (tree));
6581 /* Function vect_shift_permute_load_chain.
6583 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6584 sequence of stmts to reorder the input data accordingly.
6585 Return the final references for loads in RESULT_CHAIN.
6586    Return true if successful, false otherwise.
6588 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6589 The input is 3 vectors each containing 8 elements. We assign a
6590 number to each element, the input sequence is:
6592 1st vec: 0 1 2 3 4 5 6 7
6593 2nd vec: 8 9 10 11 12 13 14 15
6594 3rd vec: 16 17 18 19 20 21 22 23
6596 The output sequence should be:
6598 1st vec: 0 3 6 9 12 15 18 21
6599 2nd vec: 1 4 7 10 13 16 19 22
6600 3rd vec: 2 5 8 11 14 17 20 23
6602 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6604 First we shuffle all 3 vectors to get correct elements order:
6606 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6607 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6608 3rd vec: (16 19 22) (17 20 23) (18 21)
6610 Next we unite and shift vector 3 times:
6612 1st step:
6613 shift right by 6 the concatenation of:
6614 "1st vec" and "2nd vec"
6615 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6616 "2nd vec" and "3rd vec"
6617 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6618 "3rd vec" and "1st vec"
6619 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6620 | New vectors |
6622 So that now new vectors are:
6624 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6625 2nd vec: (10 13) (16 19 22) (17 20 23)
6626 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6628 2nd step:
6629 shift right by 5 the concatenation of:
6630 "1st vec" and "3rd vec"
6631 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6632 "2nd vec" and "1st vec"
6633 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6634 "3rd vec" and "2nd vec"
6635 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6636 | New vectors |
6638 So that now new vectors are:
6640 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6641 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6642 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6644 3rd step:
6645 shift right by 5 the concatenation of:
6646 "1st vec" and "1st vec"
6647 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6648 shift right by 3 the concatenation of:
6649 "2nd vec" and "2nd vec"
6650 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6651 | New vectors |
6653 So that now all vectors are READY:
6654 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6655 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6656 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6658    This algorithm is faster than the one in vect_permute_load_chain if:
6659    1. "shift of a concatenation" is faster than general permutation.
6660       This is usually so.
6661    2. The TARGET machine can't execute vector instructions in parallel.
6662       This is because each step of the algorithm depends on the previous
6663       one.  The algorithm in vect_permute_load_chain is much more parallel.
6665 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
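/* Standalone sketch of the mask generation used by the LENGTH == 3 path
   below, assuming nelt == 8 as in the example; it just prints the reorder
   mask and the first shift mask:

     #include <stdio.h>

     int main (void)
     {
       const unsigned nelt = 8;         // assumed vector length
       unsigned sel[8], k = 0, l = 0;
       for (unsigned i = 0; i < nelt; i++)
         {
           if (3 * k + (l % 3) >= nelt)
             {
               k = 0;
               l += 3 - (nelt % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
       printf ("reorder mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", sel[i]);                            // 0 3 6 1 4 7 2 5
       printf ("\nshift1 mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", 2 * (nelt / 3) + (nelt % 3) + i);   // 6 7 8 9 10 11 12 13
       printf ("\n");
       return 0;
     }
*/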
6668 static bool
6669 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6670 unsigned int length,
6671 stmt_vec_info stmt_info,
6672 gimple_stmt_iterator *gsi,
6673 vec<tree> *result_chain)
6675 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6676 tree perm2_mask1, perm2_mask2, perm3_mask;
6677 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6678 gimple *perm_stmt;
6680 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6681 machine_mode vmode = TYPE_MODE (vectype);
6682 unsigned int i;
6683 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6685 unsigned HOST_WIDE_INT nelt, vf;
6686 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6687 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6688 /* Not supported for variable-length vectors. */
6689 return false;
6691 vec_perm_builder sel (nelt, nelt, 1);
6692 sel.quick_grow (nelt);
6694 result_chain->quick_grow (length);
6695 memcpy (result_chain->address (), dr_chain.address (),
6696 length * sizeof (tree));
6698 if (pow2p_hwi (length) && vf > 4)
6700 unsigned int j, log_length = exact_log2 (length);
6701 for (i = 0; i < nelt / 2; ++i)
6702 sel[i] = i * 2;
6703 for (i = 0; i < nelt / 2; ++i)
6704 sel[nelt / 2 + i] = i * 2 + 1;
6705 vec_perm_indices indices (sel, 2, nelt);
6706 if (!can_vec_perm_const_p (vmode, vmode, indices))
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "shuffle of 2 fields structure is not \
6711 supported by target\n");
6712 return false;
6714 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6716 for (i = 0; i < nelt / 2; ++i)
6717 sel[i] = i * 2 + 1;
6718 for (i = 0; i < nelt / 2; ++i)
6719 sel[nelt / 2 + i] = i * 2;
6720 indices.new_vector (sel, 2, nelt);
6721 if (!can_vec_perm_const_p (vmode, vmode, indices))
6723 if (dump_enabled_p ())
6724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6725 "shuffle of 2 fields structure is not \
6726 supported by target\n");
6727 return false;
6729 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6731 /* Generating permutation constant to shift all elements.
6732 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6733 for (i = 0; i < nelt; i++)
6734 sel[i] = nelt / 2 + i;
6735 indices.new_vector (sel, 2, nelt);
6736 if (!can_vec_perm_const_p (vmode, vmode, indices))
6738 if (dump_enabled_p ())
6739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 "shift permutation is not supported by target\n");
6741 return false;
6743 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6745 /* Generating permutation constant to select vector from 2.
6746 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6747 for (i = 0; i < nelt / 2; i++)
6748 sel[i] = i;
6749 for (i = nelt / 2; i < nelt; i++)
6750 sel[i] = nelt + i;
6751 indices.new_vector (sel, 2, nelt);
6752 if (!can_vec_perm_const_p (vmode, vmode, indices))
6754 if (dump_enabled_p ())
6755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756 "select is not supported by target\n");
6757 return false;
6759 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6761 for (i = 0; i < log_length; i++)
6763 for (j = 0; j < length; j += 2)
6765 first_vect = dr_chain[j];
6766 second_vect = dr_chain[j + 1];
6768 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6769 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6770 first_vect, first_vect,
6771 perm2_mask1);
6772 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6773 vect[0] = data_ref;
6775 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6776 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6777 second_vect, second_vect,
6778 perm2_mask2);
6779 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6780 vect[1] = data_ref;
6782 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6783 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6784 vect[0], vect[1], shift1_mask);
6785 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6786 (*result_chain)[j/2 + length/2] = data_ref;
6788 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6789 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6790 vect[0], vect[1], select_mask);
6791 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6792 (*result_chain)[j/2] = data_ref;
6794 memcpy (dr_chain.address (), result_chain->address (),
6795 length * sizeof (tree));
6797 return true;
6799 if (length == 3 && vf > 2)
6801 unsigned int k = 0, l = 0;
6803       /* Generating permutation constant to get all elements in right order.
6804 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6805 for (i = 0; i < nelt; i++)
6807 if (3 * k + (l % 3) >= nelt)
6809 k = 0;
6810 l += (3 - (nelt % 3));
6812 sel[i] = 3 * k + (l % 3);
6813 k++;
6815 vec_perm_indices indices (sel, 2, nelt);
6816 if (!can_vec_perm_const_p (vmode, vmode, indices))
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820 "shuffle of 3 fields structure is not \
6821 supported by target\n");
6822 return false;
6824 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6826 /* Generating permutation constant to shift all elements.
6827 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6828 for (i = 0; i < nelt; i++)
6829 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6830 indices.new_vector (sel, 2, nelt);
6831 if (!can_vec_perm_const_p (vmode, vmode, indices))
6833 if (dump_enabled_p ())
6834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6835 "shift permutation is not supported by target\n");
6836 return false;
6838 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6840 /* Generating permutation constant to shift all elements.
6841 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6842 for (i = 0; i < nelt; i++)
6843 sel[i] = 2 * (nelt / 3) + 1 + i;
6844 indices.new_vector (sel, 2, nelt);
6845 if (!can_vec_perm_const_p (vmode, vmode, indices))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "shift permutation is not supported by target\n");
6850 return false;
6852 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6854 /* Generating permutation constant to shift all elements.
6855 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6856 for (i = 0; i < nelt; i++)
6857 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6858 indices.new_vector (sel, 2, nelt);
6859 if (!can_vec_perm_const_p (vmode, vmode, indices))
6861 if (dump_enabled_p ())
6862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6863 "shift permutation is not supported by target\n");
6864 return false;
6866 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6868 /* Generating permutation constant to shift all elements.
6869 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6870 for (i = 0; i < nelt; i++)
6871 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6872 indices.new_vector (sel, 2, nelt);
6873 if (!can_vec_perm_const_p (vmode, vmode, indices))
6875 if (dump_enabled_p ())
6876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6877 "shift permutation is not supported by target\n");
6878 return false;
6880 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}

/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and assign the resulting vectorized statements
   to the scalar statements.  */

void
vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
                             vec<tree> dr_chain,
                             int size, gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
     permuted vectors, which are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute two or more vector instructions in parallel,
     so the regular permute chain is used; the same holds when SIZE is a
     power of two.  Otherwise try to build the chain for the load group with
     vect_shift_permute_load_chain and fall back to vect_permute_load_chain
     if that fails.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
                                         gsi, &result_chain))
    vect_permute_load_chain (vinfo, dr_chain,
                             size, stmt_info, gsi, &result_chain);
  vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
  result_chain.release ();
}
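
/* Editor's note (illustrative, not part of the original sources): on targets
   whose sched.reassociation_width hook returns 1 for VEC_PERM_EXPR (a common
   default), a non-power-of-two load group such as SIZE == 3 first tries the
   shift-based chain from vect_shift_permute_load_chain, while power-of-two
   group sizes and wider reassociation widths go straight to
   vect_permute_load_chain.  */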

/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT_INFO.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
                                  vec<tree> result_chain)
{
  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  stmt_vec_info next_stmt_info = first_stmt_info;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt_info)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by the
         dead code elimination pass later.  No need to check for the first
         stmt in the group, since it always exists.
         DR_GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt_info != first_stmt_info
          && gap_count < DR_GROUP_GAP (next_stmt_info))
        {
          gap_count++;
          continue;
        }

      /* ???  The following needs cleanup after the removal of
         DR_GROUP_SAME_DR_STMT.  */
      if (next_stmt_info)
        {
          gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement last.  */
          STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);

          next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
          gap_count = 1;
        }
    }
}
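
/* Editor's note (an illustrative reading, not part of the original sources):
   if a load group accesses only fields 0 and 2 of a three-field record, the
   second group member has DR_GROUP_GAP == 2.  Assuming a load was generated
   for the unused middle field, as the comment in the loop above describes,
   the first vector of RESULT_CHAIN is assigned to the first scalar stmt, the
   second vector is skipped (gap_count goes from 1 to 2), and the third
   vector is assigned to the second scalar stmt.  */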

/* Function vect_can_force_dr_alignment_p.

   Return whether the alignment of DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (known_le (alignment,
                      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
  else
    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
}
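
/* Editor's example (hypothetical, for illustration only): for a file-scope

     static float a[256];

   DECL is TREE_STATIC, so its alignment can be raised as long as
   known_le (alignment, MAX_OFILE_ALIGNMENT) holds and the symtab node allows
   it via can_increase_alignment_p; for a local (auto) variable the limit is
   MAX_STACK_ALIGNMENT instead.  */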

/* Return whether the data reference DR_INFO is supported with respect to its
   alignment when accessed as VECTYPE.  MISALIGNMENT is the access's known
   misalignment, 0 if it is known to be aligned, or DR_MISALIGNMENT_UNKNOWN
   if it is not known.  */

enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
                               tree vectype, int misalignment)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (misalignment == 0)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
        && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
            || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants of the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */
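
  /* Editor's summary of the code below (illustrative, not part of the
     original sources): for reads on targets providing vec_realign_load (and
     a usable builtin_mask_for_load hook), dr_explicit_realign is returned
     when there is no containing loop, or when the access is nested in the
     loop being vectorized with a step different from the vector size;
     otherwise dr_explicit_realign_optimized is returned.  The empty "if"
     arm only skips both answers for SLP accesses whose group layout does not
     guarantee the same misalignment for every vector access, in which case
     the generic misalignment-support check further below decides.  */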

  if (DR_IS_READ (dr))
    {
      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          /* If we are doing SLP then the accesses need not have the
             same alignment; instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                  * (DR_GROUP_SIZE
                                     (DR_GROUP_FIRST_ELEMENT (stmt_info))),
                                  TYPE_VECTOR_SUBPARTS (vectype))))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
                                    GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
    }

  bool is_packed = false;
  tree type = TREE_TYPE (DR_REF (dr));
  if (misalignment == DR_MISALIGNMENT_UNKNOWN)
    is_packed = not_size_aligned (DR_REF (dr));
  if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
                                                     is_packed))
    return dr_unaligned_supported;

  /* Unsupported.  */
  return dr_unaligned_unsupported;