1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
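/* A minimal usage sketch (illustrative only; the vector type and lane
   count are assumptions):

     tree vectype = ...;    e.g. a V4SI vector type
     if (vect_lanes_optab_supported_p ("vec_load_lanes",
                                       vec_load_lanes_optab, vectype, 3))
       ...                  the target can load three interleaved vectors
                            of VECTYPE with one load-lanes instruction

   This is roughly how the load-/store-lanes support checks elsewhere in
   the vectorizer use this helper.  */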
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
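/* For example (illustrative sketch), given

     #pragma omp declare simd
     int foo (int);
     ...
     y = foo (x);                     before if-conversion
     y = .MASK_CALL (foo, x, mask);   after if-conversion

   both call forms are expected to resolve to foo's cgraph node here,
   provided simd clones were created for foo.  */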
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
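/* For instance (illustrative sketch), in

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] + 1;

   the smallest scalar type (short) appears only on the rhs of the
   promotion, so looking at lhs types alone would miss it; returning the
   rhs type here is what lets the VF account for the short elements.  */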
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
204 /* For stores use the type of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* If there is a loop invariant read involved we might vectorize it in
286 the prologue, breaking scalar order with respect to the in-loop store. */
287 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
288 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
289 return false;
291 /* STMT_A and STMT_B belong to overlapping groups. All loads are
292 emitted at the position of the first scalar load.
293 Stores in a group are emitted at the position of the last scalar store.
294 Compute that position and check whether the resulting order matches
295 the current one. */
296 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
297 if (il_a)
299 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
300 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
301 s = DR_GROUP_NEXT_ELEMENT (s))
302 il_a = get_later_stmt (il_a, s);
303 else /* DR_IS_READ */
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 if (get_later_stmt (il_a, s) == il_a)
307 il_a = s;
309 else
310 il_a = stmtinfo_a;
311 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
312 if (il_b)
314 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
315 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
316 s = DR_GROUP_NEXT_ELEMENT (s))
317 il_b = get_later_stmt (il_b, s);
318 else /* DR_IS_READ */
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 if (get_later_stmt (il_b, s) == il_b)
322 il_b = s;
324 else
325 il_b = stmtinfo_b;
326 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
327 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
330 /* A subroutine of vect_analyze_data_ref_dependence. Handle
331 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
332 distances. These distances are conservatively correct but they don't
333 reflect a guaranteed dependence.
335 Return true if this function does all the work necessary to avoid
336 an alias or false if the caller should use the dependence distances
337 to limit the vectorization factor in the usual way. LOOP_DEPTH is
338 the depth of the loop described by LOOP_VINFO and the other arguments
339 are as for vect_analyze_data_ref_dependence. */
341 static bool
342 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 int loop_depth, unsigned int *max_vf)
346 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
347 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
349 int dist = dist_v[loop_depth];
350 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
352 /* If the user asserted safelen >= DIST consecutive iterations
353 can be executed concurrently, assume independence.
355 ??? An alternative would be to add the alias check even
356 in this case, and vectorize the fallback loop with the
357 maximum VF set to safelen. However, if the user has
358 explicitly given a length, it's less likely that that
359 would be a win. */
360 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
362 if ((unsigned int) loop->safelen < *max_vf)
363 *max_vf = loop->safelen;
364 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
365 continue;
368 /* For dependence distances of 2 or more, we have the option
369 of limiting VF or checking for an alias at runtime.
370 Prefer to check at runtime if we can, to avoid limiting
371 the VF unnecessarily when the bases are in fact independent.
373 Note that the alias checks will be removed if the VF ends up
374 being small enough. */
375 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
376 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
377 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
378 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
379 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
382 return true;
386 /* Function vect_analyze_data_ref_dependence.
388 FIXME: I needed to change the sense of the returned flag.
390 Return FALSE if there (might) exist a dependence between a memory-reference
391 DRA and a memory-reference DRB. When versioning for alias may check a
392 dependence at run-time, return TRUE. Adjust *MAX_VF according to
393 the data dependence. */
395 static opt_result
396 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
397 loop_vec_info loop_vinfo,
398 unsigned int *max_vf)
400 unsigned int i;
401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
402 struct data_reference *dra = DDR_A (ddr);
403 struct data_reference *drb = DDR_B (ddr);
404 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
405 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
406 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
407 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
408 lambda_vector dist_v;
409 unsigned int loop_depth;
411 /* If user asserted safelen consecutive iterations can be
412 executed concurrently, assume independence. */
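/* E.g. (illustrative sketch) for

     void f (int *p, int *q, int n)
     {
     #pragma omp simd safelen(8)
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   the dependence between p[] and q[] is unknown, but the safelen(8)
   guarantee lets us assume independence and merely clamp *max_vf to 8
   instead of versioning the loop for alias.  */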
413 auto apply_safelen = [&]()
415 if (loop->safelen >= 2)
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return true;
422 return false;
425 /* In loop analysis all data references should be vectorizable. */
426 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
427 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
428 gcc_unreachable ();
430 /* Independent data accesses. */
431 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
432 return opt_result::success ();
434 if (dra == drb
435 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
436 return opt_result::success ();
438 /* We do not have to consider dependences between accesses that belong
439 to the same group, unless the stride could be smaller than the
440 group size. */
441 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
442 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
443 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
444 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
445 return opt_result::success ();
447 /* Even if we have an anti-dependence then, as the vectorized loop covers at
448 least two scalar iterations, there is always also a true dependence.
449 As the vectorizer does not re-order loads and stores we can ignore
450 the anti-dependence if TBAA can disambiguate both DRs similar to the
451 case with known negative distance anti-dependences (positive
452 distance anti-dependences would violate TBAA constraints). */
453 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
454 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
455 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
456 get_alias_set (DR_REF (drb))))
457 return opt_result::success ();
459 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
460 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
462 if (apply_safelen ())
463 return opt_result::success ();
465 return opt_result::failure_at
466 (stmtinfo_a->stmt,
467 "possible alias involving gather/scatter between %T and %T\n",
468 DR_REF (dra), DR_REF (drb));
471 /* Unknown data dependence. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
474 if (apply_safelen ())
475 return opt_result::success ();
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
479 "versioning for alias required: "
480 "can't determine dependence between %T and %T\n",
481 DR_REF (dra), DR_REF (drb));
483 /* Add to list of ddrs that need to be tested at run-time. */
484 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
487 /* Known data dependence. */
488 if (DDR_NUM_DIST_VECTS (ddr) == 0)
490 if (apply_safelen ())
491 return opt_result::success ();
493 if (dump_enabled_p ())
494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 "versioning for alias required: "
496 "bad dist vector for %T and %T\n",
497 DR_REF (dra), DR_REF (drb));
498 /* Add to list of ddrs that need to be tested at run-time. */
499 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
502 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
504 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
505 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
506 loop_depth, max_vf))
507 return opt_result::success ();
509 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
511 int dist = dist_v[loop_depth];
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance = %d.\n", dist);
517 if (dist == 0)
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance == 0 between %T and %T\n",
522 DR_REF (dra), DR_REF (drb));
524 /* When we perform grouped accesses and perform implicit CSE
525 by detecting equal accesses and doing disambiguation with
526 runtime alias tests like for
527 .. = a[i];
528 .. = a[i+1];
529 a[i] = ..;
530 a[i+1] = ..;
531 *p = ..;
532 .. = a[i];
533 .. = a[i+1];
534 where we will end up loading { a[i], a[i+1] } once, make
535 sure that inserting group loads before the first load and
536 stores after the last store will do the right thing.
537 Similar for groups like
538 a[i] = ...;
539 ... = a[i];
540 a[i+1] = ...;
541 where loads from the group interleave with the store. */
542 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
543 return opt_result::failure_at (stmtinfo_a->stmt,
544 "READ_WRITE dependence"
545 " in interleaving.\n");
547 if (loop->safelen < 2)
549 tree indicator = dr_zero_step_indicator (dra);
550 if (!indicator || integer_zerop (indicator))
551 return opt_result::failure_at (stmtinfo_a->stmt,
552 "access also has a zero step\n");
553 else if (TREE_CODE (indicator) != INTEGER_CST)
554 vect_check_nonzero_value (loop_vinfo, indicator);
556 continue;
559 if (dist > 0 && DDR_REVERSED_P (ddr))
561 /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 reversed (to make distance vector positive), and the actual
563 distance is negative. */
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "dependence distance negative.\n");
567 /* When doing outer loop vectorization, we need to check if there is
568 a backward dependence at the inner loop level if the dependence
569 at the outer loop is reversed. See PR81740. */
570 if (nested_in_vect_loop_p (loop, stmtinfo_a)
571 || nested_in_vect_loop_p (loop, stmtinfo_b))
573 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 DDR_LOOP_NEST (ddr));
575 if (dist_v[inner_depth] < 0)
576 return opt_result::failure_at (stmtinfo_a->stmt,
577 "not vectorized, dependence "
578 "between data-refs %T and %T\n",
579 DR_REF (dra), DR_REF (drb));
581 /* Record a negative dependence distance to later limit the
582 amount of stmt copying / unrolling we can perform.
583 Only need to handle read-after-write dependence. */
584 if (DR_IS_READ (drb)
585 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
586 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
587 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
588 continue;
591 unsigned int abs_dist = abs (dist);
592 if (abs_dist >= 2 && abs_dist < *max_vf)
594 /* The dependence distance requires reduction of the maximal
595 vectorization factor. */
596 *max_vf = abs_dist;
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "adjusting maximal vectorization factor to %i\n",
600 *max_vf);
603 if (abs_dist >= *max_vf)
605 /* Dependence distance does not create dependence, as far as
606 vectorization is concerned, in this case. */
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "dependence distance >= VF.\n");
610 continue;
613 return opt_result::failure_at (stmtinfo_a->stmt,
614 "not vectorized, possible dependence "
615 "between data-refs %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
619 return opt_result::success ();
622 /* Function vect_analyze_early_break_dependences.
624 Examine all the data references in the loop and make sure that, if we have
625 multiple exits, we are able to safely move stores such that they become
626 safe for vectorization. The function also calculates the place to move
627 the instructions to and computes what the new vUSE chain should be.
629 This works in tandem with the CFG that will be produced by
630 slpeel_tree_duplicate_loop_to_edge_cfg later on.
632 This function tries to validate whether an early break vectorization
633 is possible for the current instruction sequence. Returns True if
634 possible, otherwise False.
636 Requirements:
637 - Any memory access must be to a fixed size buffer.
638 - There must not be any loads and stores to the same object.
639 - Multiple loads are allowed as long as they don't alias.
641 NOTE:
642 This implementation is very conservative. Any overlapping loads/stores
643 that take place before the early break statement get rejected, aside from
644 WAR dependencies.
646 i.e.:
648 a[i] = 8
649 c = a[i]
650 if (b[i])
653 is not allowed, but
655 c = a[i]
656 a[i] = 8
657 if (b[i])
660 is, which is the common case. */
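/* As an illustrative sketch, a loop of roughly the form

     for (i = 0; i < 1024; i++)
       {
         if (key == a[i])       early exit, loads only
           break;
         out[i] = a[i] * 2;     store that will be sunk past the break
       }

   with fixed-size a[] and out[] that do not overlap is expected to
   satisfy the requirements above.  */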
662 static opt_result
663 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
665 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
667 /* List of all load data references found during traversal. */
668 auto_vec<data_reference *> bases;
669 basic_block dest_bb = NULL;
671 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
672 class loop *loop_nest = loop_outer (loop);
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_NOTE, vect_location,
676 "loop contains multiple exits, analyzing"
677 " statement dependencies.\n");
679 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "alternate exit has been chosen as main exit.\n");
684 /* Since we don't support general control flow, the location we'll move the
685 side-effects to is always the latch connected exit. When we support
686 general control flow we can do better but for now this is fine. Move
687 side-effects to the in-loop destination of the last early exit. For the
688 PEELED case we move the side-effects to the latch block as this is
689 guaranteed to be the last block to be executed when a vector iteration
690 has finished. */
691 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
692 dest_bb = loop->latch;
693 else
694 dest_bb = single_pred (loop->latch);
696 /* We start looking from dest_bb, for the non-PEELED case we don't want to
697 move any stores already present, but we do want to read and validate the
698 loads. */
699 basic_block bb = dest_bb;
701 /* We move stores across all loads to the beginning of dest_bb, so
702 the first block processed below doesn't need dependence checking. */
703 bool check_deps = false;
707 gimple_stmt_iterator gsi = gsi_last_bb (bb);
709 /* Now analyze all the remaining statements and try to determine which
710 instructions are allowed/needed to be moved. */
711 while (!gsi_end_p (gsi))
713 gimple *stmt = gsi_stmt (gsi);
714 gsi_prev (&gsi);
715 if (is_gimple_debug (stmt))
716 continue;
718 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
719 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
720 if (!dr_ref)
721 continue;
723 /* We know everything below dest_bb is safe since we know we
724 had a full vector iteration when reaching it. Either by
725 the loop entry / IV exit test being last or because this
726 is the loop latch itself. */
727 if (!check_deps)
728 continue;
730 /* Check if vector accesses to the object will be within bounds: the
731 object size must be a constant, or we assume the loop will be versioned
732 or niters bounded by VF so accesses are within range. We only need to check
733 the reads since writes are moved to a safe place where if we get
734 there we know they are safe to perform. */
735 if (DR_IS_READ (dr_ref)
736 && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
738 if (dump_enabled_p ())
739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
740 "early breaks not supported: vectorization "
741 "would %s beyond size of obj.\n",
742 DR_IS_READ (dr_ref) ? "read" : "write");
743 return opt_result::failure_at (stmt,
744 "can't safely apply code motion to "
745 "dependencies of %G to vectorize "
746 "the early exit.\n", stmt);
749 if (DR_IS_READ (dr_ref))
750 bases.safe_push (dr_ref);
751 else if (DR_IS_WRITE (dr_ref))
753 /* We are moving writes down in the CFG. To be sure that this
754 is valid after vectorization we have to check all the loads
755 we are sinking the stores past to see if any of them may
756 alias or are the same object.
758 Same objects will not be an issue because unless the store
759 is marked volatile the value can be forwarded. If the
760 store is marked volatile we don't vectorize the loop
761 anyway.
763 That leaves the check for aliasing. We don't really need
764 to care about the stores aliasing with each other since the
765 stores are moved in order so the effects are still observed
766 correctly. This leaves the check for WAR dependencies
767 which we would be introducing here if the DR can alias.
768 The check is quadratic in loads/stores but I have not found
769 a better API to do this. I believe all loads and stores
770 must be checked. We also must check them when we
771 encounter the store, since we don't care about loads past
772 the store. */
774 for (auto dr_read : bases)
775 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
779 vect_location,
780 "early breaks not supported: "
781 "overlapping loads and stores "
782 "found before the break "
783 "statement.\n");
785 return opt_result::failure_at (stmt,
786 "can't safely apply code motion to dependencies"
787 " to vectorize the early exit. %G may alias with"
788 " %G\n", stmt, dr_read->stmt);
792 if (gimple_vdef (stmt))
794 if (dump_enabled_p ())
795 dump_printf_loc (MSG_NOTE, vect_location,
796 "==> recording stmt %G", stmt);
798 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
800 else if (gimple_vuse (stmt))
802 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
803 if (dump_enabled_p ())
804 dump_printf_loc (MSG_NOTE, vect_location,
805 "marked statement for vUSE update: %G", stmt);
809 if (!single_pred_p (bb))
811 gcc_assert (bb == loop->header);
812 break;
815 /* If we possibly sink through a virtual PHI make sure to elide that. */
816 if (gphi *vphi = get_virtual_phi (bb))
817 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
819 /* All earlier blocks need dependence checking. */
820 check_deps = true;
821 bb = single_pred (bb);
823 while (1);
825 /* We don't allow outer -> inner loop transitions which should have been
826 trapped already during loop form analysis. */
827 gcc_assert (dest_bb->loop_father == loop);
829 /* Check that the destination block we picked has only one pred. To relax this we
830 have to take special care when moving the statements. We don't currently support
831 such control flow; however, this check is there to simplify how we handle
832 labels that may be present anywhere in the IL, and to ensure that the
833 labels aren't significant for the CFG. */
834 if (!single_pred (dest_bb))
835 return opt_result::failure_at (vect_location,
836 "chosen loop exit block (BB %d) does not have a "
837 "single predecessor which is currently not "
838 "supported for early break vectorization.\n",
839 dest_bb->index);
841 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
843 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
845 /* All uses shall be updated to that of the first load. Entries are
846 stored in reverse order. */
847 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
848 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
850 if (dump_enabled_p ())
851 dump_printf_loc (MSG_NOTE, vect_location,
852 "will update use: %T, mem_ref: %G", vuse, g);
856 if (dump_enabled_p ())
857 dump_printf_loc (MSG_NOTE, vect_location,
858 "recorded statements to be moved to BB %d\n",
859 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
861 return opt_result::success ();
864 /* Function vect_analyze_data_ref_dependences.
866 Examine all the data references in the loop, and make sure there do not
867 exist any data dependences between them. Set *MAX_VF according to
868 the maximum vectorization factor the data dependences allow. */
870 opt_result
871 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
872 unsigned int *max_vf)
874 unsigned int i;
875 struct data_dependence_relation *ddr;
877 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
879 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
881 LOOP_VINFO_DDRS (loop_vinfo)
882 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
883 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
884 /* We do not need read-read dependences. */
885 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
886 &LOOP_VINFO_DDRS (loop_vinfo),
887 LOOP_VINFO_LOOP_NEST (loop_vinfo),
888 false);
889 gcc_assert (res);
892 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
894 /* For epilogues we either have no aliases or alias versioning
895 was applied to the original loop. Therefore we may just get max_vf
896 using the VF of the original loop. */
897 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
898 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
899 else
900 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
902 opt_result res
903 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
904 if (!res)
905 return res;
908 /* If we have early break statements in the loop, check to see if they
909 are of a form we can vectorize. */
910 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
911 return vect_analyze_early_break_dependences (loop_vinfo);
913 return opt_result::success ();
917 /* Function vect_slp_analyze_data_ref_dependence.
919 Return TRUE if there (might) exist a dependence between a memory-reference
920 DRA and a memory-reference DRB for VINFO. When versioning for alias
921 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
922 according to the data dependence. */
924 static bool
925 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
926 struct data_dependence_relation *ddr)
928 struct data_reference *dra = DDR_A (ddr);
929 struct data_reference *drb = DDR_B (ddr);
930 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
931 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
933 /* We need to check dependences of statements marked as unvectorizable
934 as well; they can still prohibit vectorization. */
936 /* Independent data accesses. */
937 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
938 return false;
940 if (dra == drb)
941 return false;
943 /* Read-read is OK. */
944 if (DR_IS_READ (dra) && DR_IS_READ (drb))
945 return false;
947 /* If dra and drb are part of the same interleaving chain consider
948 them independent. */
949 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
950 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
951 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
952 return false;
954 /* Unknown data dependence. */
955 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
957 if (dump_enabled_p ())
958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
959 "can't determine dependence between %T and %T\n",
960 DR_REF (dra), DR_REF (drb));
962 else if (dump_enabled_p ())
963 dump_printf_loc (MSG_NOTE, vect_location,
964 "determined dependence between %T and %T\n",
965 DR_REF (dra), DR_REF (drb));
967 return true;
971 /* Analyze dependences involved in the transform of a store SLP NODE. */
973 static bool
974 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
976 /* This walks over all stmts involved in the SLP store done
977 in NODE verifying we can sink them up to the last stmt in the
978 group. */
979 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
980 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
982 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
984 stmt_vec_info access_info
985 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
986 if (access_info == last_access_info)
987 continue;
988 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
989 ao_ref ref;
990 bool ref_initialized_p = false;
991 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
992 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
994 gimple *stmt = gsi_stmt (gsi);
995 if (! gimple_vuse (stmt))
996 continue;
998 /* If we couldn't record a (single) data reference for this
999 stmt we have to resort to the alias oracle. */
1000 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1001 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1002 if (!dr_b)
1004 /* We are moving a store - this means
1005 we cannot use TBAA for disambiguation. */
1006 if (!ref_initialized_p)
1007 ao_ref_init (&ref, DR_REF (dr_a));
1008 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
1009 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
1010 return false;
1011 continue;
1014 gcc_assert (!gimple_visited_p (stmt));
1016 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1017 dr_b, vNULL);
1018 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1019 free_dependence_relation (ddr);
1020 if (dependent)
1021 return false;
1024 return true;
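/* For example (sketch), for an SLP store group

     a[0] = x;
     t    = b[i];     intervening load, checked via the alias oracle
     a[1] = y;        last scalar store; the vector store is emitted here

   we must be able to sink a[0] = x past t = b[i], which is only safe if
   b[i] cannot alias a[0].  */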
1027 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1028 contain the vector of scalar stores of this instance if we are
1029 disambiguating the loads. */
1031 static bool
1032 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1033 vec<stmt_vec_info> stores,
1034 stmt_vec_info last_store_info)
1036 /* This walks over all stmts involved in the SLP load done
1037 in NODE verifying we can hoist them up to the first stmt in the
1038 group. */
1039 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1040 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1042 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1044 stmt_vec_info access_info
1045 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1046 if (access_info == first_access_info)
1047 continue;
1048 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1049 ao_ref ref;
1050 bool ref_initialized_p = false;
1051 hash_set<stmt_vec_info> grp_visited;
1052 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1053 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1055 gimple *stmt = gsi_stmt (gsi);
1056 if (! gimple_vdef (stmt))
1057 continue;
1059 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1061 /* If we run into a store of this same instance (we've just
1062 marked those) then delay dependence checking until we run
1063 into the last store because this is where it will have
1064 been sunk to (and we verified that we can do that already). */
1065 if (gimple_visited_p (stmt))
1067 if (stmt_info != last_store_info)
1068 continue;
1070 for (stmt_vec_info &store_info : stores)
1072 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1073 ddr_p ddr = initialize_data_dependence_relation
1074 (dr_a, store_dr, vNULL);
1075 bool dependent
1076 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1077 free_dependence_relation (ddr);
1078 if (dependent)
1079 return false;
1081 continue;
1084 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1086 /* We are hoisting a load - this means we can use TBAA for
1087 disambiguation. */
1088 if (!ref_initialized_p)
1089 ao_ref_init (&ref, DR_REF (dr_a));
1090 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1092 /* If we couldn't record a (single) data reference for this
1093 stmt we have to give up now. */
1094 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1095 if (!dr_b)
1096 return false;
1097 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1098 dr_b, vNULL);
1099 bool dependent
1100 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1101 free_dependence_relation (ddr);
1102 if (dependent)
1103 return false;
1105 /* No dependence. */
1106 return true;
1108 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1110 /* When we run into a store group we have to honor
1111 that earlier stores might be moved here. We don't
1112 know exactly which and where to since we lack a
1113 back-mapping from DR to SLP node, so assume all
1114 earlier stores are sunk here. It's enough to
1115 consider the last stmt of a group for this.
1116 ??? Both this and the fact that we disregard that
1117 the conflicting instance might be removed later
1118 is overly conservative. */
1119 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1120 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1121 store_info != NULL;
1122 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1123 if ((store_info == stmt_info
1124 || get_later_stmt (store_info, stmt_info) == stmt_info)
1125 && !check_hoist (store_info))
1126 return false;
1128 else
1130 if (!check_hoist (stmt_info))
1131 return false;
1135 return true;
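/* Conversely (sketch), for an SLP load group

     x = a[0];        first scalar load; the vector load is emitted here
     c[i] = z;        intervening store
     y = a[1];        effectively hoisted up to the first load

   we must verify that c[i] = z cannot clobber a[1] before hoisting the
   load of a[1] above it.  */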
1139 /* Function vect_slp_analyze_instance_dependence.
1141 Examine all the data references in the SLP instance, and make sure there
1142 do not exist any data dependences between them that would prevent the
1143 vectorized statements from being emitted at their insert locations. */
1145 bool
1146 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1148 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1150 /* The stores of this instance are at the root of the SLP tree. */
1151 slp_tree store = NULL;
1152 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1153 store = SLP_INSTANCE_TREE (instance);
1155 /* Verify we can sink stores to the vectorized stmt insert location. */
1156 stmt_vec_info last_store_info = NULL;
1157 if (store)
1159 if (! vect_slp_analyze_store_dependences (vinfo, store))
1160 return false;
1162 /* Mark stores in this instance and remember the last one. */
1163 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1164 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1165 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1168 bool res = true;
1170 /* Verify we can sink loads to the vectorized stmt insert location,
1171 special-casing stores of this instance. */
1172 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1173 if (! vect_slp_analyze_load_dependences (vinfo, load,
1174 store
1175 ? SLP_TREE_SCALAR_STMTS (store)
1176 : vNULL, last_store_info))
1178 res = false;
1179 break;
1182 /* Unset the visited flag. */
1183 if (store)
1184 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1185 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1187 return res;
1190 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1191 applied. */
1194 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1196 HOST_WIDE_INT diff = 0;
1197 /* Alignment is only analyzed for the first element of a DR group;
1198 use that, but adjust the misalignment by the offset of the access. */
1199 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1201 dr_vec_info *first_dr
1202 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1203 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1204 INTEGER_CSTs and the first element in the group has the lowest
1205 address. */
1206 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1207 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1208 gcc_assert (diff >= 0);
1209 dr_info = first_dr;
1212 int misalign = dr_info->misalignment;
1213 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1214 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1215 return misalign;
1217 /* If the access is only aligned for a vector type with smaller alignment
1218 requirement the access has unknown misalignment. */
1219 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1220 targetm.vectorize.preferred_vector_alignment (vectype)))
1221 return DR_MISALIGNMENT_UNKNOWN;
1223 /* Apply the offset from the DR group start and the externally supplied
1224 offset which can for example result from a negative stride access. */
1225 poly_int64 misalignment = misalign + diff + offset;
1227 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1228 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1229 unsigned HOST_WIDE_INT target_alignment_c
1230 = dr_info->target_alignment.to_constant ();
1231 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1232 return DR_MISALIGNMENT_UNKNOWN;
1233 return misalign;
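/* Worked example with assumed numbers: with a target alignment of 32 bytes
   and a recorded misalignment of 8, an OFFSET of -24 (e.g. resulting from a
   negative-step access) yields (8 - 24) mod 32 = 16, which is what this
   function returns; if the combination is not a compile-time constant the
   function falls back to DR_MISALIGNMENT_UNKNOWN.  */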
1236 /* Record the base alignment guarantee given by DRB, which occurs
1237 in STMT_INFO. */
1239 static void
1240 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1241 innermost_loop_behavior *drb)
1243 bool existed;
1244 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1245 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1246 if (!existed || entry.second->base_alignment < drb->base_alignment)
1248 entry = std::make_pair (stmt_info, drb);
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "recording new base alignment for %T\n"
1252 " alignment: %d\n"
1253 " misalignment: %d\n"
1254 " based on: %G",
1255 drb->base_address,
1256 drb->base_alignment,
1257 drb->base_misalignment,
1258 stmt_info->stmt);
1262 /* If the region we're going to vectorize is reached, all unconditional
1263 data references occur at least once. We can therefore pool the base
1264 alignment guarantees from each unconditional reference. Do this by
1265 going through all the data references in VINFO and checking whether
1266 the containing statement makes the reference unconditionally. If so,
1267 record the alignment of the base address in VINFO so that it can be
1268 used for all other references with the same base. */
1270 void
1271 vect_record_base_alignments (vec_info *vinfo)
1273 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1274 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1275 for (data_reference *dr : vinfo->shared->datarefs)
1277 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1278 stmt_vec_info stmt_info = dr_info->stmt;
1279 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1280 && STMT_VINFO_VECTORIZABLE (stmt_info)
1281 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1283 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1285 /* If DR is nested in the loop that is being vectorized, we can also
1286 record the alignment of the base wrt the outer loop. */
1287 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1288 vect_record_base_alignment
1289 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1294 /* Function vect_compute_data_ref_alignment
1296 Compute the misalignment of the data reference DR_INFO when vectorizing
1297 with VECTYPE.
1299 Output:
1300 1. initialized misalignment info for DR_INFO
1302 FOR NOW: No analysis is actually performed. Misalignment is calculated
1303 only for trivial cases. TODO. */
1305 static void
1306 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1307 tree vectype)
1309 stmt_vec_info stmt_info = dr_info->stmt;
1310 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1311 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1312 class loop *loop = NULL;
1313 tree ref = DR_REF (dr_info->dr);
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_NOTE, vect_location,
1317 "vect_compute_data_ref_alignment:\n");
1319 if (loop_vinfo)
1320 loop = LOOP_VINFO_LOOP (loop_vinfo);
1322 /* Initialize misalignment to unknown. */
1323 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1325 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1326 return;
1328 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1329 bool step_preserves_misalignment_p;
1331 poly_uint64 vector_alignment
1332 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1333 BITS_PER_UNIT);
1334 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1336 /* If the main loop has peeled for alignment we have no way of knowing
1337 whether the data accesses in the epilogues are aligned. We can't at
1338 compile time answer the question whether we have entered the main loop or
1339 not. Fixes PR 92351. */
1340 if (loop_vinfo)
1342 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1343 if (orig_loop_vinfo
1344 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1345 return;
1348 unsigned HOST_WIDE_INT vect_align_c;
1349 if (!vector_alignment.is_constant (&vect_align_c))
1350 return;
1352 /* No step for BB vectorization. */
1353 if (!loop)
1355 gcc_assert (integer_zerop (drb->step));
1356 step_preserves_misalignment_p = true;
1359 /* In case the dataref is in an inner-loop of the loop that is being
1360 vectorized (LOOP), we use the base and misalignment information
1361 relative to the outer-loop (LOOP). This is ok only if the misalignment
1362 stays the same throughout the execution of the inner-loop, which is why
1363 we have to check that the stride of the dataref in the inner-loop evenly
1364 divides by the vector alignment. */
1365 else if (nested_in_vect_loop_p (loop, stmt_info))
1367 step_preserves_misalignment_p
1368 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1370 if (dump_enabled_p ())
1372 if (step_preserves_misalignment_p)
1373 dump_printf_loc (MSG_NOTE, vect_location,
1374 "inner step divides the vector alignment.\n");
1375 else
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "inner step doesn't divide the vector"
1378 " alignment.\n");
1382 /* Similarly we can only use base and misalignment information relative to
1383 an innermost loop if the misalignment stays the same throughout the
1384 execution of the loop. As above, this is the case if the stride of
1385 the dataref evenly divides by the alignment. */
1386 else
1388 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1389 step_preserves_misalignment_p
1390 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1392 if (!step_preserves_misalignment_p && dump_enabled_p ())
1393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1394 "step doesn't divide the vector alignment.\n");
1397 unsigned int base_alignment = drb->base_alignment;
1398 unsigned int base_misalignment = drb->base_misalignment;
1400 /* Calculate the maximum of the pooled base address alignment and the
1401 alignment that we can compute for DR itself. */
1402 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1403 = base_alignments->get (drb->base_address);
1404 if (entry
1405 && base_alignment < (*entry).second->base_alignment
1406 && (loop_vinfo
1407 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1408 gimple_bb (entry->first->stmt))
1409 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1410 || (entry->first->dr_aux.group <= dr_info->group)))))
1412 base_alignment = entry->second->base_alignment;
1413 base_misalignment = entry->second->base_misalignment;
1416 if (drb->offset_alignment < vect_align_c
1417 || !step_preserves_misalignment_p
1418 /* We need to know whether the step wrt the vectorized loop is
1419 negative when computing the starting misalignment below. */
1420 || TREE_CODE (drb->step) != INTEGER_CST)
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1424 "Unknown alignment for access: %T\n", ref);
1425 return;
1428 if (base_alignment < vect_align_c)
1430 unsigned int max_alignment;
1431 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1432 if (max_alignment < vect_align_c
1433 || !vect_can_force_dr_alignment_p (base,
1434 vect_align_c * BITS_PER_UNIT))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location,
1438 "can't force alignment of ref: %T\n", ref);
1439 return;
1442 /* Force the alignment of the decl.
1443 NOTE: This is the only change to the code we make during
1444 the analysis phase, before deciding to vectorize the loop. */
1445 if (dump_enabled_p ())
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "force alignment of %T\n", ref);
1449 dr_info->base_decl = base;
1450 dr_info->base_misaligned = true;
1451 base_misalignment = 0;
1453 poly_int64 misalignment
1454 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1456 unsigned int const_misalignment;
1457 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 "Non-constant misalignment for access: %T\n", ref);
1462 return;
1465 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469 "misalign = %d bytes of ref %T\n",
1470 const_misalignment, ref);
1472 return;
1475 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1476 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1477 is made aligned via peeling. */
1479 static bool
1480 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1481 dr_vec_info *dr_peel_info)
1483 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1484 DR_TARGET_ALIGNMENT (dr_info)))
1486 poly_offset_int diff
1487 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1488 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1489 if (known_eq (diff, 0)
1490 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1491 return true;
1493 return false;
1496 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1497 aligned via peeling. */
1499 static bool
1500 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1501 dr_vec_info *dr_peel_info)
1503 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1504 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1505 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1506 DR_OFFSET (dr_peel_info->dr), 0)
1507 || !operand_equal_p (DR_STEP (dr_info->dr),
1508 DR_STEP (dr_peel_info->dr), 0))
1509 return false;
1511 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1514 /* Compute the value for dr_info->misalign so that the access appears
1515 aligned. This is used by peeling to compensate for dr_misalignment
1516 applying the offset for negative step. */
1519 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1521 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1522 return 0;
1524 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1525 poly_int64 misalignment
1526 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1527 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1529 unsigned HOST_WIDE_INT target_alignment_c;
1530 int misalign;
1531 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1532 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1533 return DR_MISALIGNMENT_UNKNOWN;
1534 return misalign;
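/* E.g. (assumed types and numbers) for a negative-step access with a V4SI
   vector type and a target alignment of 16 bytes this returns
   (4 - 1) * 4 = 12: a DR whose misalignment is recorded as 12 then appears
   aligned once dr_misalignment applies the corresponding negative-step
   offset of -12.  */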
1537 /* Function vect_update_misalignment_for_peel.
1538 Sets DR_INFO's misalignment
1539 - to 0 if it has the same alignment as DR_PEEL_INFO,
1540 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1541 - to -1 (unknown) otherwise.
1543 DR_INFO - the data reference whose misalignment is to be adjusted.
1544 DR_PEEL_INFO - the data reference whose misalignment is being made
1545 zero in the vector loop by the peel.
1546 NPEEL - the number of iterations in the peel loop if the misalignment
1547 of DR_PEEL_INFO is known at compile time. */
1549 static void
1550 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1551 dr_vec_info *dr_peel_info, int npeel)
1553 /* If dr_info is aligned when dr_peel_info is, then mark it so. */
1554 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1556 SET_DR_MISALIGNMENT (dr_info,
1557 vect_dr_misalign_for_aligned_access (dr_peel_info));
1558 return;
1561 unsigned HOST_WIDE_INT alignment;
1562 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1563 && known_alignment_for_access_p (dr_info,
1564 STMT_VINFO_VECTYPE (dr_info->stmt))
1565 && known_alignment_for_access_p (dr_peel_info,
1566 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1568 int misal = dr_info->misalignment;
1569 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1570 misal &= alignment - 1;
1571 set_dr_misalignment (dr_info, misal);
1572 return;
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1577 "to unknown (-1).\n");
1578 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
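/* Worked example for the compile-time branch above, with assumed numbers:
   a DR with misalignment 8, step 4 and target alignment 16 that is peeled
   by NPEEL = 2 iterations ends up with misalignment (8 + 2 * 4) & 15 = 0,
   i.e. the peel happens to make it aligned as well.  */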
1581 /* Return true if alignment is relevant for DR_INFO. */
1583 static bool
1584 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1586 stmt_vec_info stmt_info = dr_info->stmt;
1588 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1589 return false;
1591 /* For interleaving, only the alignment of the first access matters. */
1592 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1593 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1594 return false;
1596 /* Scatter-gather and invariant accesses continue to address individual
1597 scalars, so vector-level alignment is irrelevant. */
1598 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1599 || integer_zerop (DR_STEP (dr_info->dr)))
1600 return false;
1602 /* Strided accesses perform only component accesses, alignment is
1603 irrelevant for them. */
1604 if (STMT_VINFO_STRIDED_P (stmt_info)
1605 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1606 return false;
1608 return true;
1611 /* Given a memory reference EXP, return whether its alignment is less
1612 than its size. */
1614 static bool
1615 not_size_aligned (tree exp)
1617 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1618 return true;
1620 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1621 > get_object_alignment (exp));
1624 /* Function vector_alignment_reachable_p
1626 Return true if vector alignment for DR_INFO is reachable by peeling
1627 a few loop iterations. Return false otherwise. */
1629 static bool
1630 vector_alignment_reachable_p (dr_vec_info *dr_info)
1632 stmt_vec_info stmt_info = dr_info->stmt;
1633 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1635 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1637 /* For interleaved accesses we peel only if the number of iterations in
1638 the prolog loop (VF - misalignment) is a multiple of the
1639 number of interleaved accesses. */
1640 int elem_size, mis_in_elements;
1642 /* FORNOW: handle only known alignment. */
1643 if (!known_alignment_for_access_p (dr_info, vectype))
1644 return false;
1646 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1648 elem_size = vector_element_size (vector_size, nelements);
1649 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1651 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1652 return false;
1655 /* If the misalignment is known at compile time then allow peeling
1656 only if natural alignment is reachable through peeling. */
1657 if (known_alignment_for_access_p (dr_info, vectype)
1658 && !aligned_access_p (dr_info, vectype))
1660 HOST_WIDE_INT elmsize =
1661 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1662 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_NOTE, vect_location,
1665 "data size = %wd. misalignment = %d.\n", elmsize,
1666 dr_misalignment (dr_info, vectype));
1668 if (dr_misalignment (dr_info, vectype) % elmsize)
1670 if (dump_enabled_p ())
1671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1672 "data size does not divide the misalignment.\n");
1673 return false;
1677 if (!known_alignment_for_access_p (dr_info, vectype))
1679 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1680 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "Unknown misalignment, %snaturally aligned\n",
1684 is_packed ? "not " : "");
1685 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1688 return true;
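/* E.g. (assumed numbers) for a two-element interleaved group accessed with
   a V4SI vector type and a known misalignment of one element, the prolog
   loop would need 4 - 1 = 3 peeled iterations, which is not a multiple of
   the group size 2, so alignment is not considered reachable by peeling.  */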
1692 /* Calculate the cost of the memory access represented by DR_INFO. */
1694 static void
1695 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1696 dr_alignment_support alignment_support_scheme,
1697 int misalignment,
1698 unsigned int *inside_cost,
1699 unsigned int *outside_cost,
1700 stmt_vector_for_cost *body_cost_vec,
1701 stmt_vector_for_cost *prologue_cost_vec)
1703 stmt_vec_info stmt_info = dr_info->stmt;
1704 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1705 int ncopies;
1707 if (PURE_SLP_STMT (stmt_info))
1708 ncopies = 1;
1709 else
1710 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1712 if (DR_IS_READ (dr_info->dr))
1713 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1714 misalignment, true, inside_cost,
1715 outside_cost, prologue_cost_vec, body_cost_vec, false);
1716 else
1717 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1718 misalignment, inside_cost, body_cost_vec);
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "vect_get_data_access_cost: inside_cost = %d, "
1723 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1727 typedef struct _vect_peel_info
1729 dr_vec_info *dr_info;
1730 int npeel;
1731 unsigned int count;
1732 } *vect_peel_info;
1734 typedef struct _vect_peel_extended_info
1736 vec_info *vinfo;
1737 struct _vect_peel_info peel_info;
1738 unsigned int inside_cost;
1739 unsigned int outside_cost;
1740 } *vect_peel_extended_info;
1743 /* Peeling hashtable helpers. */
1745 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1747 static inline hashval_t hash (const _vect_peel_info *);
1748 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1751 inline hashval_t
1752 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1754 return (hashval_t) peel_info->npeel;
1757 inline bool
1758 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1760 return (a->npeel == b->npeel);
1764 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1766 static void
1767 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1768 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1769 int npeel, bool supportable_if_not_aligned)
1771 struct _vect_peel_info elem, *slot;
1772 _vect_peel_info **new_slot;
1774 elem.npeel = npeel;
1775 slot = peeling_htab->find (&elem);
1776 if (slot)
1777 slot->count++;
1778 else
1780 slot = XNEW (struct _vect_peel_info);
1781 slot->npeel = npeel;
1782 slot->dr_info = dr_info;
1783 slot->count = 1;
1784 new_slot = peeling_htab->find_slot (slot, INSERT);
1785 *new_slot = slot;
1788 /* If this DR is not supported with unknown misalignment then bias
1789 this slot when the cost model is disabled. */
1790 if (!supportable_if_not_aligned
1791 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1792 slot->count += VECT_MAX_COST;
1796 /* Traverse peeling hash table to find the peeling option that aligns the
1797 maximum number of data accesses. */
1800 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1801 _vect_peel_extended_info *max)
1803 vect_peel_info elem = *slot;
1805 if (elem->count > max->peel_info.count
1806 || (elem->count == max->peel_info.count
1807 && max->peel_info.npeel > elem->npeel))
1809 max->peel_info.npeel = elem->npeel;
1810 max->peel_info.count = elem->count;
1811 max->peel_info.dr_info = elem->dr_info;
1814 return 1;
1817 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1818 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1819 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1820 after peeling. */
1822 static void
1823 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1824 dr_vec_info *dr0_info,
1825 unsigned int *inside_cost,
1826 unsigned int *outside_cost,
1827 stmt_vector_for_cost *body_cost_vec,
1828 stmt_vector_for_cost *prologue_cost_vec,
1829 unsigned int npeel)
1831 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1833 bool dr0_alignment_known_p
1834 = (dr0_info
1835 && known_alignment_for_access_p (dr0_info,
1836 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1838 for (data_reference *dr : datarefs)
1840 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1841 if (!vect_relevant_for_alignment_p (dr_info))
1842 continue;
1844 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1845 dr_alignment_support alignment_support_scheme;
1846 int misalignment;
1847 unsigned HOST_WIDE_INT alignment;
1849 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1850 size_zero_node) < 0;
1851 poly_int64 off = 0;
1852 if (negative)
1853 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1854 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
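/* For instance (illustrative numbers, not tied to a target): a reversed
   V4SI access with 4-byte elements gives OFF = (4 - 1) * -4 = -12,
   i.e. the vector access starts 12 bytes below the scalar reference
   and the misalignment is taken at that adjusted address.  */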
1856 if (npeel == 0)
1857 misalignment = dr_misalignment (dr_info, vectype, off);
1858 else if (dr_info == dr0_info
1859 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1860 misalignment = 0;
1861 else if (!dr0_alignment_known_p
1862 || !known_alignment_for_access_p (dr_info, vectype)
1863 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1864 misalignment = DR_MISALIGNMENT_UNKNOWN;
1865 else
1867 misalignment = dr_misalignment (dr_info, vectype, off);
1868 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1869 misalignment &= alignment - 1;
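/* For example (illustrative numbers only): a byte misalignment of 4,
   NPEEL = 3 and a 4-byte step move the access by 3 * 4 = 12 bytes, so
   with a 16-byte target alignment the new misalignment is
   (4 + 12) & (16 - 1) = 0.  */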
1871 alignment_support_scheme
1872 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1873 misalignment);
1875 vect_get_data_access_cost (loop_vinfo, dr_info,
1876 alignment_support_scheme, misalignment,
1877 inside_cost, outside_cost,
1878 body_cost_vec, prologue_cost_vec);
1882 /* Traverse peeling hash table and calculate cost for each peeling option.
1883 Find the one with the lowest cost. */
1886 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1887 _vect_peel_extended_info *min)
1889 vect_peel_info elem = *slot;
1890 int dummy;
1891 unsigned int inside_cost = 0, outside_cost = 0;
1892 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1893 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1894 epilogue_cost_vec;
1896 prologue_cost_vec.create (2);
1897 body_cost_vec.create (2);
1898 epilogue_cost_vec.create (2);
1900 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1901 &outside_cost, &body_cost_vec,
1902 &prologue_cost_vec, elem->npeel);
1904 body_cost_vec.release ();
1906 outside_cost += vect_get_known_peeling_cost
1907 (loop_vinfo, elem->npeel, &dummy,
1908 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1909 &prologue_cost_vec, &epilogue_cost_vec);
1911 /* Prologue and epilogue costs are added to the target model later.
1912 These costs depend only on the scalar iteration cost, the
1913 number of peeling iterations finally chosen, and the number of
1914 misaligned statements. So discard the information found here. */
1915 prologue_cost_vec.release ();
1916 epilogue_cost_vec.release ();
1918 if (inside_cost < min->inside_cost
1919 || (inside_cost == min->inside_cost
1920 && outside_cost < min->outside_cost))
1922 min->inside_cost = inside_cost;
1923 min->outside_cost = outside_cost;
1924 min->peel_info.dr_info = elem->dr_info;
1925 min->peel_info.npeel = elem->npeel;
1926 min->peel_info.count = elem->count;
1929 return 1;
1933 /* Choose best peeling option by traversing peeling hash table and either
1934 choosing an option with the lowest cost (if cost model is enabled) or the
1935 option that aligns as many accesses as possible. */
1937 static struct _vect_peel_extended_info
1938 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1939 loop_vec_info loop_vinfo)
1941 struct _vect_peel_extended_info res;
1943 res.peel_info.dr_info = NULL;
1944 res.vinfo = loop_vinfo;
1946 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1948 res.inside_cost = INT_MAX;
1949 res.outside_cost = INT_MAX;
1950 peeling_htab->traverse <_vect_peel_extended_info *,
1951 vect_peeling_hash_get_lowest_cost> (&res);
1953 else
1955 res.peel_info.count = 0;
1956 peeling_htab->traverse <_vect_peel_extended_info *,
1957 vect_peeling_hash_get_most_frequent> (&res);
1958 res.inside_cost = 0;
1959 res.outside_cost = 0;
1962 return res;
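/* A compact sketch of the "most frequent" selection above, written with
   std::unordered_map instead of GCC's hash_table purely for
   illustration (the name and types are made up):

     static int
     example_most_frequent_npeel
       (const std::unordered_map<int, unsigned> &counts)
     {
       int best_npeel = 0;
       unsigned best_count = 0;
       for (const auto &entry : counts)
         if (entry.second > best_count
             || (entry.second == best_count && entry.first < best_npeel))
           {
             best_npeel = entry.first;
             best_count = entry.second;
           }
       return best_npeel;
     }

   Each candidate peeling amount maps to the number of data references
   it would align; higher counts win and the smaller NPEEL breaks ties,
   mirroring vect_peeling_hash_get_most_frequent.  */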
1965 /* Return true if the new peeling NPEEL is supported. */
1967 static bool
1968 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1969 unsigned npeel)
1971 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1972 enum dr_alignment_support supportable_dr_alignment;
1974 bool dr0_alignment_known_p
1975 = known_alignment_for_access_p (dr0_info,
1976 STMT_VINFO_VECTYPE (dr0_info->stmt));
1978 /* Ensure that all data refs can be vectorized after the peel. */
1979 for (data_reference *dr : datarefs)
1981 if (dr == dr0_info->dr)
1982 continue;
1984 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1985 if (!vect_relevant_for_alignment_p (dr_info)
1986 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1987 continue;
1989 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1990 int misalignment;
1991 unsigned HOST_WIDE_INT alignment;
1992 if (!dr0_alignment_known_p
1993 || !known_alignment_for_access_p (dr_info, vectype)
1994 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1995 misalignment = DR_MISALIGNMENT_UNKNOWN;
1996 else
1998 misalignment = dr_misalignment (dr_info, vectype);
1999 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2000 misalignment &= alignment - 1;
2002 supportable_dr_alignment
2003 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2004 misalignment);
2005 if (supportable_dr_alignment == dr_unaligned_unsupported)
2006 return false;
2009 return true;
2012 /* Compare two data-references DRA and DRB to group them into chunks
2013 with related alignment. */
2015 static int
2016 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2018 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2019 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2020 int cmp;
2022 /* Stabilize sort. */
2023 if (dra == drb)
2024 return 0;
2026 /* Ordering of DRs according to base. */
2027 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2028 DR_BASE_ADDRESS (drb));
2029 if (cmp != 0)
2030 return cmp;
2032 /* And according to DR_OFFSET. */
2033 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2034 if (cmp != 0)
2035 return cmp;
2037 /* And after step. */
2038 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2039 if (cmp != 0)
2040 return cmp;
2042 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2043 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2044 if (cmp == 0)
2045 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2046 return cmp;
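/* An illustrative stand-alone version of this comparison scheme
   (hypothetical struct with plain integer keys, assumed-unique UIDs):
   the keys are compared lexicographically and the statement UID breaks
   exact ties, so qsort, which is not stable, still yields a
   deterministic order.

     struct example_ref_key { long base, offset, step, init; unsigned uid; };

     static int
     example_ref_key_cmp (const example_ref_key &a, const example_ref_key &b)
     {
       if (a.base != b.base)
         return a.base < b.base ? -1 : 1;
       if (a.offset != b.offset)
         return a.offset < b.offset ? -1 : 1;
       if (a.step != b.step)
         return a.step < b.step ? -1 : 1;
       if (a.init != b.init)
         return a.init < b.init ? -1 : 1;
       return a.uid < b.uid ? -1 : 1;
     }  */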
2049 /* Function vect_enhance_data_refs_alignment
2051 This pass will use loop versioning and loop peeling in order to enhance
2052 the alignment of data references in the loop.
2054 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2055 original loop is to be vectorized. Any other loops that are created by
2056 the transformations performed in this pass are not supposed to be
2057 vectorized. This restriction will be relaxed.
2059 This pass will require a cost model to guide it whether to apply peeling
2060 or versioning or a combination of the two. For example, the scheme that
2061 Intel uses when given a loop with several memory accesses is as follows:
2062 choose one memory access ('p') whose alignment you want to force by doing
2063 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2064 other accesses are not necessarily aligned, or (2) use loop versioning to
2065 generate one loop in which all accesses are aligned, and another loop in
2066 which only 'p' is necessarily aligned.
2068 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2069 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2070 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2072 Devising a cost model is the most critical aspect of this work. It will
2073 guide us on which access to peel for, whether to use loop versioning, how
2074 many versions to create, etc. The cost model will probably consist of
2075 generic considerations as well as target specific considerations (on
2076 powerpc for example, misaligned stores are more painful than misaligned
2077 loads).
2079 Here are the general steps involved in alignment enhancements:
2081 -- original loop, before alignment analysis:
2082 for (i=0; i<N; i++){
2083 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2084 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2087 -- After vect_compute_data_refs_alignment:
2088 for (i=0; i<N; i++){
2089 x = q[i]; # DR_MISALIGNMENT(q) = 3
2090 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2093 -- Possibility 1: we do loop versioning:
2094 if (p is aligned) {
2095 for (i=0; i<N; i++){ # loop 1A
2096 x = q[i]; # DR_MISALIGNMENT(q) = 3
2097 p[i] = y; # DR_MISALIGNMENT(p) = 0
2100 else {
2101 for (i=0; i<N; i++){ # loop 1B
2102 x = q[i]; # DR_MISALIGNMENT(q) = 3
2103 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2107 -- Possibility 2: we do loop peeling:
2108 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2109 x = q[i];
2110 p[i] = y;
2112 for (i = 3; i < N; i++){ # loop 2A
2113 x = q[i]; # DR_MISALIGNMENT(q) = 0
2114 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2117 -- Possibility 3: combination of loop peeling and versioning:
2118 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2119 x = q[i];
2120 p[i] = y;
2122 if (p is aligned) {
2123 for (i = 3; i<N; i++){ # loop 3A
2124 x = q[i]; # DR_MISALIGNMENT(q) = 0
2125 p[i] = y; # DR_MISALIGNMENT(p) = 0
2128 else {
2129 for (i = 3; i<N; i++){ # loop 3B
2130 x = q[i]; # DR_MISALIGNMENT(q) = 0
2131 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2135 These loops are later passed to loop_transform to be vectorized. The
2136 vectorizer will use the alignment information to guide the transformation
2137 (whether to generate regular loads/stores, or with special handling for
2138 misalignment). */
2140 opt_result
2141 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2143 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2144 dr_vec_info *first_store = NULL;
2145 dr_vec_info *dr0_info = NULL;
2146 struct data_reference *dr;
2147 unsigned int i;
2148 bool do_peeling = false;
2149 bool do_versioning = false;
2150 unsigned int npeel = 0;
2151 bool one_misalignment_known = false;
2152 bool one_misalignment_unknown = false;
2153 bool one_dr_unsupportable = false;
2154 dr_vec_info *unsupportable_dr_info = NULL;
2155 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2156 hash_table<peel_info_hasher> peeling_htab (1);
2158 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2160 /* Reset data so we can safely be called multiple times. */
2161 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2162 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2164 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2165 return opt_result::success ();
2167 /* Sort the vector of datarefs so DRs that have the same or dependent
2168 alignment are next to each other. */
2169 auto_vec<data_reference_p> datarefs
2170 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2171 datarefs.qsort (dr_align_group_sort_cmp);
2173 /* Compute the number of DRs that become aligned when we peel
2174 a dataref so it becomes aligned. */
2175 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2176 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2177 unsigned i0;
2178 for (i0 = 0; i0 < datarefs.length (); ++i0)
2179 if (DR_BASE_ADDRESS (datarefs[i0]))
2180 break;
2181 for (i = i0 + 1; i <= datarefs.length (); ++i)
2183 if (i == datarefs.length ()
2184 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2185 DR_BASE_ADDRESS (datarefs[i]), 0)
2186 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2187 DR_OFFSET (datarefs[i]), 0)
2188 || !operand_equal_p (DR_STEP (datarefs[i0]),
2189 DR_STEP (datarefs[i]), 0))
2191 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2192 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2193 will get known misalignment if we align one of the refs
2194 with the largest DR_TARGET_ALIGNMENT. */
2195 for (unsigned j = i0; j < i; ++j)
2197 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2198 for (unsigned k = i0; k < i; ++k)
2200 if (k == j)
2201 continue;
2202 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2203 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2204 dr_infoj))
2205 n_same_align_refs[j]++;
2208 i0 = i;
2212 /* While cost model enhancements are expected in the future, the high level
2213 view of the code at this time is as follows:
2215 A) If there is a misaligned access then see if peeling to align
2216 this access can make all data references satisfy
2217 vect_supportable_dr_alignment. If so, update data structures
2218 as needed and return true.
2220 B) If peeling wasn't possible and there is a data reference with an
2221 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2222 then see if loop versioning checks can be used to make all data
2223 references satisfy vect_supportable_dr_alignment. If so, update
2224 data structures as needed and return true.
2226 C) If neither peeling nor versioning were successful then return false if
2227 any data reference does not satisfy vect_supportable_dr_alignment.
2229 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2231 Note, Possibility 3 above (which is peeling and versioning together) is not
2232 being done at this time. */
2234 /* (1) Peeling to force alignment. */
2236 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2237 Considerations:
2238 + How many accesses will become aligned due to the peeling
2239 - How many accesses will become unaligned due to the peeling,
2240 and the cost of misaligned accesses.
2241 - The cost of peeling (the extra runtime checks, the increase
2242 in code size). */
2244 FOR_EACH_VEC_ELT (datarefs, i, dr)
2246 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2247 if (!vect_relevant_for_alignment_p (dr_info))
2248 continue;
2250 stmt_vec_info stmt_info = dr_info->stmt;
2251 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2252 do_peeling = vector_alignment_reachable_p (dr_info);
2253 if (do_peeling)
2255 if (known_alignment_for_access_p (dr_info, vectype))
2257 unsigned int npeel_tmp = 0;
2258 bool negative = tree_int_cst_compare (DR_STEP (dr),
2259 size_zero_node) < 0;
2261 /* If known_alignment_for_access_p then we have set
2262 DR_MISALIGNMENT which is only done if we know it at compile
2263 time, so it is safe to assume target alignment is constant.
2265 unsigned int target_align =
2266 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2267 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2268 poly_int64 off = 0;
2269 if (negative)
2270 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2271 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2272 mis = negative ? mis : -mis;
2273 if (mis != 0)
2274 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2276 /* For multiple types, it is possible that the bigger type access
2277 will have more than one peeling option. E.g., a loop with two
2278 types: one of size (vector size / 4), and the other one of
2279 size (vector size / 8). Vectorization factor will be 8. If both
2280 accesses are misaligned by 3, the first one needs one scalar
2281 iteration to be aligned, and the second one needs 5. But the
2282 first one will be aligned also by peeling 5 scalar
2283 iterations, and in that case both accesses will be aligned.
2284 Hence, except for the immediate peeling amount, we also want
2285 to try adding a full vector size, as long as we don't exceed the
2286 vectorization factor.
2287 We do this automatically for cost model, since we calculate
2288 cost for every peeling option. */
2289 poly_uint64 nscalars = npeel_tmp;
2290 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2292 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2293 nscalars = (STMT_SLP_TYPE (stmt_info)
2294 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2297 /* Save info about DR in the hash table. Also include peeling
2298 amounts according to the explanation above. Indicate
2299 the alignment status when the ref is not aligned.
2300 ??? Rather than using unknown alignment here we should
2301 prune all entries from the peeling hashtable which cause
2302 DRs to be not supported. */
2303 bool supportable_if_not_aligned
2304 = vect_supportable_dr_alignment
2305 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2306 while (known_le (npeel_tmp, nscalars))
2308 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2309 dr_info, npeel_tmp,
2310 supportable_if_not_aligned);
2311 npeel_tmp += MAX (1, target_align / dr_size);
2314 one_misalignment_known = true;
2316 else
2318 /* If we don't know any misalignment values, we prefer
2319 peeling for the data-ref that has the maximum number of data-refs
2320 with the same alignment, unless the target prefers to align
2321 stores over loads. */
2322 unsigned same_align_drs = n_same_align_refs[i];
2323 if (!dr0_info
2324 || dr0_same_align_drs < same_align_drs)
2326 dr0_same_align_drs = same_align_drs;
2327 dr0_info = dr_info;
2329 /* For data-refs with the same number of related
2330 accesses prefer the one where the misalign
2331 computation will be invariant in the outermost loop. */
2332 else if (dr0_same_align_drs == same_align_drs)
2334 class loop *ivloop0, *ivloop;
2335 ivloop0 = outermost_invariant_loop_for_expr
2336 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2337 ivloop = outermost_invariant_loop_for_expr
2338 (loop, DR_BASE_ADDRESS (dr));
2339 if ((ivloop && !ivloop0)
2340 || (ivloop && ivloop0
2341 && flow_loop_nested_p (ivloop, ivloop0)))
2342 dr0_info = dr_info;
2345 one_misalignment_unknown = true;
2347 /* Check for data refs with unsupportable alignment that
2348 can be peeled. */
2349 enum dr_alignment_support supportable_dr_alignment
2350 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2351 DR_MISALIGNMENT_UNKNOWN);
2352 if (supportable_dr_alignment == dr_unaligned_unsupported)
2354 one_dr_unsupportable = true;
2355 unsupportable_dr_info = dr_info;
2358 if (!first_store && DR_IS_WRITE (dr))
2360 first_store = dr_info;
2361 first_store_same_align_drs = same_align_drs;
2365 else
2367 if (!aligned_access_p (dr_info, vectype))
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371 "vector alignment may not be reachable\n");
2372 break;
2377 /* Check if we can possibly peel the loop. */
2378 if (!vect_can_advance_ivs_p (loop_vinfo)
2379 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2380 loop_preheader_edge (loop))
2381 || loop->inner
2382 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2383 do_peeling = false;
2385 struct _vect_peel_extended_info peel_for_known_alignment;
2386 struct _vect_peel_extended_info peel_for_unknown_alignment;
2387 struct _vect_peel_extended_info best_peel;
2389 peel_for_unknown_alignment.inside_cost = INT_MAX;
2390 peel_for_unknown_alignment.outside_cost = INT_MAX;
2391 peel_for_unknown_alignment.peel_info.count = 0;
2393 if (do_peeling
2394 && one_misalignment_unknown)
2396 /* Check whether the target prefers to align stores over loads, i.e., if
2397 misaligned stores are more expensive than misaligned loads (taking
2398 drs with same alignment into account). */
2399 unsigned int load_inside_cost = 0;
2400 unsigned int load_outside_cost = 0;
2401 unsigned int store_inside_cost = 0;
2402 unsigned int store_outside_cost = 0;
2403 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2405 stmt_vector_for_cost dummy;
2406 dummy.create (2);
2407 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2408 &load_inside_cost,
2409 &load_outside_cost,
2410 &dummy, &dummy, estimated_npeels);
2411 dummy.release ();
2413 if (first_store)
2415 dummy.create (2);
2416 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2417 &store_inside_cost,
2418 &store_outside_cost,
2419 &dummy, &dummy,
2420 estimated_npeels);
2421 dummy.release ();
2423 else
2425 store_inside_cost = INT_MAX;
2426 store_outside_cost = INT_MAX;
2429 if (load_inside_cost > store_inside_cost
2430 || (load_inside_cost == store_inside_cost
2431 && load_outside_cost > store_outside_cost))
2433 dr0_info = first_store;
2434 dr0_same_align_drs = first_store_same_align_drs;
2435 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2436 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2438 else
2440 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2441 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2444 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2445 prologue_cost_vec.create (2);
2446 epilogue_cost_vec.create (2);
2448 int dummy2;
2449 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2450 (loop_vinfo, estimated_npeels, &dummy2,
2451 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2452 &prologue_cost_vec, &epilogue_cost_vec);
2454 prologue_cost_vec.release ();
2455 epilogue_cost_vec.release ();
2457 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2460 peel_for_unknown_alignment.peel_info.npeel = 0;
2461 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2463 best_peel = peel_for_unknown_alignment;
2465 peel_for_known_alignment.inside_cost = INT_MAX;
2466 peel_for_known_alignment.outside_cost = INT_MAX;
2467 peel_for_known_alignment.peel_info.count = 0;
2468 peel_for_known_alignment.peel_info.dr_info = NULL;
2470 if (do_peeling && one_misalignment_known)
2472 /* Peeling is possible, but there is no data access that is not supported
2473 unless aligned. So we try to choose the best possible peeling from
2474 the hash table. */
2475 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2476 (&peeling_htab, loop_vinfo);
2479 /* Compare costs of peeling for known and unknown alignment. */
2480 if (peel_for_known_alignment.peel_info.dr_info != NULL
2481 && peel_for_unknown_alignment.inside_cost
2482 >= peel_for_known_alignment.inside_cost)
2484 best_peel = peel_for_known_alignment;
2486 /* If the best peeling for known alignment has NPEEL == 0, perform no
2487 peeling at all except if there is an unsupportable dr that we can
2488 align. */
2489 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2490 do_peeling = false;
2493 /* If there is an unsupportable data ref, prefer this over all choices so far
2494 since we'd have to discard a chosen peeling except when it accidentally
2495 aligned the unsupportable data ref. */
2496 if (one_dr_unsupportable)
2497 dr0_info = unsupportable_dr_info;
2498 else if (do_peeling)
2500 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2501 TODO: Use nopeel_outside_cost or get rid of it? */
2502 unsigned nopeel_inside_cost = 0;
2503 unsigned nopeel_outside_cost = 0;
2505 stmt_vector_for_cost dummy;
2506 dummy.create (2);
2507 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2508 &nopeel_outside_cost, &dummy, &dummy, 0);
2509 dummy.release ();
2511 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2512 costs will be recorded. */
2513 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2514 prologue_cost_vec.create (2);
2515 epilogue_cost_vec.create (2);
2517 int dummy2;
2518 nopeel_outside_cost += vect_get_known_peeling_cost
2519 (loop_vinfo, 0, &dummy2,
2520 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2521 &prologue_cost_vec, &epilogue_cost_vec);
2523 prologue_cost_vec.release ();
2524 epilogue_cost_vec.release ();
2526 npeel = best_peel.peel_info.npeel;
2527 dr0_info = best_peel.peel_info.dr_info;
2529 /* If not peeling is no more expensive than the best peeling found
2530 so far, don't perform any peeling. */
2531 if (nopeel_inside_cost <= best_peel.inside_cost)
2532 do_peeling = false;
2535 if (do_peeling)
2537 stmt_vec_info stmt_info = dr0_info->stmt;
2538 if (known_alignment_for_access_p (dr0_info,
2539 STMT_VINFO_VECTYPE (stmt_info)))
2541 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2542 size_zero_node) < 0;
2543 if (!npeel)
2545 /* Since it's known at compile time, compute the number of
2546 iterations in the peeled loop (the peeling factor) for use in
2547 updating DR_MISALIGNMENT values. The peeling factor is the
2548 vectorization factor minus the misalignment as an element
2549 count. */
2550 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2551 poly_int64 off = 0;
2552 if (negative)
2553 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2554 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2555 unsigned int mis
2556 = dr_misalignment (dr0_info, vectype, off);
2557 mis = negative ? mis : -mis;
2558 /* If known_alignment_for_access_p then we have set
2559 DR_MISALIGNMENT which is only done if we know it at compile
2560 time, so it is safe to assume target alignment is constant.
2562 unsigned int target_align =
2563 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2564 npeel = ((mis & (target_align - 1))
2565 / vect_get_scalar_dr_size (dr0_info));
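/* As a worked example (illustrative numbers only): with a 16-byte
   target alignment, 4-byte scalars and a positive-step access
   misaligned by 4 bytes, MIS becomes -4, (-4 & 15) = 12 and
   12 / 4 = 3 scalar iterations are peeled to reach alignment.  */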
2568 /* For interleaved data access every iteration accesses all the
2569 members of the group, therefore we divide the number of iterations
2570 by the group size. */
2571 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2572 npeel /= DR_GROUP_SIZE (stmt_info);
2574 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "Try peeling by %d\n", npeel);
2579 /* Ensure that all datarefs can be vectorized after the peel. */
2580 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2581 do_peeling = false;
2583 /* Check if all datarefs are supportable and log. */
2584 if (do_peeling
2585 && npeel == 0
2586 && known_alignment_for_access_p (dr0_info,
2587 STMT_VINFO_VECTYPE (stmt_info)))
2588 return opt_result::success ();
2590 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2591 if (do_peeling)
2593 unsigned max_allowed_peel
2594 = param_vect_max_peeling_for_alignment;
2595 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2596 max_allowed_peel = 0;
2597 if (max_allowed_peel != (unsigned)-1)
2599 unsigned max_peel = npeel;
2600 if (max_peel == 0)
2602 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2603 unsigned HOST_WIDE_INT target_align_c;
2604 if (target_align.is_constant (&target_align_c))
2605 max_peel =
2606 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2607 else
2609 do_peeling = false;
2610 if (dump_enabled_p ())
2611 dump_printf_loc (MSG_NOTE, vect_location,
2612 "Disable peeling, max peels set and vector"
2613 " alignment unknown\n");
2616 if (max_peel > max_allowed_peel)
2618 do_peeling = false;
2619 if (dump_enabled_p ())
2620 dump_printf_loc (MSG_NOTE, vect_location,
2621 "Disable peeling, max peels reached: %d\n", max_peel);
2626 /* Cost model #2 - if peeling may result in a remaining loop not
2627 iterating enough to be vectorized then do not peel. Since this
2628 is a cost heuristic rather than a correctness decision, use the
2629 most likely runtime value for variable vectorization factors. */
2630 if (do_peeling
2631 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2633 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2634 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2635 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2636 < assumed_vf + max_peel)
2637 do_peeling = false;
2640 if (do_peeling)
2642 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2643 If the misalignment of DR_i is identical to that of dr0 then set
2644 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2645 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2646 by the peeling factor times the element size of DR_i (MOD the
2647 vectorization factor times the size). Otherwise, the
2648 misalignment of DR_i must be set to unknown. */
2649 FOR_EACH_VEC_ELT (datarefs, i, dr)
2650 if (dr != dr0_info->dr)
2652 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2653 if (!vect_relevant_for_alignment_p (dr_info))
2654 continue;
2656 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2659 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2660 if (npeel)
2661 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2662 else
2663 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2664 SET_DR_MISALIGNMENT (dr0_info,
2665 vect_dr_misalign_for_aligned_access (dr0_info));
2666 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "Alignment of access forced using peeling.\n");
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "Peeling for alignment will be applied.\n");
2674 /* The inside-loop cost will be accounted for in vectorizable_load
2675 and vectorizable_store correctly with adjusted alignments.
2676 Drop the body_cost_vec on the floor here. */
2677 return opt_result::success ();
2681 /* (2) Versioning to force alignment. */
2683 /* Try versioning if:
2684 1) optimize loop for speed and the cost-model is not cheap
2685 2) there is at least one unsupported misaligned data ref with an unknown
2686 misalignment, and
2687 3) all misaligned data refs with a known misalignment are supported, and
2688 4) the number of runtime alignment checks is within reason. */
2690 do_versioning
2691 = (optimize_loop_nest_for_speed_p (loop)
2692 && !loop->inner /* FORNOW */
2693 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2695 if (do_versioning)
2697 FOR_EACH_VEC_ELT (datarefs, i, dr)
2699 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2700 if (!vect_relevant_for_alignment_p (dr_info))
2701 continue;
2703 stmt_vec_info stmt_info = dr_info->stmt;
2704 if (STMT_VINFO_STRIDED_P (stmt_info))
2706 do_versioning = false;
2707 break;
2710 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2711 bool negative = tree_int_cst_compare (DR_STEP (dr),
2712 size_zero_node) < 0;
2713 poly_int64 off = 0;
2714 if (negative)
2715 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2716 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2717 int misalignment;
2718 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2719 continue;
2721 enum dr_alignment_support supportable_dr_alignment
2722 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2723 misalignment);
2724 if (supportable_dr_alignment == dr_unaligned_unsupported)
2726 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2727 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2728 >= (unsigned) param_vect_max_version_for_alignment_checks))
2730 do_versioning = false;
2731 break;
2734 /* At present we don't support versioning for alignment
2735 with variable VF, since there's no guarantee that the
2736 VF is a power of two. We could relax this if we added
2737 a way of enforcing a power-of-two size. */
2738 unsigned HOST_WIDE_INT size;
2739 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2741 do_versioning = false;
2742 break;
2745 /* Forcing alignment in the first iteration is no good if
2746 we don't keep it across iterations. For now, just disable
2747 versioning in this case.
2748 ?? We could actually unroll the loop to achieve the required
2749 overall step alignment, and forcing the alignment could be
2750 done by doing some iterations of the non-vectorized loop. */
2751 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2752 * DR_STEP_ALIGNMENT (dr),
2753 DR_TARGET_ALIGNMENT (dr_info)))
2755 do_versioning = false;
2756 break;
2759 /* The rightmost bits of an aligned address must be zeros.
2760 Construct the mask needed for this test. For example,
2761 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2762 mask must be 15 = 0xf. */
2763 int mask = size - 1;
2765 /* FORNOW: use the same mask to test all potentially unaligned
2766 references in the loop. */
2767 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2768 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2770 do_versioning = false;
2771 break;
2774 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2775 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2779 /* Versioning requires at least one misaligned data reference. */
2780 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2781 do_versioning = false;
2782 else if (!do_versioning)
2783 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2786 if (do_versioning)
2788 const vec<stmt_vec_info> &may_misalign_stmts
2789 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2790 stmt_vec_info stmt_info;
2792 /* It can now be assumed that the data references in the statements
2793 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2794 of the loop being vectorized. */
2795 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2797 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2798 SET_DR_MISALIGNMENT (dr_info,
2799 vect_dr_misalign_for_aligned_access (dr_info));
2800 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_NOTE, vect_location,
2802 "Alignment of access forced using versioning.\n");
2805 if (dump_enabled_p ())
2806 dump_printf_loc (MSG_NOTE, vect_location,
2807 "Versioning for alignment will be applied.\n");
2809 /* Peeling and versioning can't be done together at this time. */
2810 gcc_assert (! (do_peeling && do_versioning));
2812 return opt_result::success ();
2815 /* This point is reached if neither peeling nor versioning is being done. */
2816 gcc_assert (! (do_peeling || do_versioning));
2818 return opt_result::success ();
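/* A minimal sketch of the run-time test that versioning for alignment
   relies on (hypothetical helper, plain pointers); it only illustrates
   the mask derived from the vector size above, e.g. 16-byte vectors
   give a mask of 0xf:

     static bool
     example_pointer_is_aligned_p (const void *p, uintptr_t mask)
     {
       return ((uintptr_t) p & mask) == 0;
     }

   The version of the loop that assumes alignment is entered only when
   every such test succeeds; otherwise the scalar copy of the loop
   runs.  */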
2822 /* Function vect_analyze_data_refs_alignment
2824 Analyze the alignment of the data-references in the loop.
2825 Return FALSE if a data reference is found that cannot be vectorized. */
2827 opt_result
2828 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2830 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2832 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2833 struct data_reference *dr;
2834 unsigned int i;
2836 vect_record_base_alignments (loop_vinfo);
2837 FOR_EACH_VEC_ELT (datarefs, i, dr)
2839 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2840 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2842 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2843 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2844 continue;
2845 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2846 STMT_VINFO_VECTYPE (dr_info->stmt));
2850 return opt_result::success ();
2854 /* Analyze alignment of DRs of stmts in NODE. */
2856 static bool
2857 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2859 /* Alignment is maintained in the first element of the group. */
2860 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2861 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2862 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2863 tree vectype = SLP_TREE_VECTYPE (node);
2864 poly_uint64 vector_alignment
2865 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2866 BITS_PER_UNIT);
2867 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2868 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2869 /* Re-analyze alignment when we're facing a vectorization with a bigger
2870 alignment requirement. */
2871 else if (known_lt (dr_info->target_alignment, vector_alignment))
2873 poly_uint64 old_target_alignment = dr_info->target_alignment;
2874 int old_misalignment = dr_info->misalignment;
2875 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2876 /* But keep knowledge about a smaller alignment. */
2877 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2878 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2880 dr_info->target_alignment = old_target_alignment;
2881 dr_info->misalignment = old_misalignment;
2884 /* If we ever face unordered target alignments the first one wins in terms
2885 of analyzing and the others will become unknown in dr_misalignment. */
2886 return true;
2889 /* Function vect_slp_analyze_instance_alignment
2891 Analyze the alignment of the data-references in the SLP instance.
2892 Return FALSE if a data reference is found that cannot be vectorized. */
2894 bool
2895 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2896 slp_instance instance)
2898 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2900 slp_tree node;
2901 unsigned i;
2902 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2903 if (! vect_slp_analyze_node_alignment (vinfo, node))
2904 return false;
2906 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2907 && ! vect_slp_analyze_node_alignment
2908 (vinfo, SLP_INSTANCE_TREE (instance)))
2909 return false;
2911 return true;
2915 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2916 accesses of legal size, step, etc. Detect gaps, single element
2917 interleaving, and other special cases. Set grouped access info.
2918 Collect groups of strided stores for further use in SLP analysis.
2919 Worker for vect_analyze_group_access. */
2921 static bool
2922 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2924 data_reference *dr = dr_info->dr;
2925 tree step = DR_STEP (dr);
2926 tree scalar_type = TREE_TYPE (DR_REF (dr));
2927 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2928 stmt_vec_info stmt_info = dr_info->stmt;
2929 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2930 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2931 HOST_WIDE_INT dr_step = -1;
2932 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2933 bool slp_impossible = false;
2935 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2936 size of the interleaving group (including gaps). */
2937 if (tree_fits_shwi_p (step))
2939 dr_step = tree_to_shwi (step);
2940 /* Check that STEP is a multiple of type size. Otherwise there is
2941 a non-element-sized gap at the end of the group which we
2942 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2943 ??? As we can handle non-constant step fine here we should
2944 simply remove uses of DR_GROUP_GAP between the last and first
2945 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2946 simply not include that gap. */
2947 if ((dr_step % type_size) != 0)
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "Step %T is not a multiple of the element size"
2952 " for %T\n",
2953 step, DR_REF (dr));
2954 return false;
2956 groupsize = absu_hwi (dr_step) / type_size;
2958 else
2959 groupsize = 0;
2961 /* A non-consecutive access is possible only if it is part of interleaving. */
2962 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2964 /* Check if this DR is a part of interleaving, and is a single
2965 element of the group that is accessed in the loop. */
2967 /* Gaps are supported only for loads. STEP must be a multiple of the type
2968 size. */
2969 if (DR_IS_READ (dr)
2970 && (dr_step % type_size) == 0
2971 && groupsize > 0
2972 /* This could be UINT_MAX but as we are generating code in a very
2973 inefficient way we have to cap earlier.
2974 See PR91403 for example. */
2975 && groupsize <= 4096)
2977 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2978 DR_GROUP_SIZE (stmt_info) = groupsize;
2979 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "Detected single element interleaving %T"
2983 " step %T\n",
2984 DR_REF (dr), step);
2986 return true;
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2991 "not consecutive access %G", stmt_info->stmt);
2993 if (bb_vinfo)
2995 /* Mark the statement as unvectorizable. */
2996 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2997 return true;
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3002 STMT_VINFO_STRIDED_P (stmt_info) = true;
3003 return true;
3006 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3008 /* First stmt in the interleaving chain. Check the chain. */
3009 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3010 struct data_reference *data_ref = dr;
3011 unsigned int count = 1;
3012 tree prev_init = DR_INIT (data_ref);
3013 HOST_WIDE_INT diff, gaps = 0;
3015 /* By construction, all group members have INTEGER_CST DR_INITs. */
3016 while (next)
3018 /* We never have the same DR multiple times. */
3019 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3020 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3022 data_ref = STMT_VINFO_DATA_REF (next);
3024 /* All group members have the same STEP by construction. */
3025 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3027 /* Check that the distance between two accesses is equal to the type
3028 size. Otherwise, we have gaps. */
3029 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3030 - TREE_INT_CST_LOW (prev_init)) / type_size;
3031 if (diff < 1 || diff > UINT_MAX)
3033 /* For artificial testcases with array accesses with large
3034 constant indices we can run into overflow issues which
3035 can end up fooling the groupsize constraint below, so
3036 check the individual gaps (which are represented as
3037 unsigned int) as well. */
3038 if (dump_enabled_p ())
3039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3040 "interleaved access with gap larger "
3041 "than representable\n");
3042 return false;
3044 if (diff != 1)
3046 /* FORNOW: SLP of accesses with gaps is not supported. */
3047 slp_impossible = true;
3048 if (DR_IS_WRITE (data_ref))
3050 if (dump_enabled_p ())
3051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3052 "interleaved store with gaps\n");
3053 return false;
3056 gaps += diff - 1;
3059 last_accessed_element += diff;
3061 /* Store the gap from the previous member of the group. If there is no
3062 gap in the access, DR_GROUP_GAP is always 1. */
3063 DR_GROUP_GAP (next) = diff;
3065 prev_init = DR_INIT (data_ref);
3066 next = DR_GROUP_NEXT_ELEMENT (next);
3067 /* Count the number of data-refs in the chain. */
3068 count++;
3071 if (groupsize == 0)
3072 groupsize = count + gaps;
3074 /* This could be UINT_MAX but as we are generating code in a very
3075 inefficient way we have to cap earlier. See PR78699 for example. */
3076 if (groupsize > 4096)
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080 "group is too large\n");
3081 return false;
3084 /* Check that the size of the interleaving is equal to count for stores,
3085 i.e., that there are no gaps. */
3086 if (groupsize != count
3087 && !DR_IS_READ (dr))
3089 groupsize = count;
3090 STMT_VINFO_STRIDED_P (stmt_info) = true;
3093 /* If there is a gap after the last load in the group it is the
3094 difference between the groupsize and the last accessed
3095 element.
3096 When there is no gap, this difference should be 0. */
3097 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3099 DR_GROUP_SIZE (stmt_info) = groupsize;
3100 if (dump_enabled_p ())
3102 dump_printf_loc (MSG_NOTE, vect_location,
3103 "Detected interleaving ");
3104 if (DR_IS_READ (dr))
3105 dump_printf (MSG_NOTE, "load ");
3106 else if (STMT_VINFO_STRIDED_P (stmt_info))
3107 dump_printf (MSG_NOTE, "strided store ");
3108 else
3109 dump_printf (MSG_NOTE, "store ");
3110 dump_printf (MSG_NOTE, "of size %u\n",
3111 (unsigned)groupsize);
3112 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3113 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3114 while (next)
3116 if (DR_GROUP_GAP (next) != 1)
3117 dump_printf_loc (MSG_NOTE, vect_location,
3118 "\t<gap of %d elements>\n",
3119 DR_GROUP_GAP (next) - 1);
3120 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3121 next = DR_GROUP_NEXT_ELEMENT (next);
3123 if (DR_GROUP_GAP (stmt_info) != 0)
3124 dump_printf_loc (MSG_NOTE, vect_location,
3125 "\t<gap of %d elements>\n",
3126 DR_GROUP_GAP (stmt_info));
3129 /* SLP: create an SLP data structure for every interleaving group of
3130 stores for further analysis in vect_analyze_slp. */
3131 if (DR_IS_WRITE (dr) && !slp_impossible)
3133 if (loop_vinfo)
3134 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3135 if (bb_vinfo)
3136 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3140 return true;
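/* As an illustration of the grouping above (made-up accesses): loads
   a[4*i], a[4*i + 1] and a[4*i + 3] of 4-byte elements have DR_STEP 16,
   so GROUPSIZE = 16 / 4 = 4.  The DR_INIT differences are 1 and 2, so
   the second and third members get DR_GROUP_GAP 1 and 2, one element
   is missing, LAST_ACCESSED_ELEMENT becomes 4 and the gap recorded on
   the first statement is 4 - 4 = 0; SLP of the group is marked
   impossible because of the gap.  */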
3143 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3144 accesses of legal size, step, etc. Detect gaps, single element
3145 interleaving, and other special cases. Set grouped access info.
3146 Collect groups of strided stores for further use in SLP analysis. */
3148 static bool
3149 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3151 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3153 /* Dissolve the group if present. */
3154 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3155 while (stmt_info)
3157 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3158 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3159 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3160 stmt_info = next;
3162 return false;
3164 return true;
3167 /* Analyze the access pattern of the data-reference DR_INFO.
3168 In case of non-consecutive accesses call vect_analyze_group_access() to
3169 analyze groups of accesses. */
3171 static bool
3172 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3174 data_reference *dr = dr_info->dr;
3175 tree step = DR_STEP (dr);
3176 tree scalar_type = TREE_TYPE (DR_REF (dr));
3177 stmt_vec_info stmt_info = dr_info->stmt;
3178 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3179 class loop *loop = NULL;
3181 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3182 return true;
3184 if (loop_vinfo)
3185 loop = LOOP_VINFO_LOOP (loop_vinfo);
3187 if (loop_vinfo && !step)
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3191 "bad data-ref access in loop\n");
3192 return false;
3195 /* Allow loads with zero step in inner-loop vectorization. */
3196 if (loop_vinfo && integer_zerop (step))
3198 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3199 if (!nested_in_vect_loop_p (loop, stmt_info))
3200 return DR_IS_READ (dr);
3201 /* Allow references with zero step for outer loops marked
3202 with pragma omp simd only - it guarantees absence of
3203 loop-carried dependencies between inner loop iterations. */
3204 if (loop->safelen < 2)
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_NOTE, vect_location,
3208 "zero step in inner loop of nest\n");
3209 return false;
3213 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3215 /* Interleaved accesses are not yet supported within outer-loop
3216 vectorization for references in the inner-loop. */
3217 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3219 /* For the rest of the analysis we use the outer-loop step. */
3220 step = STMT_VINFO_DR_STEP (stmt_info);
3221 if (integer_zerop (step))
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE, vect_location,
3225 "zero step in outer loop.\n");
3226 return DR_IS_READ (dr);
3230 /* Consecutive? */
3231 if (TREE_CODE (step) == INTEGER_CST)
3233 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3234 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3235 || (dr_step < 0
3236 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3238 /* Mark that it is not interleaving. */
3239 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3240 return true;
3244 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3246 if (dump_enabled_p ())
3247 dump_printf_loc (MSG_NOTE, vect_location,
3248 "grouped access in outer loop.\n");
3249 return false;
3253 /* Assume this is a DR handled by non-constant strided load case. */
3254 if (TREE_CODE (step) != INTEGER_CST)
3255 return (STMT_VINFO_STRIDED_P (stmt_info)
3256 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3257 || vect_analyze_group_access (vinfo, dr_info)));
3259 /* Non-consecutive access - check if it's part of an interleaving group. */
3260 return vect_analyze_group_access (vinfo, dr_info);
3263 /* Compare two data-references DRA and DRB to order them into chunks
3264 suitable for grouping. */
3266 static int
3267 dr_group_sort_cmp (const void *dra_, const void *drb_)
3269 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3270 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3271 data_reference_p dra = dra_info->dr;
3272 data_reference_p drb = drb_info->dr;
3273 int cmp;
3275 /* Stabilize sort. */
3276 if (dra == drb)
3277 return 0;
3279 /* DRs with different group IDs never belong to the same group. */
3280 if (dra_info->group != drb_info->group)
3281 return dra_info->group < drb_info->group ? -1 : 1;
3283 /* Ordering of DRs according to base. */
3284 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3285 DR_BASE_ADDRESS (drb));
3286 if (cmp != 0)
3287 return cmp;
3289 /* And according to DR_OFFSET. */
3290 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3291 if (cmp != 0)
3292 return cmp;
3294 /* Put reads before writes. */
3295 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3296 return DR_IS_READ (dra) ? -1 : 1;
3298 /* Then sort after access size. */
3299 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3300 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3301 if (cmp != 0)
3302 return cmp;
3304 /* And after step. */
3305 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3306 if (cmp != 0)
3307 return cmp;
3309 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3310 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3311 if (cmp == 0)
3312 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3313 return cmp;
3316 /* If OP is the result of a conversion, return the unconverted value,
3317 otherwise return null. */
3319 static tree
3320 strip_conversion (tree op)
3322 if (TREE_CODE (op) != SSA_NAME)
3323 return NULL_TREE;
3324 gimple *stmt = SSA_NAME_DEF_STMT (op);
3325 if (!is_gimple_assign (stmt)
3326 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3327 return NULL_TREE;
3328 return gimple_assign_rhs1 (stmt);
3331 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3332 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3333 be grouped in SLP mode. */
3335 static bool
3336 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3337 bool allow_slp_p)
3339 if (gimple_assign_single_p (stmt1_info->stmt))
3340 return gimple_assign_single_p (stmt2_info->stmt);
3342 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3343 if (call1 && gimple_call_internal_p (call1))
3345 /* Check for two masked loads or two masked stores. */
3346 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3347 if (!call2 || !gimple_call_internal_p (call2))
3348 return false;
3349 internal_fn ifn = gimple_call_internal_fn (call1);
3350 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3351 return false;
3352 if (ifn != gimple_call_internal_fn (call2))
3353 return false;
3355 /* Check that the masks are the same. Cope with casts of masks,
3356 like those created by build_mask_conversion. */
3357 tree mask1 = gimple_call_arg (call1, 2);
3358 tree mask2 = gimple_call_arg (call2, 2);
3359 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3361 mask1 = strip_conversion (mask1);
3362 if (!mask1)
3363 return false;
3364 mask2 = strip_conversion (mask2);
3365 if (!mask2)
3366 return false;
3367 if (!operand_equal_p (mask1, mask2, 0))
3368 return false;
3370 return true;
3373 return false;
3376 /* Function vect_analyze_data_ref_accesses.
3378 Analyze the access pattern of all the data references in the loop.
3380 FORNOW: the only access pattern that is considered vectorizable is a
3381 simple step 1 (consecutive) access.
3383 FORNOW: handle only arrays and pointer accesses. */
3385 opt_result
3386 vect_analyze_data_ref_accesses (vec_info *vinfo,
3387 vec<int> *dataref_groups)
3389 unsigned int i;
3390 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3392 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3394 if (datarefs.is_empty ())
3395 return opt_result::success ();
3397 /* Sort the array of datarefs to make building the interleaving chains
3398 linear. Don't modify the original vector's order; it is needed for
3399 determining what dependencies are reversed. */
3400 vec<dr_vec_info *> datarefs_copy;
3401 datarefs_copy.create (datarefs.length ());
3402 for (unsigned i = 0; i < datarefs.length (); i++)
3404 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3405 /* If the caller computed DR grouping use that, otherwise group by
3406 basic blocks. */
3407 if (dataref_groups)
3408 dr_info->group = (*dataref_groups)[i];
3409 else
3410 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3411 datarefs_copy.quick_push (dr_info);
3413 datarefs_copy.qsort (dr_group_sort_cmp);
3414 hash_set<stmt_vec_info> to_fixup;
3416 /* Build the interleaving chains. */
3417 for (i = 0; i < datarefs_copy.length () - 1;)
3419 dr_vec_info *dr_info_a = datarefs_copy[i];
3420 data_reference_p dra = dr_info_a->dr;
3421 int dra_group_id = dr_info_a->group;
3422 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3423 stmt_vec_info lastinfo = NULL;
3424 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3425 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3427 ++i;
3428 continue;
3430 for (i = i + 1; i < datarefs_copy.length (); ++i)
3432 dr_vec_info *dr_info_b = datarefs_copy[i];
3433 data_reference_p drb = dr_info_b->dr;
3434 int drb_group_id = dr_info_b->group;
3435 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3436 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3437 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3438 break;
3440	  /* ???  Imperfect sorting (non-compatible types, non-modulo
3441	     accesses, same accesses) can lead to a group being artificially
3442	     split here as we don't just skip over those.  If it really
3443	     matters we can push those to a worklist and re-iterate
3444	     over them.  Then we can just skip ahead to the next DR here.  */
3446 /* DRs in a different DR group should not be put into the same
3447 interleaving group. */
3448 if (dra_group_id != drb_group_id)
3449 break;
3451	  /* Check that the data-refs have the same first location (except init)
3452 and they are both either store or load (not load and store,
3453 not masked loads or stores). */
3454 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3455 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3456 DR_BASE_ADDRESS (drb)) != 0
3457 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3458 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3459 break;
3461 /* Check that the data-refs have the same constant size. */
3462 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3463 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3464 if (!tree_fits_uhwi_p (sza)
3465 || !tree_fits_uhwi_p (szb)
3466 || !tree_int_cst_equal (sza, szb))
3467 break;
3469 /* Check that the data-refs have the same step. */
3470 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3471 break;
3473 /* Check the types are compatible.
3474 ??? We don't distinguish this during sorting. */
3475 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3476 TREE_TYPE (DR_REF (drb))))
3477 break;
3479 /* Check that the DR_INITs are compile-time constants. */
3480 if (!tree_fits_shwi_p (DR_INIT (dra))
3481 || !tree_fits_shwi_p (DR_INIT (drb)))
3482 break;
3484 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3485 just hold extra information. */
3486 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3487 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3488 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3489 break;
3491 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3492 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3493 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3494 HOST_WIDE_INT init_prev
3495 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3496 gcc_assert (init_a <= init_b
3497 && init_a <= init_prev
3498 && init_prev <= init_b);
3500 /* Do not place the same access in the interleaving chain twice. */
3501 if (init_b == init_prev)
3503 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3504 < gimple_uid (DR_STMT (drb)));
3505 /* Simply link in duplicates and fix up the chain below. */
3507 else
3509 /* If init_b == init_a + the size of the type * k, we have an
3510 interleaving, and DRA is accessed before DRB. */
3511 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3512 if (type_size_a == 0
3513 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3514 % type_size_a != 0))
3515 break;
3517 /* If we have a store, the accesses are adjacent. This splits
3518 groups into chunks we support (we don't support vectorization
3519 of stores with gaps). */
3520 if (!DR_IS_READ (dra)
3521 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3522 != type_size_a))
3523 break;
3525	      /* If the step (when constant and nonzero) is smaller than the
3526		 difference between the data-refs' inits, this splits groups into
3527 suitable sizes. */
3528 if (tree_fits_shwi_p (DR_STEP (dra)))
3530 unsigned HOST_WIDE_INT step
3531 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3532 if (step != 0
3533 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3534 break;
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_NOTE, vect_location,
3540 DR_IS_READ (dra)
3541 ? "Detected interleaving load %T and %T\n"
3542 : "Detected interleaving store %T and %T\n",
3543 DR_REF (dra), DR_REF (drb));
3545 /* Link the found element into the group list. */
3546 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3548 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3549 lastinfo = stmtinfo_a;
3551 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3552 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3553 lastinfo = stmtinfo_b;
3555 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3556 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3558 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "Load suitable for SLP vectorization only.\n");
3562 if (init_b == init_prev
3563 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3564 && dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE, vect_location,
3566 "Queuing group with duplicate access for fixup\n");
3570   /* Fixup groups with duplicate entries by splitting them.  */
3571 while (1)
3573 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3574 if (!(it != to_fixup.end ()))
3575 break;
3576 stmt_vec_info grp = *it;
3577 to_fixup.remove (grp);
3579 /* Find the earliest duplicate group member. */
3580 unsigned first_duplicate = -1u;
3581 stmt_vec_info next, g = grp;
3582 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3584 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3585 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3586 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3587 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3588 g = next;
3590 if (first_duplicate == -1U)
3591 continue;
3593 /* Then move all stmts after the first duplicate to a new group.
3594 Note this is a heuristic but one with the property that *it
3595 is fixed up completely. */
3596 g = grp;
3597 stmt_vec_info newgroup = NULL, ng = grp;
3598 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3600 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3602 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3603 if (!newgroup)
3604 newgroup = next;
3605 else
3606 DR_GROUP_NEXT_ELEMENT (ng) = next;
3607 ng = next;
3608 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3610 else
3611 g = DR_GROUP_NEXT_ELEMENT (g);
3613 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3615 /* Fixup the new group which still may contain duplicates. */
3616 to_fixup.add (newgroup);
3619 dr_vec_info *dr_info;
3620 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3622 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3623 && !vect_analyze_data_ref_access (vinfo, dr_info))
3625 if (dump_enabled_p ())
3626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3627 "not vectorized: complicated access pattern.\n");
3629 if (is_a <bb_vec_info> (vinfo))
3631 /* Mark the statement as not vectorizable. */
3632 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3633 continue;
3635 else
3637 datarefs_copy.release ();
3638 return opt_result::failure_at (dr_info->stmt->stmt,
3639 "not vectorized:"
3640 " complicated access pattern.\n");
3645 datarefs_copy.release ();
3646 return opt_result::success ();
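/* Worked example for the grouping above (loop and names invented):

     for (int i = 0; i < n; i++)
       {
	 q[2*i]     = a[i];
	 q[2*i + 1] = b[i];
       }

   The two stores to q share base, offset, element size and step and have
   DR_INITs 0 and 4 (4-byte elements), so they become one interleaving
   store group of size 2.  Stores that leave a gap, e.g. writing only
   q[3*i] and q[3*i + 2], are not chained together because stores with
   gaps are not vectorized.  */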
3649 /* Function vect_vfa_segment_size.
3651 Input:
3652 DR_INFO: The data reference.
3653 LENGTH_FACTOR: segment length to consider.
3655 Return a value suitable for the dr_with_seg_len::seg_len field.
3656 This is the "distance travelled" by the pointer from the first
3657 iteration in the segment to the last. Note that it does not include
3658 the size of the access; in effect it only describes the first byte. */
3660 static tree
3661 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3663 length_factor = size_binop (MINUS_EXPR,
3664 fold_convert (sizetype, length_factor),
3665 size_one_node);
3666 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3667 length_factor);
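/* Small worked example (numbers invented): with DR_STEP 4 and a
   LENGTH_FACTOR of 8 the value computed above is 4 * (8 - 1) = 28 bytes,
   i.e. the distance from the first byte accessed in the first iteration
   of the segment to the first byte accessed in the last one; the access
   size itself is accounted for separately by vect_vfa_access_size.  */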
3670 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3671 gives the worst-case number of bytes covered by the segment. */
3673 static unsigned HOST_WIDE_INT
3674 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3676 stmt_vec_info stmt_vinfo = dr_info->stmt;
3677 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3678 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3679 unsigned HOST_WIDE_INT access_size = ref_size;
3680 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3682 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3683 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3685 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3686 int misalignment;
3687 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3688 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3689 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3690 == dr_explicit_realign_optimized))
3692 /* We might access a full vector's worth. */
3693 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3695 return access_size;
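/* For example (numbers invented): a 4-byte scalar access that is the
   first element of an interleaving group of size 4 with no gap gives an
   access size of 4 * 4 = 16 bytes; the segment length from
   vect_vfa_segment_size plus this value bounds the byte range a runtime
   alias check has to cover.  */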
3698 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3699 describes. */
3701 static unsigned int
3702 vect_vfa_align (dr_vec_info *dr_info)
3704 return dr_alignment (dr_info->dr);
3707 /* Function vect_compile_time_alias.
3709 Given data references A and B with equal base and offset, see whether
3710 the alias relation can be decided at compilation time. Return 1 if
3711 it can and the references alias, 0 if it can and the references do
3712 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3713 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3714 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3716 static int
3717 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3718 tree segment_length_a, tree segment_length_b,
3719 unsigned HOST_WIDE_INT access_size_a,
3720 unsigned HOST_WIDE_INT access_size_b)
3722 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3723 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3724 poly_uint64 const_length_a;
3725 poly_uint64 const_length_b;
3727 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3728 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3729 [a, a+12) */
3730 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3732 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3733 offset_a -= const_length_a;
3735 else
3736 const_length_a = tree_to_poly_uint64 (segment_length_a);
3737 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3739 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3740 offset_b -= const_length_b;
3742 else
3743 const_length_b = tree_to_poly_uint64 (segment_length_b);
3745 const_length_a += access_size_a;
3746 const_length_b += access_size_b;
3748 if (ranges_known_overlap_p (offset_a, const_length_a,
3749 offset_b, const_length_b))
3750 return 1;
3752 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3753 offset_b, const_length_b))
3754 return 0;
3756 return -1;
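/* Worked example (constants invented): with positive steps, DR_INITs 0
   and 8, segment lengths of 12 bytes and access sizes of 4 bytes, the
   ranges compared above are [0, 16) and [8, 24), which are known to
   overlap, so the function returns 1.  If the second range instead
   started at byte 16 the ranges could not overlap and the result would
   be 0; when the overlap cannot be decided either way the result
   is -1.  */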
3759 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3760 in DDR is >= VF. */
3762 static bool
3763 dependence_distance_ge_vf (data_dependence_relation *ddr,
3764 unsigned int loop_depth, poly_uint64 vf)
3766 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3767 || DDR_NUM_DIST_VECTS (ddr) == 0)
3768 return false;
3770 /* If the dependence is exact, we should have limited the VF instead. */
3771 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3773 unsigned int i;
3774 lambda_vector dist_v;
3775 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3777 HOST_WIDE_INT dist = dist_v[loop_depth];
3778 if (dist != 0
3779 && !(dist > 0 && DDR_REVERSED_P (ddr))
3780 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3781 return false;
3784 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "dependence distance between %T and %T is >= VF\n",
3787 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3789 return true;
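/* Numeric illustration (values invented): if every recorded distance for
   the pair is 0, reversed-positive, or at least 8 in absolute value, then
   with a vectorization factor of 4 the function returns true and the
   caller can drop the runtime alias check for this pair; with VF 16 it
   returns false and the check stays.  */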
3792 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3794 static void
3795 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3797 dump_printf (dump_kind, "%s (%T) >= ",
3798 lower_bound.unsigned_p ? "unsigned" : "abs",
3799 lower_bound.expr);
3800 dump_dec (dump_kind, lower_bound.min_value);
3803 /* Record that the vectorized loop requires the vec_lower_bound described
3804 by EXPR, UNSIGNED_P and MIN_VALUE. */
3806 static void
3807 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3808 poly_uint64 min_value)
3810 vec<vec_lower_bound> &lower_bounds
3811 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3812 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3813 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3815 unsigned_p &= lower_bounds[i].unsigned_p;
3816 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3817 if (lower_bounds[i].unsigned_p != unsigned_p
3818 || maybe_lt (lower_bounds[i].min_value, min_value))
3820 lower_bounds[i].unsigned_p = unsigned_p;
3821 lower_bounds[i].min_value = min_value;
3822 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_NOTE, vect_location,
3825 "updating run-time check to ");
3826 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3827 dump_printf (MSG_NOTE, "\n");
3830 return;
3833 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3834 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3837 dump_lower_bound (MSG_NOTE, lower_bound);
3838 dump_printf (MSG_NOTE, "\n");
3840 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3843 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3844 will span fewer than GAP bytes. */
3846 static bool
3847 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3848 poly_int64 gap)
3850 stmt_vec_info stmt_info = dr_info->stmt;
3851 HOST_WIDE_INT count
3852 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3853 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3854 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3855 return (estimated_poly_value (gap)
3856 <= count * vect_get_scalar_dr_size (dr_info));
3859 /* Return true if we know that there is no alias between DR_INFO_A and
3860 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3861 When returning true, set *LOWER_BOUND_OUT to this N. */
3863 static bool
3864 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3865 poly_uint64 *lower_bound_out)
3867 /* Check that there is a constant gap of known sign between DR_A
3868 and DR_B. */
3869 data_reference *dr_a = dr_info_a->dr;
3870 data_reference *dr_b = dr_info_b->dr;
3871 poly_int64 init_a, init_b;
3872 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3873 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3874 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3875 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3876 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3877 || !ordered_p (init_a, init_b))
3878 return false;
3880 /* Sort DR_A and DR_B by the address they access. */
3881 if (maybe_lt (init_b, init_a))
3883 std::swap (init_a, init_b);
3884 std::swap (dr_info_a, dr_info_b);
3885 std::swap (dr_a, dr_b);
3888 /* If the two accesses could be dependent within a scalar iteration,
3889 make sure that we'd retain their order. */
3890 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3891 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3892 return false;
3894 /* There is no alias if abs (DR_STEP) is greater than or equal to
3895 the bytes spanned by the combination of the two accesses. */
3896 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3897 return true;
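/* Example of the bound computed above (names invented): for two 4-byte
   accesses p[i * s] and p[i * s + 1] with the same variable step, the
   sorted DR_INITs are 0 and 4, so *LOWER_BOUND_OUT becomes 4 + 4 - 0 = 8;
   the accesses cannot alias once abs (DR_STEP) is at least 8 bytes.  */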
3900 /* Function vect_prune_runtime_alias_test_list.
3902 Prune a list of ddrs to be tested at run-time by versioning for alias.
3903 Merge several alias checks into one if possible.
3904    Return FALSE if the resulting list of ddrs is longer than allowed by
3905 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3907 opt_result
3908 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3910 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3911 hash_set <tree_pair_hash> compared_objects;
3913 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3914 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3915 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3916 const vec<vec_object_pair> &check_unequal_addrs
3917 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3918 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3919 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3921 ddr_p ddr;
3922 unsigned int i;
3923 tree length_factor;
3925 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3927 /* Step values are irrelevant for aliasing if the number of vector
3928 iterations is equal to the number of scalar iterations (which can
3929 happen for fully-SLP loops). */
3930 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3932 if (!vf_one_p)
3934 /* Convert the checks for nonzero steps into bound tests. */
3935 tree value;
3936 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3937 vect_check_lower_bound (loop_vinfo, value, true, 1);
3940 if (may_alias_ddrs.is_empty ())
3941 return opt_result::success ();
3943 comp_alias_ddrs.create (may_alias_ddrs.length ());
3945 unsigned int loop_depth
3946 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3947 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3949 /* First, we collect all data ref pairs for aliasing checks. */
3950 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3952 poly_uint64 lower_bound;
3953 tree segment_length_a, segment_length_b;
3954 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3955 unsigned int align_a, align_b;
3957 /* Ignore the alias if the VF we chose ended up being no greater
3958 than the dependence distance. */
3959 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3960 continue;
3962 if (DDR_OBJECT_A (ddr))
3964 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3965 if (!compared_objects.add (new_pair))
3967 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_NOTE, vect_location,
3969 "checking that %T and %T"
3970 " have different addresses\n",
3971 new_pair.first, new_pair.second);
3972 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3974 continue;
3977 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3978 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3980 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3981 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3983 bool preserves_scalar_order_p
3984 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3985 bool ignore_step_p
3986 = (vf_one_p
3987 && (preserves_scalar_order_p
3988 || operand_equal_p (DR_STEP (dr_info_a->dr),
3989 DR_STEP (dr_info_b->dr))));
3991 /* Skip the pair if inter-iteration dependencies are irrelevant
3992 and intra-iteration dependencies are guaranteed to be honored. */
3993 if (ignore_step_p
3994 && (preserves_scalar_order_p
3995 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3996 &lower_bound)))
3998 if (dump_enabled_p ())
3999 dump_printf_loc (MSG_NOTE, vect_location,
4000 "no need for alias check between "
4001 "%T and %T when VF is 1\n",
4002 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4003 continue;
4006 /* See whether we can handle the alias using a bounds check on
4007 the step, and whether that's likely to be the best approach.
4008 (It might not be, for example, if the minimum step is much larger
4009 than the number of bytes handled by one vector iteration.) */
4010 if (!ignore_step_p
4011 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
4012 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4013 &lower_bound)
4014 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
4015 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4017 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4018 if (dump_enabled_p ())
4020 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4021 "%T and %T when the step %T is outside ",
4022 DR_REF (dr_info_a->dr),
4023 DR_REF (dr_info_b->dr),
4024 DR_STEP (dr_info_a->dr));
4025 if (unsigned_p)
4026 dump_printf (MSG_NOTE, "[0");
4027 else
4029 dump_printf (MSG_NOTE, "(");
4030 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4032 dump_printf (MSG_NOTE, ", ");
4033 dump_dec (MSG_NOTE, lower_bound);
4034 dump_printf (MSG_NOTE, ")\n");
4036 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4037 unsigned_p, lower_bound);
4038 continue;
4041 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4042 if (dr_group_first_a)
4044 stmt_info_a = dr_group_first_a;
4045 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4048 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4049 if (dr_group_first_b)
4051 stmt_info_b = dr_group_first_b;
4052 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4055 if (ignore_step_p)
4057 segment_length_a = size_zero_node;
4058 segment_length_b = size_zero_node;
4060 else
4062 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4063 DR_STEP (dr_info_b->dr), 0))
4064 length_factor = scalar_loop_iters;
4065 else
4066 length_factor = size_int (vect_factor);
4067 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4068 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4070 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4071 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4072 align_a = vect_vfa_align (dr_info_a);
4073 align_b = vect_vfa_align (dr_info_b);
4075 /* See whether the alias is known at compilation time. */
4076 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4077 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4078 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4079 DR_OFFSET (dr_info_b->dr), 0)
4080 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4081 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4082 && poly_int_tree_p (segment_length_a)
4083 && poly_int_tree_p (segment_length_b))
4085 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4086 segment_length_a,
4087 segment_length_b,
4088 access_size_a,
4089 access_size_b);
4090 if (res >= 0 && dump_enabled_p ())
4092 dump_printf_loc (MSG_NOTE, vect_location,
4093 "can tell at compile time that %T and %T",
4094 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4095 if (res == 0)
4096 dump_printf (MSG_NOTE, " do not alias\n");
4097 else
4098 dump_printf (MSG_NOTE, " alias\n");
4101 if (res == 0)
4102 continue;
4104 if (res == 1)
4105 return opt_result::failure_at (stmt_info_b->stmt,
4106 "not vectorized:"
4107 " compilation time alias: %G%G",
4108 stmt_info_a->stmt,
4109 stmt_info_b->stmt);
4112 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4113 access_size_a, align_a);
4114 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4115 access_size_b, align_b);
4116 /* Canonicalize the order to be the one that's needed for accurate
4117 RAW, WAR and WAW flags, in cases where the data references are
4118 well-ordered. The order doesn't really matter otherwise,
4119 but we might as well be consistent. */
4120 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4121 std::swap (dr_a, dr_b);
4123 dr_with_seg_len_pair_t dr_with_seg_len_pair
4124 (dr_a, dr_b, (preserves_scalar_order_p
4125 ? dr_with_seg_len_pair_t::WELL_ORDERED
4126 : dr_with_seg_len_pair_t::REORDERED));
4128 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4131 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4133 unsigned int count = (comp_alias_ddrs.length ()
4134 + check_unequal_addrs.length ());
4136 if (count
4137 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4138 == VECT_COST_MODEL_VERY_CHEAP))
4139 return opt_result::failure_at
4140 (vect_location, "would need a runtime alias check\n");
4142 if (dump_enabled_p ())
4143 dump_printf_loc (MSG_NOTE, vect_location,
4144 "improved number of alias checks from %d to %d\n",
4145 may_alias_ddrs.length (), count);
4146 unsigned limit = param_vect_max_version_for_alias_checks;
4147 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4148 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4149 if (count > limit)
4150 return opt_result::failure_at
4151 (vect_location,
4152 "number of versioning for alias run-time tests exceeds %d "
4153 "(--param vect-max-version-for-alias-checks)\n", limit);
4155 return opt_result::success ();
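/* Typical loop that ends up being versioned by the code above (function
   and names invented):

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
	 p[i] = q[i] + 1;
     }

   Nothing is known about p and q at compile time, so their ddr stays in
   LOOP_VINFO_MAY_ALIAS_DDRS and, unless one of the pruning steps above
   removes it, a segment-overlap test between the two accesses is emitted
   as a runtime guard.  Under the very-cheap cost model any remaining
   check instead makes vectorization fail.  */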
4158 /* Check whether we can use an internal function for a gather load
4159 or scatter store. READ_P is true for loads and false for stores.
4160 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4161 the type of the memory elements being loaded or stored. OFFSET_TYPE
4162 is the type of the offset that is being applied to the invariant
4163 base address. SCALE is the amount by which the offset should
4164 be multiplied *after* it has been converted to address width.
4166 Return true if the function is supported, storing the function id in
4167 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
4169 bool
4170 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4171 tree vectype, tree memory_type, tree offset_type,
4172 int scale, internal_fn *ifn_out,
4173 tree *offset_vectype_out)
4175 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4176 unsigned int element_bits = vector_element_bits (vectype);
4177 if (element_bits != memory_bits)
4178 /* For now the vector elements must be the same width as the
4179 memory elements. */
4180 return false;
4182 /* Work out which function we need. */
4183 internal_fn ifn, alt_ifn, alt_ifn2;
4184 if (read_p)
4186 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4187 alt_ifn = IFN_MASK_GATHER_LOAD;
4188       /* When the target supports MASK_LEN_GATHER_LOAD, we always
4189	  use MASK_LEN_GATHER_LOAD regardless of whether len and
4190	  mask are valid.  */
4191 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4193 else
4195 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4196 alt_ifn = IFN_MASK_SCATTER_STORE;
4197       /* When the target supports MASK_LEN_SCATTER_STORE, we always
4198	  use MASK_LEN_SCATTER_STORE regardless of whether len and
4199	  mask are valid.  */
4200 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4203 for (;;)
4205 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4206 if (!offset_vectype)
4207 return false;
4209 /* Test whether the target supports this combination. */
4210 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4211 offset_vectype, scale))
4213 *ifn_out = ifn;
4214 *offset_vectype_out = offset_vectype;
4215 return true;
4217 else if (!masked_p
4218 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4219 memory_type,
4220 offset_vectype,
4221 scale))
4223 *ifn_out = alt_ifn;
4224 *offset_vectype_out = offset_vectype;
4225 return true;
4227 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4228 memory_type,
4229 offset_vectype, scale))
4231 *ifn_out = alt_ifn2;
4232 *offset_vectype_out = offset_vectype;
4233 return true;
4236 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4237 && TYPE_PRECISION (offset_type) >= element_bits)
4238 return false;
4240 offset_type = build_nonstandard_integer_type
4241 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
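/* Illustration of the retry loop above (target capabilities and numbers
   invented): if the scalar offset type has 16 bits but the target only
   advertises a gather with 32-bit offsets for this vector type, the first
   round of internal_gather_scatter_fn_supported_p queries fails, the
   offset type is widened to 32 bits and the second round succeeds.
   Widening gives up once the precision has reached both POINTER_SIZE and
   the element width.  */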
4245 /* STMT_INFO is a call to an internal gather load or scatter store function.
4246 Describe the operation in INFO. */
4248 static void
4249 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4250 gather_scatter_info *info)
4252 gcall *call = as_a <gcall *> (stmt_info->stmt);
4253 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4254 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4256 info->ifn = gimple_call_internal_fn (call);
4257 info->decl = NULL_TREE;
4258 info->base = gimple_call_arg (call, 0);
4259 info->offset = gimple_call_arg (call, 1);
4260 info->offset_dt = vect_unknown_def_type;
4261 info->offset_vectype = NULL_TREE;
4262 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4263 info->element_type = TREE_TYPE (vectype);
4264 info->memory_type = TREE_TYPE (DR_REF (dr));
4267 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4268 gather load or scatter store. Describe the operation in *INFO if so. */
4270 bool
4271 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4272 gather_scatter_info *info)
4274 HOST_WIDE_INT scale = 1;
4275 poly_int64 pbitpos, pbitsize;
4276 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4277 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4278 tree offtype = NULL_TREE;
4279 tree decl = NULL_TREE, base, off;
4280 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4281 tree memory_type = TREE_TYPE (DR_REF (dr));
4282 machine_mode pmode;
4283 int punsignedp, reversep, pvolatilep = 0;
4284 internal_fn ifn;
4285 tree offset_vectype;
4286 bool masked_p = false;
4288 /* See whether this is already a call to a gather/scatter internal function.
4289 If not, see whether it's a masked load or store. */
4290 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4291 if (call && gimple_call_internal_p (call))
4293 ifn = gimple_call_internal_fn (call);
4294 if (internal_gather_scatter_fn_p (ifn))
4296 vect_describe_gather_scatter_call (stmt_info, info);
4297 return true;
4299 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4302 /* True if we should aim to use internal functions rather than
4303 built-in functions. */
4304 bool use_ifn_p = (DR_IS_READ (dr)
4305 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4306 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4308 base = DR_REF (dr);
4309 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4310 see if we can use the def stmt of the address. */
4311 if (masked_p
4312 && TREE_CODE (base) == MEM_REF
4313 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4314 && integer_zerop (TREE_OPERAND (base, 1))
4315 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4317 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4318 if (is_gimple_assign (def_stmt)
4319 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4320 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4323 /* The gather and scatter builtins need address of the form
4324 loop_invariant + vector * {1, 2, 4, 8}
4326      or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4327 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4328 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4329 multiplications and additions in it. To get a vector, we need
4330 a single SSA_NAME that will be defined in the loop and will
4331 contain everything that is not loop invariant and that can be
4332      vectorized.  The following code attempts to find such a preexisting
4333 SSA_NAME OFF and put the loop invariants into a tree BASE
4334 that can be gimplified before the loop. */
4335 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4336 &punsignedp, &reversep, &pvolatilep);
4337 if (reversep)
4338 return false;
4340 /* PR 107346. Packed structs can have fields at offsets that are not
4341 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4342 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4343 return false;
4345 /* We need to be able to form an address to the base which for example
4346 isn't possible for hard registers. */
4347 if (may_be_nonaddressable_p (base))
4348 return false;
4350 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4352 if (TREE_CODE (base) == MEM_REF)
4354 if (!integer_zerop (TREE_OPERAND (base, 1)))
4356 if (off == NULL_TREE)
4357 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4358 else
4359 off = size_binop (PLUS_EXPR, off,
4360 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4362 base = TREE_OPERAND (base, 0);
4364 else
4365 base = build_fold_addr_expr (base);
4367 if (off == NULL_TREE)
4368 off = size_zero_node;
4370   /* If base is not loop invariant, then if off is 0 we start with just
4371      the constant offset in the loop invariant BASE and continue with base
4372      as OFF; otherwise give up.
4373 We could handle that case by gimplifying the addition of base + off
4374 into some SSA_NAME and use that as off, but for now punt. */
4375 if (!expr_invariant_in_loop_p (loop, base))
4377 if (!integer_zerop (off))
4378 return false;
4379 off = base;
4380 base = size_int (pbytepos);
4382 /* Otherwise put base + constant offset into the loop invariant BASE
4383 and continue with OFF. */
4384 else
4386 base = fold_convert (sizetype, base);
4387 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4390 /* OFF at this point may be either a SSA_NAME or some tree expression
4391 from get_inner_reference. Try to peel off loop invariants from it
4392 into BASE as long as possible. */
4393 STRIP_NOPS (off);
4394 while (offtype == NULL_TREE)
4396 enum tree_code code;
4397 tree op0, op1, add = NULL_TREE;
4399 if (TREE_CODE (off) == SSA_NAME)
4401 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4403 if (expr_invariant_in_loop_p (loop, off))
4404 return false;
4406 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4407 break;
4409 op0 = gimple_assign_rhs1 (def_stmt);
4410 code = gimple_assign_rhs_code (def_stmt);
4411 op1 = gimple_assign_rhs2 (def_stmt);
4413 else
4415 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4416 return false;
4417 code = TREE_CODE (off);
4418 extract_ops_from_tree (off, &code, &op0, &op1);
4420 switch (code)
4422 case POINTER_PLUS_EXPR:
4423 case PLUS_EXPR:
4424 if (expr_invariant_in_loop_p (loop, op0))
4426 add = op0;
4427 off = op1;
4428 do_add:
4429 add = fold_convert (sizetype, add);
4430 if (scale != 1)
4431 add = size_binop (MULT_EXPR, add, size_int (scale));
4432 base = size_binop (PLUS_EXPR, base, add);
4433 continue;
4435 if (expr_invariant_in_loop_p (loop, op1))
4437 add = op1;
4438 off = op0;
4439 goto do_add;
4441 break;
4442 case MINUS_EXPR:
4443 if (expr_invariant_in_loop_p (loop, op1))
4445 add = fold_convert (sizetype, op1);
4446 add = size_binop (MINUS_EXPR, size_zero_node, add);
4447 off = op0;
4448 goto do_add;
4450 break;
4451 case MULT_EXPR:
4452 if (scale == 1 && tree_fits_shwi_p (op1))
4454 int new_scale = tree_to_shwi (op1);
4455 /* Only treat this as a scaling operation if the target
4456 supports it for at least some offset type. */
4457 if (use_ifn_p
4458 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4459 masked_p, vectype, memory_type,
4460 signed_char_type_node,
4461 new_scale, &ifn,
4462 &offset_vectype)
4463 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4464 masked_p, vectype, memory_type,
4465 unsigned_char_type_node,
4466 new_scale, &ifn,
4467 &offset_vectype))
4468 break;
4469 scale = new_scale;
4470 off = op0;
4471 continue;
4473 break;
4474 case SSA_NAME:
4475 off = op0;
4476 continue;
4477 CASE_CONVERT:
4478 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4479 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4480 break;
4482 /* Don't include the conversion if the target is happy with
4483 the current offset type. */
4484 if (use_ifn_p
4485 && TREE_CODE (off) == SSA_NAME
4486 && !POINTER_TYPE_P (TREE_TYPE (off))
4487 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4488 masked_p, vectype, memory_type,
4489 TREE_TYPE (off), scale, &ifn,
4490 &offset_vectype))
4491 break;
4493 if (TYPE_PRECISION (TREE_TYPE (op0))
4494 == TYPE_PRECISION (TREE_TYPE (off)))
4496 off = op0;
4497 continue;
4500 /* Include the conversion if it is widening and we're using
4501	     the IFN path, or the target can handle the converted-from
4502	     offset, or the current size is not already the same as the
4503 data vector element size. */
4504 if ((TYPE_PRECISION (TREE_TYPE (op0))
4505 < TYPE_PRECISION (TREE_TYPE (off)))
4506 && (use_ifn_p
4507 || (DR_IS_READ (dr)
4508 ? (targetm.vectorize.builtin_gather
4509 && targetm.vectorize.builtin_gather (vectype,
4510 TREE_TYPE (op0),
4511 scale))
4512 : (targetm.vectorize.builtin_scatter
4513 && targetm.vectorize.builtin_scatter (vectype,
4514 TREE_TYPE (op0),
4515 scale)))
4516 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4517 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4519 off = op0;
4520 offtype = TREE_TYPE (off);
4521 STRIP_NOPS (off);
4522 continue;
4524 break;
4525 default:
4526 break;
4528 break;
4531 /* If at the end OFF still isn't a SSA_NAME or isn't
4532 defined in the loop, punt. */
4533 if (TREE_CODE (off) != SSA_NAME
4534 || expr_invariant_in_loop_p (loop, off))
4535 return false;
4537 if (offtype == NULL_TREE)
4538 offtype = TREE_TYPE (off);
4540 if (use_ifn_p)
4542 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4543 vectype, memory_type, offtype, scale,
4544 &ifn, &offset_vectype))
4545 ifn = IFN_LAST;
4546 decl = NULL_TREE;
4548 else
4550 if (DR_IS_READ (dr))
4552 if (targetm.vectorize.builtin_gather)
4553 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4555 else
4557 if (targetm.vectorize.builtin_scatter)
4558 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4560 ifn = IFN_LAST;
4561 /* The offset vector type will be read from DECL when needed. */
4562 offset_vectype = NULL_TREE;
4565 info->ifn = ifn;
4566 info->decl = decl;
4567 info->base = base;
4568 info->offset = off;
4569 info->offset_dt = vect_unknown_def_type;
4570 info->offset_vectype = offset_vectype;
4571 info->scale = scale;
4572 info->element_type = TREE_TYPE (vectype);
4573 info->memory_type = memory_type;
4574 return true;
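/* Example of a reference the code above decomposes (loop and names
   invented):

     for (int i = 0; i < n; i++)
       out[i] = base[idx[i]];

   The load from base[idx[i]] has no affine DR_STEP, but it fits the
   gather form: INFO->base ends up as the loop-invariant part of the
   address (essentially the pointer base), INFO->offset as the SSA name
   holding idx[i] (possibly after peeling a widening conversion), and
   INFO->scale as the element size picked up from the MULT_EXPR in the
   address calculation, e.g. 4 for int.  */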
4577 /* Find the data references in STMT, analyze them with respect to LOOP and
4578 append them to DATAREFS. Return false if datarefs in this stmt cannot
4579 be handled. */
4581 opt_result
4582 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4583 vec<data_reference_p> *datarefs,
4584 vec<int> *dataref_groups, int group_id)
4586 /* We can ignore clobbers for dataref analysis - they are removed during
4587 loop vectorization and BB vectorization checks dependences with a
4588 stmt walk. */
4589 if (gimple_clobber_p (stmt))
4590 return opt_result::success ();
4592 if (gimple_has_volatile_ops (stmt))
4593 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4594 stmt);
4596 if (stmt_can_throw_internal (cfun, stmt))
4597 return opt_result::failure_at (stmt,
4598 "not vectorized:"
4599 " statement can throw an exception: %G",
4600 stmt);
4602 auto_vec<data_reference_p, 2> refs;
4603 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4604 if (!res)
4605 return res;
4607 if (refs.is_empty ())
4608 return opt_result::success ();
4610 if (refs.length () > 1)
4612 while (!refs.is_empty ())
4613 free_data_ref (refs.pop ());
4614 return opt_result::failure_at (stmt,
4615 "not vectorized: more than one "
4616 "data ref in stmt: %G", stmt);
4619 data_reference_p dr = refs.pop ();
4620 if (gcall *call = dyn_cast <gcall *> (stmt))
4621 if (!gimple_call_internal_p (call)
4622 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4623 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4625 free_data_ref (dr);
4626 return opt_result::failure_at (stmt,
4627 "not vectorized: dr in a call %G", stmt);
4630 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4631 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4633 free_data_ref (dr);
4634 return opt_result::failure_at (stmt,
4635 "not vectorized:"
4636 " statement is an unsupported"
4637 " bitfield access %G", stmt);
4640 if (DR_BASE_ADDRESS (dr)
4641 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4643 free_data_ref (dr);
4644 return opt_result::failure_at (stmt,
4645 "not vectorized:"
4646 " base addr of dr is a constant\n");
4649 /* Check whether this may be a SIMD lane access and adjust the
4650 DR to make it easier for us to handle it. */
4651 if (loop
4652 && loop->simduid
4653 && (!DR_BASE_ADDRESS (dr)
4654 || !DR_OFFSET (dr)
4655 || !DR_INIT (dr)
4656 || !DR_STEP (dr)))
4658 struct data_reference *newdr
4659 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4660 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4661 if (DR_BASE_ADDRESS (newdr)
4662 && DR_OFFSET (newdr)
4663 && DR_INIT (newdr)
4664 && DR_STEP (newdr)
4665 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4666 && integer_zerop (DR_STEP (newdr)))
4668 tree base_address = DR_BASE_ADDRESS (newdr);
4669 tree off = DR_OFFSET (newdr);
4670 tree step = ssize_int (1);
4671 if (integer_zerop (off)
4672 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4674 off = TREE_OPERAND (base_address, 1);
4675 base_address = TREE_OPERAND (base_address, 0);
4677 STRIP_NOPS (off);
4678 if (TREE_CODE (off) == MULT_EXPR
4679 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4681 step = TREE_OPERAND (off, 1);
4682 off = TREE_OPERAND (off, 0);
4683 STRIP_NOPS (off);
4685 if (CONVERT_EXPR_P (off)
4686 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4687 < TYPE_PRECISION (TREE_TYPE (off))))
4688 off = TREE_OPERAND (off, 0);
4689 if (TREE_CODE (off) == SSA_NAME)
4691 gimple *def = SSA_NAME_DEF_STMT (off);
4692 /* Look through widening conversion. */
4693 if (is_gimple_assign (def)
4694 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4696 tree rhs1 = gimple_assign_rhs1 (def);
4697 if (TREE_CODE (rhs1) == SSA_NAME
4698 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4699 && (TYPE_PRECISION (TREE_TYPE (off))
4700 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4701 def = SSA_NAME_DEF_STMT (rhs1);
4703 if (is_gimple_call (def)
4704 && gimple_call_internal_p (def)
4705 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4707 tree arg = gimple_call_arg (def, 0);
4708 tree reft = TREE_TYPE (DR_REF (newdr));
4709 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4710 arg = SSA_NAME_VAR (arg);
4711 if (arg == loop->simduid
4712 /* For now. */
4713 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4715 DR_BASE_ADDRESS (newdr) = base_address;
4716 DR_OFFSET (newdr) = ssize_int (0);
4717 DR_STEP (newdr) = step;
4718 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4719 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4720 /* Mark as simd-lane access. */
4721 tree arg2 = gimple_call_arg (def, 1);
4722 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4723 free_data_ref (dr);
4724 datarefs->safe_push (newdr);
4725 if (dataref_groups)
4726 dataref_groups->safe_push (group_id);
4727 return opt_result::success ();
4732 free_data_ref (newdr);
4735 datarefs->safe_push (dr);
4736 if (dataref_groups)
4737 dataref_groups->safe_push (group_id);
4738 return opt_result::success ();
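/* The SIMD-lane special case above is aimed at references like the
   following rough sketch, where D.arr, _lane and the constant arguments
   are placeholders:

     _lane = .GOMP_SIMD_LANE (simduid.0_1, 0);
     D.arr[_lane] = ...;

   Such a DR has no usable step, but once the .GOMP_SIMD_LANE definition
   of the index is recognised the replacement DR gets offset 0, a step
   equal to the element size and the simd-lane marker stored in its aux
   field, which vect_analyze_data_refs later turns into
   STMT_VINFO_SIMD_LANE_ACCESS_P.  */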
4741 /* Function vect_analyze_data_refs.
4743 Find all the data references in the loop or basic block.
4745 The general structure of the analysis of data refs in the vectorizer is as
4746 follows:
4747 1- vect_analyze_data_refs(loop/bb): call
4748 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4749 in the loop/bb and their dependences.
4750 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4751 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4752 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4756 opt_result
4757 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4759 class loop *loop = NULL;
4760 unsigned int i;
4761 struct data_reference *dr;
4762 tree scalar_type;
4764 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4766 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4767 loop = LOOP_VINFO_LOOP (loop_vinfo);
4769 /* Go through the data-refs, check that the analysis succeeded. Update
4770 pointer from stmt_vec_info struct to DR and vectype. */
4772 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4773 FOR_EACH_VEC_ELT (datarefs, i, dr)
4775 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4776 poly_uint64 vf;
4778 gcc_assert (DR_REF (dr));
4779 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4780 gcc_assert (!stmt_info->dr_aux.dr);
4781 stmt_info->dr_aux.dr = dr;
4782 stmt_info->dr_aux.stmt = stmt_info;
4784 /* Check that analysis of the data-ref succeeded. */
4785 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4786 || !DR_STEP (dr))
4788 bool maybe_gather
4789 = DR_IS_READ (dr)
4790 && !TREE_THIS_VOLATILE (DR_REF (dr));
4791 bool maybe_scatter
4792 = DR_IS_WRITE (dr)
4793 && !TREE_THIS_VOLATILE (DR_REF (dr));
4795	  /* If the target supports vector gather loads or scatter stores,
4796	     see if they can be used.  */
4797 if (is_a <loop_vec_info> (vinfo)
4798 && !nested_in_vect_loop_p (loop, stmt_info))
4800 if (maybe_gather || maybe_scatter)
4802 if (maybe_gather)
4803 gatherscatter = GATHER;
4804 else
4805 gatherscatter = SCATTER;
4809 if (gatherscatter == SG_NONE)
4811 if (dump_enabled_p ())
4812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4813 "not vectorized: data ref analysis "
4814 "failed %G", stmt_info->stmt);
4815 if (is_a <bb_vec_info> (vinfo))
4817 /* In BB vectorization the ref can still participate
4818 in dependence analysis, we just can't vectorize it. */
4819 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4820 continue;
4822 return opt_result::failure_at (stmt_info->stmt,
4823 "not vectorized:"
4824 " data ref analysis failed: %G",
4825 stmt_info->stmt);
4829 /* See if this was detected as SIMD lane access. */
4830 if (dr->aux == (void *)-1
4831 || dr->aux == (void *)-2
4832 || dr->aux == (void *)-3
4833 || dr->aux == (void *)-4)
4835 if (nested_in_vect_loop_p (loop, stmt_info))
4836 return opt_result::failure_at (stmt_info->stmt,
4837 "not vectorized:"
4838 " data ref analysis failed: %G",
4839 stmt_info->stmt);
4840 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4841 = -(uintptr_t) dr->aux;
4844 tree base = get_base_address (DR_REF (dr));
4845 if (base && VAR_P (base) && DECL_NONALIASED (base))
4847 if (dump_enabled_p ())
4848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4849 "not vectorized: base object not addressable "
4850 "for stmt: %G", stmt_info->stmt);
4851 if (is_a <bb_vec_info> (vinfo))
4853 /* In BB vectorization the ref can still participate
4854 in dependence analysis, we just can't vectorize it. */
4855 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4856 continue;
4858 return opt_result::failure_at (stmt_info->stmt,
4859 "not vectorized: base object not"
4860 " addressable for stmt: %G",
4861 stmt_info->stmt);
4864 if (is_a <loop_vec_info> (vinfo)
4865 && DR_STEP (dr)
4866 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4868 if (nested_in_vect_loop_p (loop, stmt_info))
4869 return opt_result::failure_at (stmt_info->stmt,
4870 "not vectorized: "
4871 "not suitable for strided load %G",
4872 stmt_info->stmt);
4873 STMT_VINFO_STRIDED_P (stmt_info) = true;
4876 /* Update DR field in stmt_vec_info struct. */
4878       /* If the dataref is in an inner-loop of the loop that is considered
4879	  for vectorization, we also want to analyze the access relative to
4880 the outer-loop (DR contains information only relative to the
4881 inner-most enclosing loop). We do that by building a reference to the
4882 first location accessed by the inner-loop, and analyze it relative to
4883 the outer-loop. */
4884 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4886 /* Build a reference to the first location accessed by the
4887 inner loop: *(BASE + INIT + OFFSET). By construction,
4888 this address must be invariant in the inner loop, so we
4889 can consider it as being used in the outer loop. */
4890 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4891 tree offset = unshare_expr (DR_OFFSET (dr));
4892 tree init = unshare_expr (DR_INIT (dr));
4893 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4894 init, offset);
4895 tree init_addr = fold_build_pointer_plus (base, init_offset);
4896 tree init_ref = build_fold_indirect_ref (init_addr);
4898 if (dump_enabled_p ())
4899 dump_printf_loc (MSG_NOTE, vect_location,
4900 "analyze in outer loop: %T\n", init_ref);
4902 opt_result res
4903 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4904 init_ref, loop, stmt_info->stmt);
4905 if (!res)
4906 /* dr_analyze_innermost already explained the failure. */
4907 return res;
4909 if (dump_enabled_p ())
4910 dump_printf_loc (MSG_NOTE, vect_location,
4911 "\touter base_address: %T\n"
4912 "\touter offset from base address: %T\n"
4913 "\touter constant offset from base address: %T\n"
4914 "\touter step: %T\n"
4915 "\touter base alignment: %d\n\n"
4916 "\touter base misalignment: %d\n"
4917 "\touter offset alignment: %d\n"
4918 "\touter step alignment: %d\n",
4919 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4920 STMT_VINFO_DR_OFFSET (stmt_info),
4921 STMT_VINFO_DR_INIT (stmt_info),
4922 STMT_VINFO_DR_STEP (stmt_info),
4923 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4924 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4925 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4926 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4929 /* Set vectype for STMT. */
4930 scalar_type = TREE_TYPE (DR_REF (dr));
4931 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4932 if (!vectype)
4934 if (dump_enabled_p ())
4936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4937 "not vectorized: no vectype for stmt: %G",
4938 stmt_info->stmt);
4939 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4940 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4941 scalar_type);
4942 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4945 if (is_a <bb_vec_info> (vinfo))
4947 /* No vector type is fine, the ref can still participate
4948 in dependence analysis, we just can't vectorize it. */
4949 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4950 continue;
4952 if (fatal)
4953 *fatal = false;
4954 return opt_result::failure_at (stmt_info->stmt,
4955 "not vectorized:"
4956 " no vectype for stmt: %G"
4957 " scalar_type: %T\n",
4958 stmt_info->stmt, scalar_type);
4960 else
4962 if (dump_enabled_p ())
4963 dump_printf_loc (MSG_NOTE, vect_location,
4964 "got vectype for stmt: %G%T\n",
4965 stmt_info->stmt, vectype);
4968 /* Adjust the minimal vectorization factor according to the
4969 vector type. */
4970 vf = TYPE_VECTOR_SUBPARTS (vectype);
4971 *min_vf = upper_bound (*min_vf, vf);
4973 /* Leave the BB vectorizer to pick the vector type later, based on
4974 the final dataref group size and SLP node size. */
4975 if (is_a <loop_vec_info> (vinfo))
4976 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4978 if (gatherscatter != SG_NONE)
4980 gather_scatter_info gs_info;
4981 if (!vect_check_gather_scatter (stmt_info,
4982 as_a <loop_vec_info> (vinfo),
4983 &gs_info)
4984 || !get_vectype_for_scalar_type (vinfo,
4985 TREE_TYPE (gs_info.offset)))
4987 if (fatal)
4988 *fatal = false;
4989 return opt_result::failure_at
4990 (stmt_info->stmt,
4991 (gatherscatter == GATHER)
4992 ? "not vectorized: not suitable for gather load %G"
4993 : "not vectorized: not suitable for scatter store %G",
4994 stmt_info->stmt);
4996 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5000 /* We used to stop processing and prune the list here. Verify we no
5001 longer need to. */
5002 gcc_assert (i == datarefs.length ());
5004 return opt_result::success ();
5008 /* Function vect_get_new_vect_var.
5010 Returns a name for a new variable. The current naming scheme appends the
5011 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
5012 the name of vectorizer generated variables, and appends that to NAME if
5013 provided. */
5015 tree
5016 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5018 const char *prefix;
5019 tree new_vect_var;
5021 switch (var_kind)
5023 case vect_simple_var:
5024 prefix = "vect";
5025 break;
5026 case vect_scalar_var:
5027 prefix = "stmp";
5028 break;
5029 case vect_mask_var:
5030 prefix = "mask";
5031 break;
5032 case vect_pointer_var:
5033 prefix = "vectp";
5034 break;
5035 default:
5036 gcc_unreachable ();
5039 if (name)
5041 char* tmp = concat (prefix, "_", name, NULL);
5042 new_vect_var = create_tmp_reg (type, tmp);
5043 free (tmp);
5045 else
5046 new_vect_var = create_tmp_reg (type, prefix);
5048 return new_vect_var;
5051 /* Like vect_get_new_vect_var but return an SSA name. */
5053 tree
5054 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5056 const char *prefix;
5057 tree new_vect_var;
5059 switch (var_kind)
5061 case vect_simple_var:
5062 prefix = "vect";
5063 break;
5064 case vect_scalar_var:
5065 prefix = "stmp";
5066 break;
5067 case vect_pointer_var:
5068 prefix = "vectp";
5069 break;
5070 default:
5071 gcc_unreachable ();
5074 if (name)
5076 char* tmp = concat (prefix, "_", name, NULL);
5077 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5078 free (tmp);
5080 else
5081 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5083 return new_vect_var;
5086 /* Duplicate points-to info on NAME from DR_INFO. */
5088 static void
5089 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5091 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5092 /* DR_PTR_INFO is for a base SSA name, not including constant or
5093 variable offsets in the ref so its alignment info does not apply. */
5094 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5097 /* Function vect_create_addr_base_for_vector_ref.
5099 Create an expression that computes the address of the first memory location
5100 that will be accessed for a data reference.
5102 Input:
5103 STMT_INFO: The statement containing the data reference.
5104 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5105    OFFSET: Optional.  If supplied, it is added to the initial address.
5106    LOOP:    Specify relative to which loop-nest the address should be computed.
5107 For example, when the dataref is in an inner-loop nested in an
5108 outer-loop that is now being vectorized, LOOP can be either the
5109 outer-loop, or the inner-loop. The first memory location accessed
5110 by the following dataref ('in' points to short):
5112 for (i=0; i<N; i++)
5113 for (j=0; j<M; j++)
5114 s += in[i+j]
5116 is as follows:
5117 if LOOP=i_loop: &in (relative to i_loop)
5118 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5120 Output:
5121 1. Return an SSA_NAME whose value is the address of the memory location of
5122 the first vector of the data reference.
5123 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5124 these statement(s) which define the returned SSA_NAME.
5126 FORNOW: We are only handling array accesses with step 1. */
5128 tree
5129 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5130 gimple_seq *new_stmt_list,
5131 tree offset)
5133 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5134 struct data_reference *dr = dr_info->dr;
5135 const char *base_name;
5136 tree addr_base;
5137 tree dest;
5138 gimple_seq seq = NULL;
5139 tree vect_ptr_type;
5140 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5141 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5143 tree data_ref_base = unshare_expr (drb->base_address);
5144 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5145 tree init = unshare_expr (drb->init);
5147 if (loop_vinfo)
5148 base_name = get_name (data_ref_base);
5149 else
5151 base_offset = ssize_int (0);
5152 init = ssize_int (0);
5153 base_name = get_name (DR_REF (dr));
5156 /* Create base_offset */
5157 base_offset = size_binop (PLUS_EXPR,
5158 fold_convert (sizetype, base_offset),
5159 fold_convert (sizetype, init));
5161 if (offset)
5163 offset = fold_convert (sizetype, offset);
5164 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5165 base_offset, offset);
5168 /* base + base_offset */
5169 if (loop_vinfo)
5170 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5171 else
5172 addr_base = build1 (ADDR_EXPR,
5173 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5174 /* Strip zero offset components since we don't need
5175 them and they can confuse late diagnostics if
5176 we CSE them wrongly. See PR106904 for example. */
5177 unshare_expr (strip_zero_offset_components
5178 (DR_REF (dr))));
5180 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5181 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5182 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5183 gimple_seq_add_seq (new_stmt_list, seq);
5185 if (DR_PTR_INFO (dr)
5186 && TREE_CODE (addr_base) == SSA_NAME
5187 /* We should only duplicate pointer info to newly created SSA names. */
5188 && SSA_NAME_VAR (addr_base) == dest)
5190 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5191 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5194 if (dump_enabled_p ())
5195 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5197 return addr_base;
5201 /* Function vect_create_data_ref_ptr.
5203 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5204 location accessed in the loop by STMT_INFO, along with the def-use update
5205 chain to appropriately advance the pointer through the loop iterations.
5206 Also set aliasing information for the pointer. This pointer is used by
5207 the callers to this function to create a memory reference expression for
5208 vector load/store access.
5210 Input:
5211 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5212 GIMPLE_ASSIGN <name, data-ref> or
5213 GIMPLE_ASSIGN <data-ref, name>.
5214 2. AGGR_TYPE: the type of the reference, which should be either a vector
5215 or an array.
5216 3. AT_LOOP: the loop where the vector memref is to be created.
5217 4. OFFSET (optional): a byte offset to be added to the initial address
5218 accessed by the data-ref in STMT_INFO.
5219 5. BSI: location where the new stmts are to be placed if there is no loop
5220 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
5221 pointing to the initial address.
5222 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5223 to the IV during each iteration of the loop. NULL says to move
5224 by one copy of AGGR_TYPE up or down, depending on the step of the
5225 data reference.
5227 Output:
5228 1. Declare a new ptr to vector_type, and have it point to the base of the
5229 data reference (the initial address accessed by the data reference).
5230 For example, for vector of type V8HI, the following code is generated:
5232 v8hi *ap;
5233 ap = (v8hi *)initial_address;
5235 if OFFSET is not supplied:
5236 initial_address = &a[init];
5237 if OFFSET is supplied:
5238 initial_address = &a[init] + OFFSET;
5242 Return the initial_address in INITIAL_ADDRESS.
5244 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5245 update the pointer in each iteration of the loop.
5247 Return the increment stmt that updates the pointer in PTR_INCR.
5249 3. Return the pointer. */
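/* For illustration, a minimal scalar sketch of the code the caller ends up
   with (assumptions: unit-stride access, V8HI vectors, no masking):

     typedef short v8hi __attribute__ ((vector_size (16)));

     v8hi *ap = (v8hi *) initial_address;   // (2): set up in the preheader
     for (int i = 0; i < n; i += 8)
       {
         v8hi tmp = *ap;                    // vector access built by the caller
         // ... use tmp ...
         ap = ap + 1;                       // (3)/(4): the PTR_INCR update, +16 bytes
       }
 */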
5251 tree
5252 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5253 tree aggr_type, class loop *at_loop, tree offset,
5254 tree *initial_address, gimple_stmt_iterator *gsi,
5255 gimple **ptr_incr, bool only_init,
5256 tree iv_step)
5258 const char *base_name;
5259 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5260 class loop *loop = NULL;
5261 bool nested_in_vect_loop = false;
5262 class loop *containing_loop = NULL;
5263 tree aggr_ptr_type;
5264 tree aggr_ptr;
5265 tree new_temp;
5266 gimple_seq new_stmt_list = NULL;
5267 edge pe = NULL;
5268 basic_block new_bb;
5269 tree aggr_ptr_init;
5270 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5271 struct data_reference *dr = dr_info->dr;
5272 tree aptr;
5273 gimple_stmt_iterator incr_gsi;
5274 bool insert_after;
5275 tree indx_before_incr, indx_after_incr;
5276 gimple *incr;
5277 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5279 gcc_assert (iv_step != NULL_TREE
5280 || TREE_CODE (aggr_type) == ARRAY_TYPE
5281 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5283 if (loop_vinfo)
5285 loop = LOOP_VINFO_LOOP (loop_vinfo);
5286 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5287 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5288 pe = loop_preheader_edge (loop);
5290 else
5292 gcc_assert (bb_vinfo);
5293 only_init = true;
5294 *ptr_incr = NULL;
5297 /* Create an expression for the first address accessed by this load
5298 in LOOP. */
5299 base_name = get_name (DR_BASE_ADDRESS (dr));
5301 if (dump_enabled_p ())
5303 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5304 dump_printf_loc (MSG_NOTE, vect_location,
5305 "create %s-pointer variable to type: %T",
5306 get_tree_code_name (TREE_CODE (aggr_type)),
5307 aggr_type);
5308 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5309 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5310 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5311 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5312 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5313 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5314 else
5315 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5316 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5319 /* (1) Create the new aggregate-pointer variable.
5320 Vector and array types inherit the alias set of their component
5321 type by default, so we need to use a ref-all pointer if the data
5322 reference does not conflict with the created aggregate data
5323 reference because it is not addressable. */
5324 bool need_ref_all = false;
5325 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5326 get_alias_set (DR_REF (dr))))
5327 need_ref_all = true;
5328 /* Likewise for any of the data references in the stmt group. */
5329 else if (DR_GROUP_SIZE (stmt_info) > 1)
5331 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5334 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5335 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5336 get_alias_set (DR_REF (sdr))))
5338 need_ref_all = true;
5339 break;
5341 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5343 while (sinfo);
5345 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5346 need_ref_all);
5347 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5350 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5351 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5352 def-use update cycles for the pointer: one relative to the outer-loop
5353 (LOOP), which is what steps (3) and (4) below do. The other is relative
5354 to the inner-loop (which is the inner-most loop containing the dataref),
5355 and this is done by step (5) below.
5357 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5358 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5359 redundant. Steps (3),(4) create the following:
5361 vp0 = &base_addr;
5362 LOOP: vp1 = phi(vp0,vp2)
5365 vp2 = vp1 + step
5366 goto LOOP
5368 If there is an inner-loop nested in loop, then step (5) will also be
5369 applied, and an additional update in the inner-loop will be created:
5371 vp0 = &base_addr;
5372 LOOP: vp1 = phi(vp0,vp2)
5374 inner: vp3 = phi(vp1,vp4)
5375 vp4 = vp3 + inner_step
5376 if () goto inner
5378 vp2 = vp1 + step
5379 if () goto LOOP */
5381 /* (2) Calculate the initial address of the aggregate-pointer, and set
5382 the aggregate-pointer to point to it before the loop. */
5384 /* Create &(base[init_val]) + offset in the loop preheader. */
5386 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5387 stmt_info, &new_stmt_list,
5388 offset);
5389 if (new_stmt_list)
5391 if (pe)
5393 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5394 gcc_assert (!new_bb);
5396 else
5397 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5400 *initial_address = new_temp;
5401 aggr_ptr_init = new_temp;
5403 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5404 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5405 inner-loop nested in LOOP (during outer-loop vectorization). */
5407 /* No update in loop is required. */
5408 if (only_init && (!loop_vinfo || at_loop == loop))
5409 aptr = aggr_ptr_init;
5410 else
5412 /* Accesses to invariant addresses should be handled specially
5413 by the caller. */
5414 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5415 gcc_assert (!integer_zerop (step));
5417 if (iv_step == NULL_TREE)
5419 /* The step of the aggregate pointer is the type size,
5420 negated for downward accesses. */
5421 iv_step = TYPE_SIZE_UNIT (aggr_type);
5422 if (tree_int_cst_sgn (step) == -1)
5423 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5426 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5428 create_iv (aggr_ptr_init, PLUS_EXPR,
5429 fold_convert (aggr_ptr_type, iv_step),
5430 aggr_ptr, loop, &incr_gsi, insert_after,
5431 &indx_before_incr, &indx_after_incr);
5432 incr = gsi_stmt (incr_gsi);
5434 /* Copy the points-to information if it exists. */
5435 if (DR_PTR_INFO (dr))
5437 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5438 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5440 if (ptr_incr)
5441 *ptr_incr = incr;
5443 aptr = indx_before_incr;
5446 if (!nested_in_vect_loop || only_init)
5447 return aptr;
5450 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5451 nested in LOOP, if it exists. */
5453 gcc_assert (nested_in_vect_loop);
5454 if (!only_init)
5456 standard_iv_increment_position (containing_loop, &incr_gsi,
5457 &insert_after);
5458 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5459 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5460 &indx_before_incr, &indx_after_incr);
5461 incr = gsi_stmt (incr_gsi);
5463 /* Copy the points-to information if it exists. */
5464 if (DR_PTR_INFO (dr))
5466 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5467 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5469 if (ptr_incr)
5470 *ptr_incr = incr;
5472 return indx_before_incr;
5474 else
5475 gcc_unreachable ();
5479 /* Function bump_vector_ptr
5481 Increment a pointer (to a vector type) by vector-size. If requested,
5482 i.e. if PTR_INCR is given, then also connect the new increment stmt
5483 to the existing def-use update-chain of the pointer, by modifying
5484 the PTR_INCR as illustrated below:
5486 The pointer def-use update-chain before this function:
5487 DATAREF_PTR = phi (p_0, p_2)
5488 ....
5489 PTR_INCR: p_2 = DATAREF_PTR + step
5491 The pointer def-use update-chain after this function:
5492 DATAREF_PTR = phi (p_0, p_2)
5493 ....
5494 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5495 ....
5496 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5498 Input:
5499 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5500 in the loop.
5501 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5502 the loop. The increment amount across iterations is expected
5503 to be vector_size.
5504 BSI - location where the new update stmt is to be placed.
5505 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5506 BUMP - optional. The offset by which to bump the pointer. If not given,
5507 the offset is assumed to be vector_size.
5509 Output: Return NEW_DATAREF_PTR as illustrated above.
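/* For illustration, a typical call (a sketch only; the surrounding caller
   code is assumed, not shown here):

     tree second_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
                                        stmt_info, NULL_TREE);
     // With BUMP omitted the pointer advances by one vector, so a MEM_REF
     // built on second_ptr accesses the next vector of the group.
 */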
5513 tree
5514 bump_vector_ptr (vec_info *vinfo,
5515 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5516 stmt_vec_info stmt_info, tree bump)
5518 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5519 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5520 tree update = TYPE_SIZE_UNIT (vectype);
5521 gimple *incr_stmt;
5522 ssa_op_iter iter;
5523 use_operand_p use_p;
5524 tree new_dataref_ptr;
5526 if (bump)
5527 update = bump;
5529 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5530 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5531 else if (is_gimple_min_invariant (dataref_ptr))
5532 /* When possible avoid emitting a separate increment stmt that would
5533 force the addressed object to become addressable. */
5534 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5535 fold_build2 (MEM_REF,
5536 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5537 dataref_ptr,
5538 fold_convert (ptr_type_node, update)));
5539 else
5540 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5541 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5542 dataref_ptr, update);
5543 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5544 /* Fold the increment, avoiding excessively long use-def chains that
5545 would otherwise cause compile-time issues for passes run before the next
5546 forwprop pass, which would do this folding as well. */
5547 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5548 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5550 incr_stmt = gsi_stmt (fold_gsi);
5551 update_stmt (incr_stmt);
5554 /* Copy the points-to information if it exists. */
5555 if (DR_PTR_INFO (dr))
5557 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5558 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5561 if (!ptr_incr)
5562 return new_dataref_ptr;
5564 /* Update the vector-pointer's cross-iteration increment. */
5565 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5567 tree use = USE_FROM_PTR (use_p);
5569 if (use == dataref_ptr)
5570 SET_USE (use_p, new_dataref_ptr);
5571 else
5572 gcc_assert (operand_equal_p (use, update, 0));
5575 return new_dataref_ptr;
5579 /* Copy memory reference info such as base/clique from the SRC reference
5580 to the DEST MEM_REF. */
5582 void
5583 vect_copy_ref_info (tree dest, tree src)
5585 if (TREE_CODE (dest) != MEM_REF)
5586 return;
5588 tree src_base = src;
5589 while (handled_component_p (src_base))
5590 src_base = TREE_OPERAND (src_base, 0);
5591 if (TREE_CODE (src_base) != MEM_REF
5592 && TREE_CODE (src_base) != TARGET_MEM_REF)
5593 return;
5595 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5596 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5600 /* Function vect_create_destination_var.
5602 Create a new temporary of type VECTYPE. */
5604 tree
5605 vect_create_destination_var (tree scalar_dest, tree vectype)
5607 tree vec_dest;
5608 const char *name;
5609 char *new_name;
5610 tree type;
5611 enum vect_var_kind kind;
5613 kind = vectype
5614 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5615 ? vect_mask_var
5616 : vect_simple_var
5617 : vect_scalar_var;
5618 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5620 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5622 name = get_name (scalar_dest);
5623 if (name)
5624 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5625 else
5626 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5627 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5628 free (new_name);
5630 return vec_dest;
5633 /* Function vect_grouped_store_supported.
5635 Returns TRUE if interleave high and interleave low permutations
5636 are supported, and FALSE otherwise. */
5638 bool
5639 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5641 machine_mode mode = TYPE_MODE (vectype);
5643 /* vect_permute_store_chain requires the group size to be equal to 3 or
5644 be a power of two. */
5645 if (count != 3 && exact_log2 (count) == -1)
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "the size of the group of accesses"
5650 " is not a power of 2 or not eqaul to 3\n");
5651 return false;
5654 /* Check that the permutation is supported. */
5655 if (VECTOR_MODE_P (mode))
5657 unsigned int i;
5658 if (count == 3)
5660 unsigned int j0 = 0, j1 = 0, j2 = 0;
5661 unsigned int i, j;
5663 unsigned int nelt;
5664 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5668 "cannot handle groups of 3 stores for"
5669 " variable-length vectors\n");
5670 return false;
5673 vec_perm_builder sel (nelt, nelt, 1);
5674 sel.quick_grow (nelt);
5675 vec_perm_indices indices;
5676 for (j = 0; j < 3; j++)
5678 int nelt0 = ((3 - j) * nelt) % 3;
5679 int nelt1 = ((3 - j) * nelt + 1) % 3;
5680 int nelt2 = ((3 - j) * nelt + 2) % 3;
5681 for (i = 0; i < nelt; i++)
5683 if (3 * i + nelt0 < nelt)
5684 sel[3 * i + nelt0] = j0++;
5685 if (3 * i + nelt1 < nelt)
5686 sel[3 * i + nelt1] = nelt + j1++;
5687 if (3 * i + nelt2 < nelt)
5688 sel[3 * i + nelt2] = 0;
5690 indices.new_vector (sel, 2, nelt);
5691 if (!can_vec_perm_const_p (mode, mode, indices))
5693 if (dump_enabled_p ())
5694 dump_printf (MSG_MISSED_OPTIMIZATION,
5695 "permutation op not supported by target.\n");
5696 return false;
5699 for (i = 0; i < nelt; i++)
5701 if (3 * i + nelt0 < nelt)
5702 sel[3 * i + nelt0] = 3 * i + nelt0;
5703 if (3 * i + nelt1 < nelt)
5704 sel[3 * i + nelt1] = 3 * i + nelt1;
5705 if (3 * i + nelt2 < nelt)
5706 sel[3 * i + nelt2] = nelt + j2++;
5708 indices.new_vector (sel, 2, nelt);
5709 if (!can_vec_perm_const_p (mode, mode, indices))
5711 if (dump_enabled_p ())
5712 dump_printf (MSG_MISSED_OPTIMIZATION,
5713 "permutation op not supported by target.\n");
5714 return false;
5717 return true;
5719 else
5721 /* If length is not equal to 3 then only a power of 2 is supported. */
5722 gcc_assert (pow2p_hwi (count));
5723 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5725 /* The encoding has 2 interleaved stepped patterns. */
5726 if (!multiple_p (nelt, 2))
5727 return false;
5728 vec_perm_builder sel (nelt, 2, 3);
5729 sel.quick_grow (6);
5730 for (i = 0; i < 3; i++)
5732 sel[i * 2] = i;
5733 sel[i * 2 + 1] = i + nelt;
5735 vec_perm_indices indices (sel, 2, nelt);
5736 if (can_vec_perm_const_p (mode, mode, indices))
5738 for (i = 0; i < 6; i++)
5739 sel[i] += exact_div (nelt, 2);
5740 indices.new_vector (sel, 2, nelt);
5741 if (can_vec_perm_const_p (mode, mode, indices))
5742 return true;
5747 if (dump_enabled_p ())
5748 dump_printf (MSG_MISSED_OPTIMIZATION,
5749 "permutation op not supported by target.\n");
5750 return false;
5753 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5754 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5756 internal_fn
5757 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5758 bool masked_p)
5760 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5761 vec_mask_len_store_lanes_optab, vectype,
5762 count))
5763 return IFN_MASK_LEN_STORE_LANES;
5764 else if (masked_p)
5766 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5767 vec_mask_store_lanes_optab, vectype,
5768 count))
5769 return IFN_MASK_STORE_LANES;
5771 else
5773 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5774 vec_store_lanes_optab, vectype, count))
5775 return IFN_STORE_LANES;
5777 return IFN_LAST;
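/* For illustration (a sketch, not actual caller code), the checks above
   prefer the most general lanes optab the target provides:

     internal_fn ifn = vect_store_lanes_supported (vectype, 4, false);
     if (ifn != IFN_LAST)
       {
         // The whole 4-vector group can be stored with one (possibly
         // masked or length-controlled) store-lanes internal function.
       }
 */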
5781 /* Function vect_permute_store_chain.
5783 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5784 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5785 the data correctly for the stores. Return the final references for stores
5786 in RESULT_CHAIN.
5788 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5789 The input is 4 vectors each containing 8 elements. We assign a number to
5790 each element; the input sequence is:
5792 1st vec: 0 1 2 3 4 5 6 7
5793 2nd vec: 8 9 10 11 12 13 14 15
5794 3rd vec: 16 17 18 19 20 21 22 23
5795 4th vec: 24 25 26 27 28 29 30 31
5797 The output sequence should be:
5799 1st vec: 0 8 16 24 1 9 17 25
5800 2nd vec: 2 10 18 26 3 11 19 27
5801 3rd vec: 4 12 20 28 5 13 21 29
5802 4th vec: 6 14 22 30 7 15 23 31
5804 i.e., we interleave the contents of the four vectors in their order.
5806 We use interleave_high/low instructions to create such output. The input of
5807 each interleave_high/low operation is two vectors:
5808 1st vec 2nd vec
5809 0 1 2 3 4 5 6 7
5810 the even elements of the result vector are obtained left-to-right from the
5811 high/low elements of the first vector. The odd elements of the result are
5812 obtained left-to-right from the high/low elements of the second vector.
5813 The output of interleave_high will be: 0 4 1 5
5814 and of interleave_low: 2 6 3 7
5817 The permutation is done in log LENGTH stages. In each stage interleave_high
5818 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5819 where the first argument is taken from the first half of DR_CHAIN and the
5820 second argument from its second half.
5821 In our example,
5823 I1: interleave_high (1st vec, 3rd vec)
5824 I2: interleave_low (1st vec, 3rd vec)
5825 I3: interleave_high (2nd vec, 4th vec)
5826 I4: interleave_low (2nd vec, 4th vec)
5828 The output for the first stage is:
5830 I1: 0 16 1 17 2 18 3 19
5831 I2: 4 20 5 21 6 22 7 23
5832 I3: 8 24 9 25 10 26 11 27
5833 I4: 12 28 13 29 14 30 15 31
5835 The output of the second stage, i.e. the final result is:
5837 I1: 0 8 16 24 1 9 17 25
5838 I2: 2 10 18 26 3 11 19 27
5839 I3: 4 12 20 28 5 13 21 29
5840 I4: 6 14 22 30 7 15 23 31. */
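/* For illustration, one interleave stage written as plain C (a sketch,
   assuming 8-element vectors represented as arrays; HIGH/LOW mirror the
   interleave_high/low operations above):

     static void
     interleave_step (const int *a, const int *b, int *high, int *low)
     {
       for (int i = 0; i < 4; i++)
         {
           high[2 * i]     = a[i];        // first half of A
           high[2 * i + 1] = b[i];        // first half of B
           low[2 * i]      = a[i + 4];    // second half of A
           low[2 * i + 1]  = b[i + 4];    // second half of B
         }
     }

   Applying this stage log2(LENGTH) times, pairing vector J with vector
   J + LENGTH/2, produces the interleaved order shown above.  */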
5842 void
5843 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5844 unsigned int length,
5845 stmt_vec_info stmt_info,
5846 gimple_stmt_iterator *gsi,
5847 vec<tree> *result_chain)
5849 tree vect1, vect2, high, low;
5850 gimple *perm_stmt;
5851 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5852 tree perm_mask_low, perm_mask_high;
5853 tree data_ref;
5854 tree perm3_mask_low, perm3_mask_high;
5855 unsigned int i, j, n, log_length = exact_log2 (length);
5857 result_chain->quick_grow (length);
5858 memcpy (result_chain->address (), dr_chain.address (),
5859 length * sizeof (tree));
5861 if (length == 3)
5863 /* vect_grouped_store_supported ensures that this is constant. */
5864 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5865 unsigned int j0 = 0, j1 = 0, j2 = 0;
5867 vec_perm_builder sel (nelt, nelt, 1);
5868 sel.quick_grow (nelt);
5869 vec_perm_indices indices;
5870 for (j = 0; j < 3; j++)
5872 int nelt0 = ((3 - j) * nelt) % 3;
5873 int nelt1 = ((3 - j) * nelt + 1) % 3;
5874 int nelt2 = ((3 - j) * nelt + 2) % 3;
5876 for (i = 0; i < nelt; i++)
5878 if (3 * i + nelt0 < nelt)
5879 sel[3 * i + nelt0] = j0++;
5880 if (3 * i + nelt1 < nelt)
5881 sel[3 * i + nelt1] = nelt + j1++;
5882 if (3 * i + nelt2 < nelt)
5883 sel[3 * i + nelt2] = 0;
5885 indices.new_vector (sel, 2, nelt);
5886 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5888 for (i = 0; i < nelt; i++)
5890 if (3 * i + nelt0 < nelt)
5891 sel[3 * i + nelt0] = 3 * i + nelt0;
5892 if (3 * i + nelt1 < nelt)
5893 sel[3 * i + nelt1] = 3 * i + nelt1;
5894 if (3 * i + nelt2 < nelt)
5895 sel[3 * i + nelt2] = nelt + j2++;
5897 indices.new_vector (sel, 2, nelt);
5898 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5900 vect1 = dr_chain[0];
5901 vect2 = dr_chain[1];
5903 /* Create interleaving stmt:
5904 low = VEC_PERM_EXPR <vect1, vect2,
5905 {j, nelt, *, j + 1, nelt + j + 1, *,
5906 j + 2, nelt + j + 2, *, ...}> */
5907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5909 vect2, perm3_mask_low);
5910 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5912 vect1 = data_ref;
5913 vect2 = dr_chain[2];
5914 /* Create interleaving stmt:
5915 low = VEC_PERM_EXPR <vect1, vect2,
5916 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5917 6, 7, nelt + j + 2, ...}> */
5918 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5919 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5920 vect2, perm3_mask_high);
5921 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5922 (*result_chain)[j] = data_ref;
5925 else
5927 /* If length is not equal to 3 then only a power of 2 is supported. */
5928 gcc_assert (pow2p_hwi (length));
5930 /* The encoding has 2 interleaved stepped patterns. */
5931 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5932 vec_perm_builder sel (nelt, 2, 3);
5933 sel.quick_grow (6);
5934 for (i = 0; i < 3; i++)
5936 sel[i * 2] = i;
5937 sel[i * 2 + 1] = i + nelt;
5939 vec_perm_indices indices (sel, 2, nelt);
5940 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5942 for (i = 0; i < 6; i++)
5943 sel[i] += exact_div (nelt, 2);
5944 indices.new_vector (sel, 2, nelt);
5945 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5947 for (i = 0, n = log_length; i < n; i++)
5949 for (j = 0; j < length/2; j++)
5951 vect1 = dr_chain[j];
5952 vect2 = dr_chain[j+length/2];
5954 /* Create interleaving stmt:
5955 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5956 ...}> */
5957 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5958 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5959 vect2, perm_mask_high);
5960 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5961 (*result_chain)[2*j] = high;
5963 /* Create interleaving stmt:
5964 low = VEC_PERM_EXPR <vect1, vect2,
5965 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5966 ...}> */
5967 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5968 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5969 vect2, perm_mask_low);
5970 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5971 (*result_chain)[2*j+1] = low;
5973 memcpy (dr_chain.address (), result_chain->address (),
5974 length * sizeof (tree));
5979 /* Function vect_setup_realignment
5981 This function is called when vectorizing an unaligned load using
5982 the dr_explicit_realign[_optimized] scheme.
5983 This function generates the following code at the loop prolog:
5985 p = initial_addr;
5986 x msq_init = *(floor(p)); # prolog load
5987 realignment_token = call target_builtin;
5988 loop:
5989 x msq = phi (msq_init, ---)
5991 The stmts marked with x are generated only for the case of
5992 dr_explicit_realign_optimized.
5994 The code above sets up a new (vector) pointer, pointing to the first
5995 location accessed by STMT_INFO, and a "floor-aligned" load using that
5996 pointer. It also generates code to compute the "realignment-token"
5997 (if the relevant target hook was defined), and creates a phi-node at the
5998 loop-header bb whose arguments are the result of the prolog-load (created
5999 by this function) and the result of a load that takes place in the loop
6000 (to be created by the caller to this function).
6002 For the case of dr_explicit_realign_optimized:
6003 The caller to this function uses the phi-result (msq) to create the
6004 realignment code inside the loop, and sets up the missing phi argument,
6005 as follows:
6006 loop:
6007 msq = phi (msq_init, lsq)
6008 lsq = *(floor(p')); # load in loop
6009 result = realign_load (msq, lsq, realignment_token);
6011 For the case of dr_explicit_realign:
6012 loop:
6013 msq = *(floor(p)); # load in loop
6014 p' = p + (VS-1);
6015 lsq = *(floor(p')); # load in loop
6016 result = realign_load (msq, lsq, realignment_token);
6018 Input:
6019 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6020 a memory location that may be unaligned.
6021 BSI - place where new code is to be inserted.
6022 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6023 is used.
6025 Output:
6026 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6027 target hook, if defined.
6028 Return value - the result of the loop-header phi node. */
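/* For illustration, the effect of realign_load in scalar terms (a sketch,
   assuming byte elements, vector size VS and misalignment M = p % VS):

     // msq = aligned load at floor (p)            ==  p - M
     // lsq = aligned load at floor (p + VS - 1)   ==  p - M + VS  (for M > 0)
     for (int i = 0; i < VS; i++)
       result[i] = (M + i < VS) ? msq[M + i] : lsq[M + i - VS];

   i.e. the desired unaligned vector is stitched together from the two
   aligned vectors that cover it, steered by the realignment token that
   encodes the misalignment.  */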
6030 tree
6031 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6032 gimple_stmt_iterator *gsi, tree *realignment_token,
6033 enum dr_alignment_support alignment_support_scheme,
6034 tree init_addr,
6035 class loop **at_loop)
6037 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6038 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6039 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6040 struct data_reference *dr = dr_info->dr;
6041 class loop *loop = NULL;
6042 edge pe = NULL;
6043 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6044 tree vec_dest;
6045 gimple *inc;
6046 tree ptr;
6047 tree data_ref;
6048 basic_block new_bb;
6049 tree msq_init = NULL_TREE;
6050 tree new_temp;
6051 gphi *phi_stmt;
6052 tree msq = NULL_TREE;
6053 gimple_seq stmts = NULL;
6054 bool compute_in_loop = false;
6055 bool nested_in_vect_loop = false;
6056 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6057 class loop *loop_for_initial_load = NULL;
6059 if (loop_vinfo)
6061 loop = LOOP_VINFO_LOOP (loop_vinfo);
6062 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6065 gcc_assert (alignment_support_scheme == dr_explicit_realign
6066 || alignment_support_scheme == dr_explicit_realign_optimized);
6068 /* We need to generate three things:
6069 1. the misalignment computation
6070 2. the extra vector load (for the optimized realignment scheme).
6071 3. the phi node for the two vectors from which the realignment is
6072 done (for the optimized realignment scheme). */
6074 /* 1. Determine where to generate the misalignment computation.
6076 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6077 calculation will be generated by this function, outside the loop (in the
6078 preheader). Otherwise, INIT_ADDR has already been computed for us by the
6079 caller, inside the loop.
6081 Background: If the misalignment remains fixed throughout the iterations of
6082 the loop, then both realignment schemes are applicable, and also the
6083 misalignment computation can be done outside LOOP. This is because we are
6084 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6085 are a multiple of VS (the Vector Size), and therefore the misalignment in
6086 different vectorized LOOP iterations is always the same.
6087 The problem arises only if the memory access is in an inner-loop nested
6088 inside LOOP, which is now being vectorized using outer-loop vectorization.
6089 This is the only case when the misalignment of the memory access may not
6090 remain fixed throughout the iterations of the inner-loop (as explained in
6091 detail in vect_supportable_dr_alignment). In this case, not only is the
6092 optimized realignment scheme not applicable, but the misalignment
6093 computation (and the generation of the realignment token that is passed to
6094 REALIGN_LOAD) also has to be done inside the loop.
6096 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6097 or not, which in turn determines if the misalignment is computed inside
6098 the inner-loop, or outside LOOP. */
6100 if (init_addr != NULL_TREE || !loop_vinfo)
6102 compute_in_loop = true;
6103 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6107 /* 2. Determine where to generate the extra vector load.
6109 For the optimized realignment scheme, instead of generating two vector
6110 loads in each iteration, we generate a single extra vector load in the
6111 preheader of the loop, and in each iteration reuse the result of the
6112 vector load from the previous iteration. In case the memory access is in
6113 an inner-loop nested inside LOOP, which is now being vectorized using
6114 outer-loop vectorization, we need to determine whether this initial vector
6115 load should be generated at the preheader of the inner-loop, or can be
6116 generated at the preheader of LOOP. If the memory access has no evolution
6117 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6118 to be generated inside LOOP (in the preheader of the inner-loop). */
6120 if (nested_in_vect_loop)
6122 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6123 bool invariant_in_outerloop =
6124 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6125 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6127 else
6128 loop_for_initial_load = loop;
6129 if (at_loop)
6130 *at_loop = loop_for_initial_load;
6132 tree vuse = NULL_TREE;
6133 if (loop_for_initial_load)
6135 pe = loop_preheader_edge (loop_for_initial_load);
6136 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6137 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6139 if (!vuse)
6140 vuse = gimple_vuse (gsi_stmt (*gsi));
6142 /* 3. For the case of the optimized realignment, create the first vector
6143 load at the loop preheader. */
6145 if (alignment_support_scheme == dr_explicit_realign_optimized)
6147 /* Create msq_init = *(floor(p1)) in the loop preheader */
6148 gassign *new_stmt;
6150 gcc_assert (!compute_in_loop);
6151 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6152 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6153 loop_for_initial_load, NULL_TREE,
6154 &init_addr, NULL, &inc, true);
6155 if (TREE_CODE (ptr) == SSA_NAME)
6156 new_temp = copy_ssa_name (ptr);
6157 else
6158 new_temp = make_ssa_name (TREE_TYPE (ptr));
6159 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6160 tree type = TREE_TYPE (ptr);
6161 new_stmt = gimple_build_assign
6162 (new_temp, BIT_AND_EXPR, ptr,
6163 fold_build2 (MINUS_EXPR, type,
6164 build_int_cst (type, 0),
6165 build_int_cst (type, align)));
6166 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6167 gcc_assert (!new_bb);
6168 data_ref
6169 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6170 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6171 vect_copy_ref_info (data_ref, DR_REF (dr));
6172 new_stmt = gimple_build_assign (vec_dest, data_ref);
6173 new_temp = make_ssa_name (vec_dest, new_stmt);
6174 gimple_assign_set_lhs (new_stmt, new_temp);
6175 gimple_set_vuse (new_stmt, vuse);
6176 if (pe)
6178 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6179 gcc_assert (!new_bb);
6181 else
6182 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6184 msq_init = gimple_assign_lhs (new_stmt);
6187 /* 4. Create realignment token using a target builtin, if available.
6188 It is done either inside the containing loop, or before LOOP (as
6189 determined above). */
6191 if (targetm.vectorize.builtin_mask_for_load)
6193 gcall *new_stmt;
6194 tree builtin_decl;
6196 /* Compute INIT_ADDR - the initial address accessed by this memref. */
6197 if (!init_addr)
6199 /* Generate the INIT_ADDR computation outside LOOP. */
6200 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6201 stmt_info, &stmts,
6202 NULL_TREE);
6203 if (loop)
6205 pe = loop_preheader_edge (loop);
6206 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6207 gcc_assert (!new_bb);
6209 else
6210 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6213 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6214 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6215 vec_dest =
6216 vect_create_destination_var (scalar_dest,
6217 gimple_call_return_type (new_stmt));
6218 new_temp = make_ssa_name (vec_dest, new_stmt);
6219 gimple_call_set_lhs (new_stmt, new_temp);
6221 if (compute_in_loop)
6222 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6223 else
6225 /* Generate the misalignment computation outside LOOP. */
6226 pe = loop_preheader_edge (loop);
6227 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6228 gcc_assert (!new_bb);
6231 *realignment_token = gimple_call_lhs (new_stmt);
6233 /* The result of the CALL_EXPR to this builtin is determined from
6234 the value of the parameter, and no global variables are touched,
6235 which makes the builtin a "const" function. Requiring the
6236 builtin to have the "const" attribute makes it unnecessary
6237 to call mark_call_clobbered. */
6238 gcc_assert (TREE_READONLY (builtin_decl));
6241 if (alignment_support_scheme == dr_explicit_realign)
6242 return msq;
6244 gcc_assert (!compute_in_loop);
6245 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6248 /* 5. Create msq = phi <msq_init, lsq> in loop */
6250 pe = loop_preheader_edge (containing_loop);
6251 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6252 msq = make_ssa_name (vec_dest);
6253 phi_stmt = create_phi_node (msq, containing_loop->header);
6254 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6256 return msq;
6260 /* Function vect_grouped_load_supported.
6262 COUNT is the size of the load group (the number of statements plus the
6263 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6264 only one statement, with a gap of COUNT - 1.
6266 Returns true if a suitable permute exists. */
6268 bool
6269 vect_grouped_load_supported (tree vectype, bool single_element_p,
6270 unsigned HOST_WIDE_INT count)
6272 machine_mode mode = TYPE_MODE (vectype);
6274 /* If this is single-element interleaving with an element distance
6275 that leaves unused vector loads around punt - we at least create
6276 very sub-optimal code in that case (and blow up memory,
6277 see PR65518). */
6278 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6280 if (dump_enabled_p ())
6281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6282 "single-element interleaving not supported "
6283 "for not adjacent vector loads\n");
6284 return false;
6287 /* vect_permute_load_chain requires the group size to be equal to 3 or
6288 be a power of two. */
6289 if (count != 3 && exact_log2 (count) == -1)
6291 if (dump_enabled_p ())
6292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6293 "the size of the group of accesses"
6294 " is not a power of 2 or not equal to 3\n");
6295 return false;
6298 /* Check that the permutation is supported. */
6299 if (VECTOR_MODE_P (mode))
6301 unsigned int i, j;
6302 if (count == 3)
6304 unsigned int nelt;
6305 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6307 if (dump_enabled_p ())
6308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6309 "cannot handle groups of 3 loads for"
6310 " variable-length vectors\n");
6311 return false;
6314 vec_perm_builder sel (nelt, nelt, 1);
6315 sel.quick_grow (nelt);
6316 vec_perm_indices indices;
6317 unsigned int k;
6318 for (k = 0; k < 3; k++)
6320 for (i = 0; i < nelt; i++)
6321 if (3 * i + k < 2 * nelt)
6322 sel[i] = 3 * i + k;
6323 else
6324 sel[i] = 0;
6325 indices.new_vector (sel, 2, nelt);
6326 if (!can_vec_perm_const_p (mode, mode, indices))
6328 if (dump_enabled_p ())
6329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6330 "shuffle of 3 loads is not supported by"
6331 " target\n");
6332 return false;
6334 for (i = 0, j = 0; i < nelt; i++)
6335 if (3 * i + k < 2 * nelt)
6336 sel[i] = i;
6337 else
6338 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6339 indices.new_vector (sel, 2, nelt);
6340 if (!can_vec_perm_const_p (mode, mode, indices))
6342 if (dump_enabled_p ())
6343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6344 "shuffle of 3 loads is not supported by"
6345 " target\n");
6346 return false;
6349 return true;
6351 else
6353 /* If length is not equal to 3 then only a power of 2 is supported. */
6354 gcc_assert (pow2p_hwi (count));
6355 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6357 /* The encoding has a single stepped pattern. */
6358 vec_perm_builder sel (nelt, 1, 3);
6359 sel.quick_grow (3);
6360 for (i = 0; i < 3; i++)
6361 sel[i] = i * 2;
6362 vec_perm_indices indices (sel, 2, nelt);
6363 if (can_vec_perm_const_p (mode, mode, indices))
6365 for (i = 0; i < 3; i++)
6366 sel[i] = i * 2 + 1;
6367 indices.new_vector (sel, 2, nelt);
6368 if (can_vec_perm_const_p (mode, mode, indices))
6369 return true;
6374 if (dump_enabled_p ())
6375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6376 "extract even/odd not supported by target\n");
6377 return false;
6380 /* Return FN if vec_{mask_,mask_len_}load_lanes is available for COUNT vectors
6381 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6383 internal_fn
6384 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6385 bool masked_p)
6387 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6388 vec_mask_len_load_lanes_optab, vectype,
6389 count))
6390 return IFN_MASK_LEN_LOAD_LANES;
6391 else if (masked_p)
6393 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6394 vec_mask_load_lanes_optab, vectype,
6395 count))
6396 return IFN_MASK_LOAD_LANES;
6398 else
6400 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6401 vectype, count))
6402 return IFN_LOAD_LANES;
6404 return IFN_LAST;
6407 /* Function vect_permute_load_chain.
6409 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6410 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6411 the input data correctly. Return the final references for loads in
6412 RESULT_CHAIN.
6414 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6415 The input is 4 vectors each containing 8 elements. We assign a number to each
6416 element; the input sequence is:
6418 1st vec: 0 1 2 3 4 5 6 7
6419 2nd vec: 8 9 10 11 12 13 14 15
6420 3rd vec: 16 17 18 19 20 21 22 23
6421 4th vec: 24 25 26 27 28 29 30 31
6423 The output sequence should be:
6425 1st vec: 0 4 8 12 16 20 24 28
6426 2nd vec: 1 5 9 13 17 21 25 29
6427 3rd vec: 2 6 10 14 18 22 26 30
6428 4th vec: 3 7 11 15 19 23 27 31
6430 i.e., the first output vector should contain the first elements of each
6431 interleaving group, etc.
6433 We use extract_even/odd instructions to create such output. The input of
6434 each extract_even/odd operation is two vectors
6435 1st vec 2nd vec
6436 0 1 2 3 4 5 6 7
6438 and the output is the vector of extracted even/odd elements. The output of
6439 extract_even will be: 0 2 4 6
6440 and of extract_odd: 1 3 5 7
6443 The permutation is done in log LENGTH stages. In each stage extract_even
6444 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6445 their order. In our example,
6447 E1: extract_even (1st vec, 2nd vec)
6448 E2: extract_odd (1st vec, 2nd vec)
6449 E3: extract_even (3rd vec, 4th vec)
6450 E4: extract_odd (3rd vec, 4th vec)
6452 The output for the first stage will be:
6454 E1: 0 2 4 6 8 10 12 14
6455 E2: 1 3 5 7 9 11 13 15
6456 E3: 16 18 20 22 24 26 28 30
6457 E4: 17 19 21 23 25 27 29 31
6459 In order to proceed and create the correct sequence for the next stage (or
6460 for the correct output, if the second stage is the last one, as in our
6461 example), we first put the output of extract_even operation and then the
6462 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6463 The input for the second stage is:
6465 1st vec (E1): 0 2 4 6 8 10 12 14
6466 2nd vec (E3): 16 18 20 22 24 26 28 30
6467 3rd vec (E2): 1 3 5 7 9 11 13 15
6468 4th vec (E4): 17 19 21 23 25 27 29 31
6470 The output of the second stage:
6472 E1: 0 4 8 12 16 20 24 28
6473 E2: 2 6 10 14 18 22 26 30
6474 E3: 1 5 9 13 17 21 25 29
6475 E4: 3 7 11 15 19 23 27 31
6477 And RESULT_CHAIN after reordering:
6479 1st vec (E1): 0 4 8 12 16 20 24 28
6480 2nd vec (E3): 1 5 9 13 17 21 25 29
6481 3rd vec (E2): 2 6 10 14 18 22 26 30
6482 4th vec (E4): 3 7 11 15 19 23 27 31. */
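/* For illustration, one extract stage written as plain C (a sketch,
   assuming 8-element vectors represented as arrays):

     static void
     extract_step (const int *a, const int *b, int *even, int *odd)
     {
       for (int i = 0; i < 4; i++)
         {
           even[i]     = a[2 * i];        // even elements of A
           even[i + 4] = b[2 * i];        // even elements of B
           odd[i]      = a[2 * i + 1];    // odd elements of A
           odd[i + 4]  = b[2 * i + 1];    // odd elements of B
         }
     }

   Each stage halves the interleaving stride; after log2(LENGTH) stages the
   loads are fully de-interleaved as shown above.  */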
6484 static void
6485 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6486 unsigned int length,
6487 stmt_vec_info stmt_info,
6488 gimple_stmt_iterator *gsi,
6489 vec<tree> *result_chain)
6491 tree data_ref, first_vect, second_vect;
6492 tree perm_mask_even, perm_mask_odd;
6493 tree perm3_mask_low, perm3_mask_high;
6494 gimple *perm_stmt;
6495 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6496 unsigned int i, j, log_length = exact_log2 (length);
6498 result_chain->quick_grow (length);
6499 memcpy (result_chain->address (), dr_chain.address (),
6500 length * sizeof (tree));
6502 if (length == 3)
6504 /* vect_grouped_load_supported ensures that this is constant. */
6505 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6506 unsigned int k;
6508 vec_perm_builder sel (nelt, nelt, 1);
6509 sel.quick_grow (nelt);
6510 vec_perm_indices indices;
6511 for (k = 0; k < 3; k++)
6513 for (i = 0; i < nelt; i++)
6514 if (3 * i + k < 2 * nelt)
6515 sel[i] = 3 * i + k;
6516 else
6517 sel[i] = 0;
6518 indices.new_vector (sel, 2, nelt);
6519 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6521 for (i = 0, j = 0; i < nelt; i++)
6522 if (3 * i + k < 2 * nelt)
6523 sel[i] = i;
6524 else
6525 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6526 indices.new_vector (sel, 2, nelt);
6527 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6529 first_vect = dr_chain[0];
6530 second_vect = dr_chain[1];
6532 /* Create interleaving stmt (low part of):
6533 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6534 ...}> */
6535 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6536 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6537 second_vect, perm3_mask_low);
6538 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6540 /* Create interleaving stmt (high part of):
6541 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6542 ...}> */
6543 first_vect = data_ref;
6544 second_vect = dr_chain[2];
6545 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6546 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6547 second_vect, perm3_mask_high);
6548 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6549 (*result_chain)[k] = data_ref;
6552 else
6554 /* If length is not equal to 3 then only a power of 2 is supported. */
6555 gcc_assert (pow2p_hwi (length));
6557 /* The encoding has a single stepped pattern. */
6558 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6559 vec_perm_builder sel (nelt, 1, 3);
6560 sel.quick_grow (3);
6561 for (i = 0; i < 3; ++i)
6562 sel[i] = i * 2;
6563 vec_perm_indices indices (sel, 2, nelt);
6564 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6566 for (i = 0; i < 3; ++i)
6567 sel[i] = i * 2 + 1;
6568 indices.new_vector (sel, 2, nelt);
6569 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6571 for (i = 0; i < log_length; i++)
6573 for (j = 0; j < length; j += 2)
6575 first_vect = dr_chain[j];
6576 second_vect = dr_chain[j+1];
6578 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6579 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6580 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6581 first_vect, second_vect,
6582 perm_mask_even);
6583 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6584 (*result_chain)[j/2] = data_ref;
6586 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6587 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6588 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6589 first_vect, second_vect,
6590 perm_mask_odd);
6591 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6592 (*result_chain)[j/2+length/2] = data_ref;
6594 memcpy (dr_chain.address (), result_chain->address (),
6595 length * sizeof (tree));
6600 /* Function vect_shift_permute_load_chain.
6602 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6603 sequence of stmts to reorder the input data accordingly.
6604 Return the final references for loads in RESULT_CHAIN.
6605 Return true if successful, false otherwise.
6607 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6608 The input is 3 vectors each containing 8 elements. We assign a
6609 number to each element; the input sequence is:
6611 1st vec: 0 1 2 3 4 5 6 7
6612 2nd vec: 8 9 10 11 12 13 14 15
6613 3rd vec: 16 17 18 19 20 21 22 23
6615 The output sequence should be:
6617 1st vec: 0 3 6 9 12 15 18 21
6618 2nd vec: 1 4 7 10 13 16 19 22
6619 3rd vec: 2 5 8 11 14 17 20 23
6621 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6623 First we shuffle all 3 vectors to get correct elements order:
6625 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6626 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6627 3rd vec: (16 19 22) (17 20 23) (18 21)
6629 Next we unite and shift vector 3 times:
6631 1st step:
6632 shift right by 6 the concatenation of:
6633 "1st vec" and "2nd vec"
6634 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6635 "2nd vec" and "3rd vec"
6636 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6637 "3rd vec" and "1st vec"
6638 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6639 | New vectors |
6641 So that now new vectors are:
6643 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6644 2nd vec: (10 13) (16 19 22) (17 20 23)
6645 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6647 2nd step:
6648 shift right by 5 the concatenation of:
6649 "1st vec" and "3rd vec"
6650 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6651 "2nd vec" and "1st vec"
6652 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6653 "3rd vec" and "2nd vec"
6654 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6655 | New vectors |
6657 So that now new vectors are:
6659 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6660 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6661 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6663 3rd step:
6664 shift right by 5 the concatenation of:
6665 "1st vec" and "1st vec"
6666 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6667 shift right by 3 the concatenation of:
6668 "2nd vec" and "2nd vec"
6669 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6670 | New vectors |
6672 So that now all vectors are READY:
6673 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6674 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6675 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6677 This algorithm is faster than the one in vect_permute_load_chain if:
6678 1. "shift of a concatenation" is faster than a general permutation.
6679 This is usually so.
6680 2. The TARGET machine can't execute vector instructions in parallel.
6681 This is because each step of the algorithm depends on the previous one.
6682 The algorithm in vect_permute_load_chain is much more parallel.
6684 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
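/* For illustration, each "shift right by S of a concatenation" above is a
   single two-input permutation; in scalar terms (a sketch, NELT-element
   vectors as arrays, 0 <= S < NELT):

     for (int i = 0; i < NELT; i++)
       result[i] = (S + i < NELT) ? a[S + i] : b[S + i - NELT];

   which is exactly the window that the shiftN_mask permutations built
   below select from the concatenated pair.  */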
6687 static bool
6688 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6689 unsigned int length,
6690 stmt_vec_info stmt_info,
6691 gimple_stmt_iterator *gsi,
6692 vec<tree> *result_chain)
6694 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6695 tree perm2_mask1, perm2_mask2, perm3_mask;
6696 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6697 gimple *perm_stmt;
6699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6700 machine_mode vmode = TYPE_MODE (vectype);
6701 unsigned int i;
6702 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6704 unsigned HOST_WIDE_INT nelt, vf;
6705 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6706 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6707 /* Not supported for variable-length vectors. */
6708 return false;
6710 vec_perm_builder sel (nelt, nelt, 1);
6711 sel.quick_grow (nelt);
6713 result_chain->quick_grow (length);
6714 memcpy (result_chain->address (), dr_chain.address (),
6715 length * sizeof (tree));
6717 if (pow2p_hwi (length) && vf > 4)
6719 unsigned int j, log_length = exact_log2 (length);
6720 for (i = 0; i < nelt / 2; ++i)
6721 sel[i] = i * 2;
6722 for (i = 0; i < nelt / 2; ++i)
6723 sel[nelt / 2 + i] = i * 2 + 1;
6724 vec_perm_indices indices (sel, 2, nelt);
6725 if (!can_vec_perm_const_p (vmode, vmode, indices))
6727 if (dump_enabled_p ())
6728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6729 "shuffle of 2 fields structure is not \
6730 supported by target\n");
6731 return false;
6733 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6735 for (i = 0; i < nelt / 2; ++i)
6736 sel[i] = i * 2 + 1;
6737 for (i = 0; i < nelt / 2; ++i)
6738 sel[nelt / 2 + i] = i * 2;
6739 indices.new_vector (sel, 2, nelt);
6740 if (!can_vec_perm_const_p (vmode, vmode, indices))
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "shuffle of 2 fields structure is not \
6745 supported by target\n");
6746 return false;
6748 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6750 /* Generating permutation constant to shift all elements.
6751 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6752 for (i = 0; i < nelt; i++)
6753 sel[i] = nelt / 2 + i;
6754 indices.new_vector (sel, 2, nelt);
6755 if (!can_vec_perm_const_p (vmode, vmode, indices))
6757 if (dump_enabled_p ())
6758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759 "shift permutation is not supported by target\n");
6760 return false;
6762 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6764 /* Generating permutation constant to select vector from 2.
6765 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6766 for (i = 0; i < nelt / 2; i++)
6767 sel[i] = i;
6768 for (i = nelt / 2; i < nelt; i++)
6769 sel[i] = nelt + i;
6770 indices.new_vector (sel, 2, nelt);
6771 if (!can_vec_perm_const_p (vmode, vmode, indices))
6773 if (dump_enabled_p ())
6774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6775 "select is not supported by target\n");
6776 return false;
6778 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6780 for (i = 0; i < log_length; i++)
6782 for (j = 0; j < length; j += 2)
6784 first_vect = dr_chain[j];
6785 second_vect = dr_chain[j + 1];
6787 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6788 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6789 first_vect, first_vect,
6790 perm2_mask1);
6791 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6792 vect[0] = data_ref;
6794 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6795 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6796 second_vect, second_vect,
6797 perm2_mask2);
6798 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6799 vect[1] = data_ref;
6801 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6802 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6803 vect[0], vect[1], shift1_mask);
6804 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6805 (*result_chain)[j/2 + length/2] = data_ref;
6807 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6808 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6809 vect[0], vect[1], select_mask);
6810 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6811 (*result_chain)[j/2] = data_ref;
6813 memcpy (dr_chain.address (), result_chain->address (),
6814 length * sizeof (tree));
6816 return true;
6818 if (length == 3 && vf > 2)
6820 unsigned int k = 0, l = 0;
6822 /* Generating permutation constant to get all elements in right order.
6823 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6824 for (i = 0; i < nelt; i++)
6826 if (3 * k + (l % 3) >= nelt)
6828 k = 0;
6829 l += (3 - (nelt % 3));
6831 sel[i] = 3 * k + (l % 3);
6832 k++;
6834 vec_perm_indices indices (sel, 2, nelt);
6835 if (!can_vec_perm_const_p (vmode, vmode, indices))
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6839 "shuffle of 3 fields structure is not \
6840 supported by target\n");
6841 return false;
6843 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6845 /* Generating permutation constant to shift all elements.
6846 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6847 for (i = 0; i < nelt; i++)
6848 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6849 indices.new_vector (sel, 2, nelt);
6850 if (!can_vec_perm_const_p (vmode, vmode, indices))
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "shift permutation is not supported by target\n");
6855 return false;
6857 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6859 /* Generating permutation constant to shift all elements.
6860 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6861 for (i = 0; i < nelt; i++)
6862 sel[i] = 2 * (nelt / 3) + 1 + i;
6863 indices.new_vector (sel, 2, nelt);
6864 if (!can_vec_perm_const_p (vmode, vmode, indices))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "shift permutation is not supported by target\n");
6869 return false;
6871 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6873 /* Generating permutation constant to shift all elements.
6874 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6875 for (i = 0; i < nelt; i++)
6876 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6877 indices.new_vector (sel, 2, nelt);
6878 if (!can_vec_perm_const_p (vmode, vmode, indices))
6880 if (dump_enabled_p ())
6881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6882 "shift permutation is not supported by target\n");
6883 return false;
6885 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6887 /* Generating permutation constant to shift all elements.
6888 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6889 for (i = 0; i < nelt; i++)
6890 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6891 indices.new_vector (sel, 2, nelt);
6892 if (!can_vec_perm_const_p (vmode, vmode, indices))
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 "shift permutation is not supported by target\n");
6897 return false;
6899 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
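/* For nelt == 8 the masks built above are perm3 = {0 3 6 1 4 7 2 5},
   shift1 = {6 ... 13}, shift2 = {5 ... 12}, shift3 = {3 ... 10} and
   shift4 = {5 ... 12}.  Roughly, the loops below first shuffle each input
   vector in isolation with perm3_mask, then merge neighbouring shuffled
   vectors with shift1_mask and shift2_mask, and finally rotate two of the
   intermediate results with shift3_mask and shift4_mask to obtain the
   three output vectors recorded in RESULT_CHAIN.  */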
6901 for (k = 0; k < 3; k++)
6903 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6904 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6905 dr_chain[k], dr_chain[k],
6906 perm3_mask);
6907 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6908 vect[k] = data_ref;
6911 for (k = 0; k < 3; k++)
6913 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6914 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6915 vect[k % 3], vect[(k + 1) % 3],
6916 shift1_mask);
6917 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6918 vect_shift[k] = data_ref;
6921 for (k = 0; k < 3; k++)
6923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6925 vect_shift[(4 - k) % 3],
6926 vect_shift[(3 - k) % 3],
6927 shift2_mask);
6928 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6929 vect[k] = data_ref;
6932 (*result_chain)[3 - (nelt % 3)] = vect[2];
6934 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6935 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6936 vect[0], shift3_mask);
6937 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6938 (*result_chain)[nelt % 3] = data_ref;
6940 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6941 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6942 vect[1], shift4_mask);
6943 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6944 (*result_chain)[0] = data_ref;
6945 return true;
6947 return false;
6950 /* Function vect_transform_grouped_load.
6952 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6953 to perform their permutation and assign the resulting vectorized statements to
6954 the scalar statements.
6957 void
6958 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6959 vec<tree> dr_chain,
6960 int size, gimple_stmt_iterator *gsi)
6962 machine_mode mode;
6963 vec<tree> result_chain = vNULL;
6965 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6966 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6967 permuted vectors, which are ready for vector computation. */
6968 result_chain.create (size);
6970 /* If the reassociation width for the vector type is 2 or greater, the target
6971 machine can execute 2 or more vector instructions in parallel. Otherwise
6972 try to get the chain for the loads group using vect_shift_permute_load_chain. */
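/* Because of the short-circuit evaluation below,
   vect_shift_permute_load_chain is only attempted when the reassociation
   width for VEC_PERM_EXPR is not greater than 1 and SIZE is not a power
   of two; if it succeeds it has already filled RESULT_CHAIN and
   vect_permute_load_chain is skipped, otherwise we fall back to the
   generic permutation chain.  */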
6973 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6974 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6975 || pow2p_hwi (size)
6976 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6977 gsi, &result_chain))
6978 vect_permute_load_chain (vinfo, dr_chain,
6979 size, stmt_info, gsi, &result_chain);
6980 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6981 result_chain.release ();
6984 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6985 generated as part of the vectorization of STMT_INFO. Assign the statement
6986 for each vector to the associated scalar statement. */
6988 void
6989 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6990 vec<tree> result_chain)
6992 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6993 unsigned int i, gap_count;
6994 tree tmp_data_ref;
6996 /* Record each permuted data-ref in STMT_VINFO_VEC_STMTS of its scalar stmt.
6997 Since we scan the chain starting from its first node, their order
6998 corresponds to the order of data-refs in RESULT_CHAIN. */
6999 stmt_vec_info next_stmt_info = first_stmt_info;
7000 gap_count = 1;
7001 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
7003 if (!next_stmt_info)
7004 break;
7006 /* Skip the gaps. Loads created for the gaps will be removed later by the
7007 dead code elimination pass. No need to check for the first stmt in
7008 the group, since it always exists.
7009 DR_GROUP_GAP is the number of steps in elements from the previous
7010 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
7011 correspond to the gaps. */
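/* For instance, assuming a group that accesses a[0] and a[2] (so that
   DR_GROUP_GAP of the second statement is 2): the first chain entry is
   recorded for a[0]'s statement, the next entry is skipped here because
   gap_count (1) is smaller than 2, and the entry after that is recorded
   for a[2]'s statement.  */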
7012 if (next_stmt_info != first_stmt_info
7013 && gap_count < DR_GROUP_GAP (next_stmt_info))
7015 gap_count++;
7016 continue;
7019 /* ??? The following needs cleanup after the removal of
7020 DR_GROUP_SAME_DR_STMT. */
7021 if (next_stmt_info)
7023 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7024 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7025 copies, and we put the new vector statement last. */
7026 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7028 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7029 gap_count = 1;
7034 /* Function vect_can_force_dr_alignment_p.
7036 Returns whether the alignment of DECL can be forced so that it is
7037 aligned on an ALIGNMENT-bit boundary. */
7039 bool
7040 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7042 if (!VAR_P (decl))
7043 return false;
7045 if (decl_in_symtab_p (decl)
7046 && !symtab_node::get (decl)->can_increase_alignment_p ())
7047 return false;
7049 if (TREE_STATIC (decl))
7050 return (known_le (alignment,
7051 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7052 else
7053 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7056 /* Return whether the data reference DR_INFO is supported with respect to its
7057 alignment.
7058 VECTYPE is the vector type used for the access and MISALIGNMENT is its known
7059 misalignment, or DR_MISALIGNMENT_UNKNOWN when the misalignment cannot be
7060 determined at compile time. */
7062 enum dr_alignment_support
7063 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7064 tree vectype, int misalignment)
7066 data_reference *dr = dr_info->dr;
7067 stmt_vec_info stmt_info = dr_info->stmt;
7068 machine_mode mode = TYPE_MODE (vectype);
7069 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7070 class loop *vect_loop = NULL;
7071 bool nested_in_vect_loop = false;
7073 if (misalignment == 0)
7074 return dr_aligned;
7076 /* For now assume all conditional loads/stores support unaligned
7077 access without any special code. */
7078 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7079 if (gimple_call_internal_p (stmt)
7080 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7081 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7082 return dr_unaligned_supported;
7084 if (loop_vinfo)
7086 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7087 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7090 /* Possibly unaligned access. */
7092 /* We can choose between using the implicit realignment scheme (generating
7093 a misaligned_move stmt) and the explicit realignment scheme (generating
7094 aligned loads with a REALIGN_LOAD). There are two variants to the
7095 explicit realignment scheme: optimized, and unoptimized.
7096 We can optimize the realignment only if the step between consecutive
7097 vector loads is equal to the vector size. Since the vector memory
7098 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7099 is guaranteed that the misalignment amount remains the same throughout the
7100 execution of the vectorized loop. Therefore, we can create the
7101 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7102 at the loop preheader.
7104 However, in the case of outer-loop vectorization, when vectorizing a
7105 memory access in the inner-loop nested within the LOOP that is now being
7106 vectorized, while it is guaranteed that the misalignment of the
7107 vectorized memory access will remain the same in different outer-loop
7108 iterations, it is *not* guaranteed that it will remain the same throughout
7109 the execution of the inner-loop. This is because the inner-loop advances
7110 with the original scalar step (and not in steps of VS). If the inner-loop
7111 step happens to be a multiple of VS, then the misalignment remains fixed
7112 and we can use the optimized realignment scheme. For example:
7114 for (i=0; i<N; i++)
7115 for (j=0; j<M; j++)
7116 s += a[i+j];
7118 When vectorizing the i-loop in the above example, the step between
7119 consecutive vector loads is 1, and so the misalignment does not remain
7120 fixed across the execution of the inner-loop, and the realignment cannot
7121 be optimized (as illustrated in the following pseudo vectorized loop):
7123 for (i=0; i<N; i+=4)
7124 for (j=0; j<M; j++){
7125 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7126 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7127 // (assuming that we start from an aligned address).
7130 We therefore have to use the unoptimized realignment scheme:
7132 for (i=0; i<N; i+=4)
7133 for (j=k; j<M; j+=4)
7134 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7135 // that the misalignment of the initial address is
7136 // 0).
7138 The loop can then be vectorized as follows:
7140 for (k=0; k<4; k++){
7141 rt = get_realignment_token (&vp[k]);
7142 for (i=0; i<N; i+=4){
7143 v1 = vp[i+k];
7144 for (j=k; j<M; j+=4){
7145 v2 = vp[i+j+VS-1];
7146 va = REALIGN_LOAD <v1,v2,rt>;
7147 vs += va;
7148 v1 = v2;
7151 } */
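/* In short, for a misaligned read on a target that provides
   vec_realign_load (and a usable mask-for-load builtin), the code below
   roughly chooses dr_explicit_realign_optimized when the misalignment can
   be kept invariant across the vectorized loop and dr_explicit_realign
   otherwise; in the SLP case noted below, and in all remaining cases, it
   instead falls through and asks the target whether a plain misaligned
   access is supported.  */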
7153 if (DR_IS_READ (dr))
7155 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7156 && (!targetm.vectorize.builtin_mask_for_load
7157 || targetm.vectorize.builtin_mask_for_load ()))
7159 /* If we are doing SLP then the accesses need not have the
7160 same alignment; instead, it depends on the SLP group size. */
7161 if (loop_vinfo
7162 && STMT_SLP_TYPE (stmt_info)
7163 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7164 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7165 * (DR_GROUP_SIZE
7166 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7167 TYPE_VECTOR_SUBPARTS (vectype))))
7168 ;
7169 else if (!loop_vinfo
7170 || (nested_in_vect_loop
7171 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7172 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7173 return dr_explicit_realign;
7174 else
7175 return dr_explicit_realign_optimized;
7179 bool is_packed = false;
7180 tree type = TREE_TYPE (DR_REF (dr));
7181 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7182 is_packed = not_size_aligned (DR_REF (dr));
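/* Finally defer to the target: the support_vector_misalignment hook decides
   whether an access of this mode and type with the given (possibly unknown)
   misalignment is supported; IS_PACKED is set when the reference is not
   known to be aligned to at least its own size.  */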
7183 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7184 is_packed))
7185 return dr_unaligned_supported;
7187 /* Unsupported. */
7188 return dr_unaligned_unsupported;