gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
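
/* Illustrative sketch (not compiled): a loop with a two-element interleaved
   load group.  On targets that implement the vec_load_lanes optab for the
   chosen vector mode, vect_lanes_optab_supported_p above is what lets the
   vectorizer use a lanes load for such a group.  The function name below is
   made up for illustration.  */
#if 0
void
example_interleaved_sum (int *__restrict out, const int *__restrict in, int n)
{
  for (int i = 0; i < n; i++)
    /* in[2*i] and in[2*i+1] form a load group of size 2.  */
    out[i] = in[2 * i] + in[2 * i + 1];
}
#endif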
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
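
/* Illustrative sketch (not compiled): a call that simd_clone_call_p would
   recognize.  Declaring FOO with "omp declare simd" makes the compiler emit
   simd clones for it; when the call sits under a condition it can end up
   wrapped in IFN_MASK_CALL, which is why the function above also looks
   through that internal call.  The identifiers are made up.  */
#if 0
#pragma omp declare simd
extern int foo (int);

void
example_simd_clone_use (int *a, const int *b, int n)
{
  #pragma omp simd
  for (int i = 0; i < n; i++)
    if (b[i] > 0)
      a[i] = foo (b[i]);
}
#endif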
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
204 /* For stores use the type of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
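
/* Illustrative sketch (not compiled): the narrowest type below (short) only
   appears on the RHS of a widening conversion, yet it determines the
   vectorization factor, which is why the function above inspects the RHS of
   promotions as well as the LHS.  Names are made up.  */
#if 0
int
example_widening_sum (const short *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += b[i];   /* short widened to int; smallest scalar type is short.  */
  return sum;
}
#endif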
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
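
/* Illustrative sketch (not compiled): with no information about whether A
   and B overlap, the ddr between the two references is recorded via
   vect_mark_for_runtime_alias_test and the loop gets versioned with a
   runtime overlap check, unless --param vect-max-version-for-alias-checks
   is 0.  Names are made up.  */
#if 0
void
example_runtime_alias (int *a, const int *b, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] + 1;   /* a and b may alias; needs a runtime check.  */
}
#endif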
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* STMT_A and STMT_B belong to overlapping groups. All loads are
286 emitted at the position of the first scalar load.
287 Stores in a group are emitted at the position of the last scalar store.
288 Compute that position and check whether the resulting order matches
289 the current one. */
290 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
291 if (il_a)
293 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
294 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
295 s = DR_GROUP_NEXT_ELEMENT (s))
296 il_a = get_later_stmt (il_a, s);
297 else /* DR_IS_READ */
298 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
299 s = DR_GROUP_NEXT_ELEMENT (s))
300 if (get_later_stmt (il_a, s) == il_a)
301 il_a = s;
303 else
304 il_a = stmtinfo_a;
305 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
306 if (il_b)
308 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
309 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
310 s = DR_GROUP_NEXT_ELEMENT (s))
311 il_b = get_later_stmt (il_b, s);
312 else /* DR_IS_READ */
313 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
314 s = DR_GROUP_NEXT_ELEMENT (s))
315 if (get_later_stmt (il_b, s) == il_b)
316 il_b = s;
318 else
319 il_b = stmtinfo_b;
320 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
321 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
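
/* Illustrative sketch (not compiled): grouped accesses to the same array
   where the vector load is emitted at the position of the first scalar load
   of the group and the vector store at the last scalar store.  The check
   above verifies that this repositioning does not swap a load with a store
   it originally preceded or followed.  Names are made up.  */
#if 0
void
example_grouped_order (int *a, int n)
{
  for (int i = 0; i < n - 1; i += 2)
    {
      int x = a[i];       /* group load  { a[i], a[i+1] } emitted here.  */
      int y = a[i + 1];
      a[i] = y;           /* group store { a[i], a[i+1] } emitted at the  */
      a[i + 1] = x;       /* last scalar store of the group, i.e. here.  */
    }
}
#endif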
324 /* A subroutine of vect_analyze_data_ref_dependence. Handle
325 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
326 distances. These distances are conservatively correct but they don't
327 reflect a guaranteed dependence.
329 Return true if this function does all the work necessary to avoid
330 an alias or false if the caller should use the dependence distances
331 to limit the vectorization factor in the usual way. LOOP_DEPTH is
332 the depth of the loop described by LOOP_VINFO and the other arguments
333 are as for vect_analyze_data_ref_dependence. */
335 static bool
336 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
337 loop_vec_info loop_vinfo,
338 int loop_depth, unsigned int *max_vf)
340 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
341 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
343 int dist = dist_v[loop_depth];
344 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
346 /* If the user asserted safelen >= DIST consecutive iterations
347 can be executed concurrently, assume independence.
349 ??? An alternative would be to add the alias check even
350 in this case, and vectorize the fallback loop with the
351 maximum VF set to safelen. However, if the user has
352 explicitly given a length, it's less likely that that
353 would be a win. */
354 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
356 if ((unsigned int) loop->safelen < *max_vf)
357 *max_vf = loop->safelen;
358 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
359 continue;
362 /* For dependence distances of 2 or more, we have the option
363 of limiting VF or checking for an alias at runtime.
364 Prefer to check at runtime if we can, to avoid limiting
365 the VF unnecessarily when the bases are in fact independent.
367 Note that the alias checks will be removed if the VF ends up
368 being small enough. */
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
371 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
372 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
373 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
376 return true;
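
/* Illustrative sketch (not compiled) of the kind of situation this path
   handles: A and B have the same access pattern but different base
   pointers, so distances computed as if the bases were equal are
   conservatively correct without reflecting a guaranteed dependence.  With
   safelen(8) the distance of 4 is assumed away and the VF is merely capped
   at 8; without it the vectorizer prefers a runtime alias check over
   limiting the VF to 4.  Names are made up.  */
#if 0
void
example_possibly_independent (int *a, const int *b, int n)
{
  #pragma omp simd safelen(8)
  for (int i = 0; i < n; i++)
    a[i + 4] = b[i] + 1;   /* distance 4 if a == b + 4*..., none otherwise.  */
}
#endif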
380 /* Function vect_analyze_data_ref_dependence.
382 FIXME: I needed to change the sense of the returned flag.
384 Return FALSE if there (might) exist a dependence between a memory-reference
385 DRA and a memory-reference DRB. When versioning for alias may check a
386 dependence at run-time, return TRUE. Adjust *MAX_VF according to
387 the data dependence. */
389 static opt_result
390 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
391 loop_vec_info loop_vinfo,
392 unsigned int *max_vf)
394 unsigned int i;
395 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
396 struct data_reference *dra = DDR_A (ddr);
397 struct data_reference *drb = DDR_B (ddr);
398 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
399 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
400 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
401 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
402 lambda_vector dist_v;
403 unsigned int loop_depth;
405 /* If user asserted safelen consecutive iterations can be
406 executed concurrently, assume independence. */
407 auto apply_safelen = [&]()
409 if (loop->safelen >= 2)
411 if ((unsigned int) loop->safelen < *max_vf)
412 *max_vf = loop->safelen;
413 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
414 return true;
416 return false;
419 /* In loop analysis all data references should be vectorizable. */
420 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
421 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
422 gcc_unreachable ();
424 /* Independent data accesses. */
425 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
426 return opt_result::success ();
428 if (dra == drb
429 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
430 return opt_result::success ();
432 /* We do not have to consider dependences between accesses that belong
433 to the same group, unless the stride could be smaller than the
434 group size. */
435 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
436 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
437 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
438 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
439 return opt_result::success ();
441 /* Even if we have an anti-dependence then, as the vectorized loop covers at
442 least two scalar iterations, there is always also a true dependence.
443 As the vectorizer does not re-order loads and stores we can ignore
444 the anti-dependence if TBAA can disambiguate both DRs similar to the
445 case with known negative distance anti-dependences (positive
446 distance anti-dependences would violate TBAA constraints). */
447 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
448 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
449 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
450 get_alias_set (DR_REF (drb))))
451 return opt_result::success ();
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 if (apply_safelen ())
457 return opt_result::success ();
459 return opt_result::failure_at
460 (stmtinfo_a->stmt,
461 "possible alias involving gather/scatter between %T and %T\n",
462 DR_REF (dra), DR_REF (drb));
465 /* Unknown data dependence. */
466 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
468 if (apply_safelen ())
469 return opt_result::success ();
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
473 "versioning for alias required: "
474 "can't determine dependence between %T and %T\n",
475 DR_REF (dra), DR_REF (drb));
477 /* Add to list of ddrs that need to be tested at run-time. */
478 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
481 /* Known data dependence. */
482 if (DDR_NUM_DIST_VECTS (ddr) == 0)
484 if (apply_safelen ())
485 return opt_result::success ();
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
489 "versioning for alias required: "
490 "bad dist vector for %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492 /* Add to list of ddrs that need to be tested at run-time. */
493 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
496 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
498 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
499 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
500 loop_depth, max_vf))
501 return opt_result::success ();
503 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
505 int dist = dist_v[loop_depth];
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "dependence distance = %d.\n", dist);
511 if (dist == 0)
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance == 0 between %T and %T\n",
516 DR_REF (dra), DR_REF (drb));
518 /* When we perform grouped accesses and perform implicit CSE
519 by detecting equal accesses and doing disambiguation with
520 runtime alias tests like for
521 .. = a[i];
522 .. = a[i+1];
523 a[i] = ..;
524 a[i+1] = ..;
525 *p = ..;
526 .. = a[i];
527 .. = a[i+1];
528 where we will end up loading { a[i], a[i+1] } once, make
529 sure that inserting group loads before the first load and
530 stores after the last store will do the right thing.
531 Similar for groups like
532 a[i] = ...;
533 ... = a[i];
534 a[i+1] = ...;
535 where loads from the group interleave with the store. */
536 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
537 return opt_result::failure_at (stmtinfo_a->stmt,
538 "READ_WRITE dependence"
539 " in interleaving.\n");
541 if (loop->safelen < 2)
543 tree indicator = dr_zero_step_indicator (dra);
544 if (!indicator || integer_zerop (indicator))
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "access also has a zero step\n");
547 else if (TREE_CODE (indicator) != INTEGER_CST)
548 vect_check_nonzero_value (loop_vinfo, indicator);
550 continue;
553 if (dist > 0 && DDR_REVERSED_P (ddr))
555 /* If DDR_REVERSED_P the order of the data-refs in DDR was
556 reversed (to make distance vector positive), and the actual
557 distance is negative. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "dependence distance negative.\n");
561 /* When doing outer loop vectorization, we need to check if there is
562 a backward dependence at the inner loop level if the dependence
563 at the outer loop is reversed. See PR81740. */
564 if (nested_in_vect_loop_p (loop, stmtinfo_a)
565 || nested_in_vect_loop_p (loop, stmtinfo_b))
567 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
568 DDR_LOOP_NEST (ddr));
569 if (dist_v[inner_depth] < 0)
570 return opt_result::failure_at (stmtinfo_a->stmt,
571 "not vectorized, dependence "
572 "between data-refs %T and %T\n",
573 DR_REF (dra), DR_REF (drb));
575 /* Record a negative dependence distance to later limit the
576 amount of stmt copying / unrolling we can perform.
577 Only need to handle read-after-write dependence. */
578 if (DR_IS_READ (drb)
579 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
580 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
581 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
582 continue;
585 unsigned int abs_dist = abs (dist);
586 if (abs_dist >= 2 && abs_dist < *max_vf)
588 /* The dependence distance requires reduction of the maximal
589 vectorization factor. */
590 *max_vf = abs_dist;
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "adjusting maximal vectorization factor to %i\n",
594 *max_vf);
597 if (abs_dist >= *max_vf)
599 /* Dependence distance does not create dependence, as far as
600 vectorization is concerned, in this case. */
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "dependence distance >= VF.\n");
604 continue;
607 return opt_result::failure_at (stmtinfo_a->stmt,
608 "not vectorized, possible dependence "
609 "between data-refs %T and %T\n",
610 DR_REF (dra), DR_REF (drb));
613 return opt_result::success ();
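
/* Illustrative sketch (not compiled): a read-after-write dependence with a
   known distance of 2.  The loop can still be vectorized, but with at most
   two elements per vector iteration, which is the *MAX_VF adjustment
   performed above.  The function name is made up.  */
#if 0
void
example_distance_two (int *a, int n)
{
  for (int i = 0; i < n - 2; i++)
    a[i + 2] = a[i] + 1;   /* dependence distance 2 limits VF to 2.  */
}
#endif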
616 /* Function vect_analyze_early_break_dependences.
618 Examine all the data references in the loop and make sure that if we have
619 multiple exits we are able to safely move stores such that they become
620 safe for vectorization. The function also determines where to move the
621 instructions to and computes what the new vUSE chain should be.
623 This works in tandem with the CFG that will be produced by
624 slpeel_tree_duplicate_loop_to_edge_cfg later on.
626 This function tries to validate whether an early break vectorization
627 is possible for the current instruction sequence. Returns true if
628 possible, otherwise false.
630 Requirements:
631 - Any memory access must be to a fixed size buffer.
632 - There must not be any loads and stores to the same object.
633 - Multiple loads are allowed as long as they don't alias.
635 NOTE:
636 This implementation is very conservative. Any overlapping loads/stores
637 that take place before the early break statement get rejected, aside from
638 WAR dependencies.
640 i.e.:
642 a[i] = 8
643 c = a[i]
644 if (b[i])
647 is not allowed, but
649 c = a[i]
650 a[i] = 8
651 if (b[i])
654 is allowed, which is the common case. */
656 static opt_result
657 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
659 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
661 /* List of all load data references found during traversal. */
662 auto_vec<data_reference *> bases;
663 basic_block dest_bb = NULL;
665 hash_set <gimple *> visited;
666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
667 class loop *loop_nest = loop_outer (loop);
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "loop contains multiple exits, analyzing"
672 " statement dependencies.\n");
674 /* Since we don't support general control flow, the location we'll move the
675 side-effects to is always the latch-connected exit. When we support
676 general control flow we can do better but for now this is fine. */
677 dest_bb = single_pred (loop->latch);
678 basic_block bb = dest_bb;
682 /* If the destination block is also the header then we have nothing to do. */
683 if (!single_pred_p (bb))
684 continue;
686 bb = single_pred (bb);
687 gimple_stmt_iterator gsi = gsi_last_bb (bb);
689 /* Now analyze all the remaining statements and try to determine which
690 instructions are allowed/needed to be moved. */
691 while (!gsi_end_p (gsi))
693 gimple *stmt = gsi_stmt (gsi);
694 gsi_prev (&gsi);
695 if (!gimple_has_ops (stmt)
696 || is_gimple_debug (stmt))
697 continue;
699 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
700 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
701 if (!dr_ref)
702 continue;
704 /* We currently only support statically allocated objects due to
705 not having first-faulting loads support or peeling for
706 alignment support. Compute the size of the referenced object
707 (it could be dynamically allocated). */
708 tree obj = DR_BASE_ADDRESS (dr_ref);
709 if (!obj || TREE_CODE (obj) != ADDR_EXPR)
711 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
713 "early breaks only supported on statically"
714 " allocated objects.\n");
715 return opt_result::failure_at (stmt,
716 "can't safely apply code motion to "
717 "dependencies of %G to vectorize "
718 "the early exit.\n", stmt);
721 tree refop = TREE_OPERAND (obj, 0);
722 tree refbase = get_base_address (refop);
723 if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
724 || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "early breaks only supported on"
729 " statically allocated objects.\n");
730 return opt_result::failure_at (stmt,
731 "can't safely apply code motion to "
732 "dependencies of %G to vectorize "
733 "the early exit.\n", stmt);
736 /* Check if vector accesses to the object will be within bounds. The size
737 must be a constant, or else assume the loop will be versioned or niters
738 bounded by VF so accesses are within range. */
739 if (!ref_within_array_bound (stmt, DR_REF (dr_ref)))
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "early breaks not supported: vectorization "
744 "would %s beyond size of obj.",
745 DR_IS_READ (dr_ref) ? "read" : "write");
746 return opt_result::failure_at (stmt,
747 "can't safely apply code motion to "
748 "dependencies of %G to vectorize "
749 "the early exit.\n", stmt);
752 if (DR_IS_READ (dr_ref))
753 bases.safe_push (dr_ref);
754 else if (DR_IS_WRITE (dr_ref))
756 /* We are moving writes down in the CFG. To be sure that this
757 is valid after vectorization we have to check all the loads
758 we are sinking the stores past to see if any of them may
759 alias or are the same object.
761 Same objects will not be an issue because unless the store
762 is marked volatile the value can be forwarded. If the
763 store is marked volatile we don't vectorize the loop
764 anyway.
766 That leaves the check for aliasing. We don't really need
767 to care about the stores aliasing with each other since the
768 stores are moved in order so the effects are still observed
769 correctly. This leaves the check for WAR dependencies
770 which we would be introducing here if the DR can alias.
771 The check is quadratic in loads/stores but I have not found
772 a better API to do this. I believe all loads and stores
773 must be checked. We also must check them when we
774 encountered the store, since we don't care about loads past
775 the store. */
777 for (auto dr_read : bases)
778 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
780 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
782 vect_location,
783 "early breaks not supported: "
784 "overlapping loads and stores "
785 "found before the break "
786 "statement.\n");
788 return opt_result::failure_at (stmt,
789 "can't safely apply code motion to dependencies"
790 " to vectorize the early exit. %G may alias with"
791 " %G\n", stmt, dr_read->stmt);
795 if (gimple_vdef (stmt))
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE, vect_location,
799 "==> recording stmt %G", stmt);
801 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
803 else if (gimple_vuse (stmt))
805 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
806 if (dump_enabled_p ())
807 dump_printf_loc (MSG_NOTE, vect_location,
808 "marked statement for vUSE update: %G", stmt);
812 while (bb != loop->header);
814 /* We don't allow outer -> inner loop transitions which should have been
815 trapped already during loop form analysis. */
816 gcc_assert (dest_bb->loop_father == loop);
818 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
820 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
822 /* All uses shall be updated to that of the first load. Entries are
823 stored in reverse order. */
824 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
825 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location,
829 "will update use: %T, mem_ref: %G", vuse, g);
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE, vect_location,
835 "recorded statements to be moved to BB %d\n",
836 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
838 return opt_result::success ();
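
/* Illustrative sketch (not compiled): an early-break loop of the form the
   analysis above accepts.  The load of A happens before the break and the
   store to B is sunk to the block after the exit test (dest_bb), so no
   store can be observed for iterations past the break.  Both arrays are
   statically allocated, as required.  Names are made up.  */
#if 0
#define N 1024
int a[N], b[N];

int
example_early_break (int key)
{
  int i;
  for (i = 0; i < N; i++)
    {
      if (a[i] == key)
        break;           /* early exit */
      b[i] = 2 * a[i];   /* store sunk past the exit test */
    }
  return i;
}
#endif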
841 /* Function vect_analyze_data_ref_dependences.
843 Examine all the data references in the loop, and make sure there do not
844 exist any data dependences between them. Set *MAX_VF according to
845 the maximum vectorization factor the data dependences allow. */
847 opt_result
848 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
849 unsigned int *max_vf)
851 unsigned int i;
852 struct data_dependence_relation *ddr;
854 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
856 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
858 LOOP_VINFO_DDRS (loop_vinfo)
859 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
860 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
861 /* We do not need read-read dependences. */
862 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
863 &LOOP_VINFO_DDRS (loop_vinfo),
864 LOOP_VINFO_LOOP_NEST (loop_vinfo),
865 false);
866 gcc_assert (res);
869 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
871 /* For epilogues we either have no aliases or alias versioning
872 was applied to original loop. Therefore we may just get max_vf
873 using VF of original loop. */
874 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
875 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
876 else
877 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
879 opt_result res
880 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
881 if (!res)
882 return res;
885 /* If we have early break statements in the loop, check to see if they
886 are of a form we can vectorize. */
887 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
888 return vect_analyze_early_break_dependences (loop_vinfo);
890 return opt_result::success ();
894 /* Function vect_slp_analyze_data_ref_dependence.
896 Return TRUE if there (might) exist a dependence between a memory-reference
897 DRA and a memory-reference DRB for VINFO. When versioning for alias
898 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
899 according to the data dependence. */
901 static bool
902 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
903 struct data_dependence_relation *ddr)
905 struct data_reference *dra = DDR_A (ddr);
906 struct data_reference *drb = DDR_B (ddr);
907 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
908 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
910 /* We need to check dependences of statements marked as unvectorizable
911 as well, they still can prohibit vectorization. */
913 /* Independent data accesses. */
914 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
915 return false;
917 if (dra == drb)
918 return false;
920 /* Read-read is OK. */
921 if (DR_IS_READ (dra) && DR_IS_READ (drb))
922 return false;
924 /* If dra and drb are part of the same interleaving chain consider
925 them independent. */
926 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
927 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
928 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
929 return false;
931 /* Unknown data dependence. */
932 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
934 if (dump_enabled_p ())
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "can't determine dependence between %T and %T\n",
937 DR_REF (dra), DR_REF (drb));
939 else if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "determined dependence between %T and %T\n",
942 DR_REF (dra), DR_REF (drb));
944 return true;
948 /* Analyze dependences involved in the transform of a store SLP NODE. */
950 static bool
951 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
953 /* This walks over all stmts involved in the SLP store done
954 in NODE verifying we can sink them up to the last stmt in the
955 group. */
956 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
957 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
959 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
961 stmt_vec_info access_info
962 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
963 if (access_info == last_access_info)
964 continue;
965 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
966 ao_ref ref;
967 bool ref_initialized_p = false;
968 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
969 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
971 gimple *stmt = gsi_stmt (gsi);
972 if (! gimple_vuse (stmt))
973 continue;
975 /* If we couldn't record a (single) data reference for this
976 stmt we have to resort to the alias oracle. */
977 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
978 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
979 if (!dr_b)
981 /* We are moving a store - this means
982 we cannot use TBAA for disambiguation. */
983 if (!ref_initialized_p)
984 ao_ref_init (&ref, DR_REF (dr_a));
985 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
986 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
987 return false;
988 continue;
991 gcc_assert (!gimple_visited_p (stmt));
993 ddr_p ddr = initialize_data_dependence_relation (dr_a,
994 dr_b, vNULL);
995 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
996 free_dependence_relation (ddr);
997 if (dependent)
998 return false;
1001 return true;
1004 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1005 contain the vector of scalar stores of this instance if we are
1006 disambiguating the loads. */
1008 static bool
1009 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1010 vec<stmt_vec_info> stores,
1011 stmt_vec_info last_store_info)
1013 /* This walks over all stmts involved in the SLP load done
1014 in NODE verifying we can hoist them up to the first stmt in the
1015 group. */
1016 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1017 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1019 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1021 stmt_vec_info access_info
1022 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1023 if (access_info == first_access_info)
1024 continue;
1025 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1026 ao_ref ref;
1027 bool ref_initialized_p = false;
1028 hash_set<stmt_vec_info> grp_visited;
1029 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1030 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1032 gimple *stmt = gsi_stmt (gsi);
1033 if (! gimple_vdef (stmt))
1034 continue;
1036 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1038 /* If we run into a store of this same instance (we've just
1039 marked those) then delay dependence checking until we run
1040 into the last store because this is where it will have
1041 been sunk to (and we verified that we can do that already). */
1042 if (gimple_visited_p (stmt))
1044 if (stmt_info != last_store_info)
1045 continue;
1047 for (stmt_vec_info &store_info : stores)
1049 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1050 ddr_p ddr = initialize_data_dependence_relation
1051 (dr_a, store_dr, vNULL);
1052 bool dependent
1053 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1054 free_dependence_relation (ddr);
1055 if (dependent)
1056 return false;
1058 continue;
1061 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1063 /* We are hoisting a load - this means we can use TBAA for
1064 disambiguation. */
1065 if (!ref_initialized_p)
1066 ao_ref_init (&ref, DR_REF (dr_a));
1067 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1069 /* If we couldn't record a (single) data reference for this
1070 stmt we have to give up now. */
1071 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1072 if (!dr_b)
1073 return false;
1074 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1075 dr_b, vNULL);
1076 bool dependent
1077 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1078 free_dependence_relation (ddr);
1079 if (dependent)
1080 return false;
1082 /* No dependence. */
1083 return true;
1085 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1087 /* When we run into a store group we have to honor
1088 that earlier stores might be moved here. We don't
1089 know exactly which and where to since we lack a
1090 back-mapping from DR to SLP node, so assume all
1091 earlier stores are sunk here. It's enough to
1092 consider the last stmt of a group for this.
1093 ??? Both this and the fact that we disregard that
1094 the conflicting instance might be removed later
1095 is overly conservative. */
1096 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1097 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1098 store_info != NULL;
1099 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1100 if ((store_info == stmt_info
1101 || get_later_stmt (store_info, stmt_info) == stmt_info)
1102 && !check_hoist (store_info))
1103 return false;
1105 else
1107 if (!check_hoist (stmt_info))
1108 return false;
1112 return true;
1116 /* Function vect_slp_analyze_instance_dependence.
1118 Examine all the data references in the SLP instance, and make sure there
1119 do not exist any data dependences between them that would prevent the
1120 transform. Return true if the instance is vectorizable in this respect. */
1122 bool
1123 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1125 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1127 /* The stores of this instance are at the root of the SLP tree. */
1128 slp_tree store = NULL;
1129 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1130 store = SLP_INSTANCE_TREE (instance);
1132 /* Verify we can sink stores to the vectorized stmt insert location. */
1133 stmt_vec_info last_store_info = NULL;
1134 if (store)
1136 if (! vect_slp_analyze_store_dependences (vinfo, store))
1137 return false;
1139 /* Mark stores in this instance and remember the last one. */
1140 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1141 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1142 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1145 bool res = true;
1147 /* Verify we can sink loads to the vectorized stmt insert location,
1148 special-casing stores of this instance. */
1149 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1150 if (! vect_slp_analyze_load_dependences (vinfo, load,
1151 store
1152 ? SLP_TREE_SCALAR_STMTS (store)
1153 : vNULL, last_store_info))
1155 res = false;
1156 break;
1159 /* Unset the visited flag. */
1160 if (store)
1161 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1162 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1164 return res;
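
/* Illustrative sketch (not compiled): a basic-block SLP instance whose
   stores are sunk to the position of the last scalar store and whose loads
   are hoisted to the first scalar load.  Whether that is valid depends on
   disambiguating the intervening store through P, which is what the
   dependence walks above do.  Names are made up.  */
#if 0
void
example_bb_slp (int *__restrict a, const int *__restrict b, int *p)
{
  a[0] = b[0] + 1;
  *p = 0;            /* must neither clobber b[] nor be clobbered by a[].  */
  a[1] = b[1] + 1;
  a[2] = b[2] + 1;
  a[3] = b[3] + 1;
}
#endif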
1167 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1168 applied. */
1171 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1173 HOST_WIDE_INT diff = 0;
1174 /* Alignment is only analyzed for the first element of a DR group,
1175 use that but adjust misalignment by the offset of the access. */
1176 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1178 dr_vec_info *first_dr
1179 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1180 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1181 INTEGER_CSTs and the first element in the group has the lowest
1182 address. */
1183 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1184 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1185 gcc_assert (diff >= 0);
1186 dr_info = first_dr;
1189 int misalign = dr_info->misalignment;
1190 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1191 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1192 return misalign;
1194 /* If the access is only aligned for a vector type with smaller alignment
1195 requirement the access has unknown misalignment. */
1196 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1197 targetm.vectorize.preferred_vector_alignment (vectype)))
1198 return DR_MISALIGNMENT_UNKNOWN;
1200 /* Apply the offset from the DR group start and the externally supplied
1201 offset which can for example result from a negative stride access. */
1202 poly_int64 misalignment = misalign + diff + offset;
1204 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1205 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1206 unsigned HOST_WIDE_INT target_alignment_c
1207 = dr_info->target_alignment.to_constant ();
1208 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1209 return DR_MISALIGNMENT_UNKNOWN;
1210 return misalign;
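
/* Worked example: assume a group of 4-byte elements whose first element has
   misalignment 4 against a 16-byte target alignment.  For the second
   element of the group DIFF is 4, so with OFFSET 0 the routine above
   returns (4 + 4 + 0) % 16 = 8.  If a negative-step access supplied
   OFFSET -12 instead, the result would be (4 + 4 - 12) % 16 = 12.  */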
1213 /* Record the base alignment guarantee given by DRB, which occurs
1214 in STMT_INFO. */
1216 static void
1217 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1218 innermost_loop_behavior *drb)
1220 bool existed;
1221 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1222 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1223 if (!existed || entry.second->base_alignment < drb->base_alignment)
1225 entry = std::make_pair (stmt_info, drb);
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "recording new base alignment for %T\n"
1229 " alignment: %d\n"
1230 " misalignment: %d\n"
1231 " based on: %G",
1232 drb->base_address,
1233 drb->base_alignment,
1234 drb->base_misalignment,
1235 stmt_info->stmt);
1239 /* If the region we're going to vectorize is reached, all unconditional
1240 data references occur at least once. We can therefore pool the base
1241 alignment guarantees from each unconditional reference. Do this by
1242 going through all the data references in VINFO and checking whether
1243 the containing statement makes the reference unconditionally. If so,
1244 record the alignment of the base address in VINFO so that it can be
1245 used for all other references with the same base. */
1247 void
1248 vect_record_base_alignments (vec_info *vinfo)
1250 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1251 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1252 for (data_reference *dr : vinfo->shared->datarefs)
1254 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1255 stmt_vec_info stmt_info = dr_info->stmt;
1256 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1257 && STMT_VINFO_VECTORIZABLE (stmt_info)
1258 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1260 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1262 /* If DR is nested in the loop that is being vectorized, we can also
1263 record the alignment of the base wrt the outer loop. */
1264 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1265 vect_record_base_alignment
1266 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1271 /* Function vect_compute_data_ref_alignment
1273 Compute the misalignment of the data reference DR_INFO when vectorizing
1274 with VECTYPE.
1276 Output:
1277 1. initialized misalignment info for DR_INFO
1279 FOR NOW: No analysis is actually performed. Misalignment is calculated
1280 only for trivial cases. TODO. */
1282 static void
1283 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1284 tree vectype)
1286 stmt_vec_info stmt_info = dr_info->stmt;
1287 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1288 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1289 class loop *loop = NULL;
1290 tree ref = DR_REF (dr_info->dr);
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_NOTE, vect_location,
1294 "vect_compute_data_ref_alignment:\n");
1296 if (loop_vinfo)
1297 loop = LOOP_VINFO_LOOP (loop_vinfo);
1299 /* Initialize misalignment to unknown. */
1300 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1302 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1303 return;
1305 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1306 bool step_preserves_misalignment_p;
1308 poly_uint64 vector_alignment
1309 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1310 BITS_PER_UNIT);
1311 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1313 /* If the main loop has peeled for alignment we have no way of knowing
1314 whether the data accesses in the epilogues are aligned. We can't at
1315 compile time answer the question whether we have entered the main loop or
1316 not. Fixes PR 92351. */
1317 if (loop_vinfo)
1319 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1320 if (orig_loop_vinfo
1321 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1322 return;
1325 unsigned HOST_WIDE_INT vect_align_c;
1326 if (!vector_alignment.is_constant (&vect_align_c))
1327 return;
1329 /* No step for BB vectorization. */
1330 if (!loop)
1332 gcc_assert (integer_zerop (drb->step));
1333 step_preserves_misalignment_p = true;
1336 /* In case the dataref is in an inner-loop of the loop that is being
1337 vectorized (LOOP), we use the base and misalignment information
1338 relative to the outer-loop (LOOP). This is ok only if the misalignment
1339 stays the same throughout the execution of the inner-loop, which is why
1340 we have to check that the stride of the dataref in the inner-loop evenly
1341 divides by the vector alignment. */
1342 else if (nested_in_vect_loop_p (loop, stmt_info))
1344 step_preserves_misalignment_p
1345 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1347 if (dump_enabled_p ())
1349 if (step_preserves_misalignment_p)
1350 dump_printf_loc (MSG_NOTE, vect_location,
1351 "inner step divides the vector alignment.\n");
1352 else
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "inner step doesn't divide the vector"
1355 " alignment.\n");
1359 /* Similarly we can only use base and misalignment information relative to
1360 an innermost loop if the misalignment stays the same throughout the
1361 execution of the loop. As above, this is the case if the stride of
1362 the dataref evenly divides by the alignment. */
1363 else
1365 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1366 step_preserves_misalignment_p
1367 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1369 if (!step_preserves_misalignment_p && dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "step doesn't divide the vector alignment.\n");
1374 unsigned int base_alignment = drb->base_alignment;
1375 unsigned int base_misalignment = drb->base_misalignment;
1377 /* Calculate the maximum of the pooled base address alignment and the
1378 alignment that we can compute for DR itself. */
1379 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1380 = base_alignments->get (drb->base_address);
1381 if (entry
1382 && base_alignment < (*entry).second->base_alignment
1383 && (loop_vinfo
1384 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1385 gimple_bb (entry->first->stmt))
1386 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1387 || (entry->first->dr_aux.group <= dr_info->group)))))
1389 base_alignment = entry->second->base_alignment;
1390 base_misalignment = entry->second->base_misalignment;
1393 if (drb->offset_alignment < vect_align_c
1394 || !step_preserves_misalignment_p
1395 /* We need to know whether the step wrt the vectorized loop is
1396 negative when computing the starting misalignment below. */
1397 || TREE_CODE (drb->step) != INTEGER_CST)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "Unknown alignment for access: %T\n", ref);
1402 return;
1405 if (base_alignment < vect_align_c)
1407 unsigned int max_alignment;
1408 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1409 if (max_alignment < vect_align_c
1410 || !vect_can_force_dr_alignment_p (base,
1411 vect_align_c * BITS_PER_UNIT))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "can't force alignment of ref: %T\n", ref);
1416 return;
1419 /* Force the alignment of the decl.
1420 NOTE: This is the only change to the code we make during
1421 the analysis phase, before deciding to vectorize the loop. */
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "force alignment of %T\n", ref);
1426 dr_info->base_decl = base;
1427 dr_info->base_misaligned = true;
1428 base_misalignment = 0;
1430 poly_int64 misalignment
1431 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1433 unsigned int const_misalignment;
1434 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "Non-constant misalignment for access: %T\n", ref);
1439 return;
1442 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1444 if (dump_enabled_p ())
1445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446 "misalign = %d bytes of ref %T\n",
1447 const_misalignment, ref);
1449 return;
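
/* Illustrative sketch (not compiled): a file-scope array whose declared
   alignment may be below the preferred vector alignment.  Because the base
   is a decl whose alignment the vectorizer is allowed to force (the
   base_misaligned path above), the loads from it can be treated as aligned
   after analysis.  Names are made up.  */
#if 0
short x[1000];

int
example_force_alignment (int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += x[i];   /* x's alignment can be raised to the vector alignment.  */
  return sum;
}
#endif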
1452 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1453 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1454 is made aligned via peeling. */
1456 static bool
1457 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1458 dr_vec_info *dr_peel_info)
1460 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1461 DR_TARGET_ALIGNMENT (dr_info)))
1463 poly_offset_int diff
1464 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1465 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1466 if (known_eq (diff, 0)
1467 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1468 return true;
1470 return false;
1473 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1474 aligned via peeling. */
1476 static bool
1477 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1478 dr_vec_info *dr_peel_info)
1480 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1481 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1482 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1483 DR_OFFSET (dr_peel_info->dr), 0)
1484 || !operand_equal_p (DR_STEP (dr_info->dr),
1485 DR_STEP (dr_peel_info->dr), 0))
1486 return false;
1488 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1491 /* Compute the value for dr_info->misalign so that the access appears
1492 aligned. This is used by peeling to compensate for dr_misalignment
1493 applying the offset for negative step. */
1496 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1498 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1499 return 0;
1501 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1502 poly_int64 misalignment
1503 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1504 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1506 unsigned HOST_WIDE_INT target_alignment_c;
1507 int misalign;
1508 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1509 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1510 return DR_MISALIGNMENT_UNKNOWN;
1511 return misalign;
1514 /* Function vect_update_misalignment_for_peel.
1515 Sets DR_INFO's misalignment
1516 - to 0 if it has the same alignment as DR_PEEL_INFO,
1517 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1518 - to -1 (unknown) otherwise.
1520 DR_INFO - the data reference whose misalignment is to be adjusted.
1521 DR_PEEL_INFO - the data reference whose misalignment is being made
1522 zero in the vector loop by the peel.
1523 NPEEL - the number of iterations in the peel loop if the misalignment
1524 of DR_PEEL_INFO is known at compile time. */
1526 static void
1527 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1528 dr_vec_info *dr_peel_info, int npeel)
1530 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1531 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1533 SET_DR_MISALIGNMENT (dr_info,
1534 vect_dr_misalign_for_aligned_access (dr_peel_info));
1535 return;
1538 unsigned HOST_WIDE_INT alignment;
1539 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1540 && known_alignment_for_access_p (dr_info,
1541 STMT_VINFO_VECTYPE (dr_info->stmt))
1542 && known_alignment_for_access_p (dr_peel_info,
1543 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1545 int misal = dr_info->misalignment;
1546 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1547 misal &= alignment - 1;
1548 set_dr_misalignment (dr_info, misal);
1549 return;
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1554 "to unknown (-1).\n");
1555 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
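
/* Worked example: assume DR_INFO has known misalignment 8 against a
   constant 16-byte target alignment, a step of 4 bytes, and the peel
   removes NPEEL = 3 scalar iterations.  The update above computes
   (8 + 3 * 4) & (16 - 1) = 4 as the new misalignment.  If either
   misalignment were unknown, it would instead be reset to -1.  */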
1558 /* Return true if alignment is relevant for DR_INFO. */
1560 static bool
1561 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1563 stmt_vec_info stmt_info = dr_info->stmt;
1565 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1566 return false;
1568 /* For interleaving, only the alignment of the first access matters. */
1569 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1570 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1571 return false;
1573 /* Scatter-gather and invariant accesses continue to address individual
1574 scalars, so vector-level alignment is irrelevant. */
1575 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1576 || integer_zerop (DR_STEP (dr_info->dr)))
1577 return false;
1579 /* Strided accesses perform only component accesses, alignment is
1580 irrelevant for them. */
1581 if (STMT_VINFO_STRIDED_P (stmt_info)
1582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1583 return false;
1585 return true;
1588 /* Given a memory reference EXP, return whether its alignment is less
1589 than its size. */
1591 static bool
1592 not_size_aligned (tree exp)
1594 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1595 return true;
1597 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1598 > get_object_alignment (exp));
1601 /* Function vector_alignment_reachable_p
1603 Return true if vector alignment for DR_INFO is reachable by peeling
1604 a few loop iterations. Return false otherwise. */
1606 static bool
1607 vector_alignment_reachable_p (dr_vec_info *dr_info)
1609 stmt_vec_info stmt_info = dr_info->stmt;
1610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1614 /* For interleaved accesses we peel only if the number of iterations in
1615 the prologue loop (VF - misalignment) is a multiple of the
1616 number of interleaved accesses. */
1617 int elem_size, mis_in_elements;
1619 /* FORNOW: handle only known alignment. */
1620 if (!known_alignment_for_access_p (dr_info, vectype))
1621 return false;
1623 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1624 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1625 elem_size = vector_element_size (vector_size, nelements);
1626 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1628 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1629 return false;
1632 /* If the misalignment is known at compile time then allow peeling
1633 only if natural alignment is reachable through peeling. */
1634 if (known_alignment_for_access_p (dr_info, vectype)
1635 && !aligned_access_p (dr_info, vectype))
1637 HOST_WIDE_INT elmsize =
1638 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1639 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_NOTE, vect_location,
1642 "data size = %wd. misalignment = %d.\n", elmsize,
1643 dr_misalignment (dr_info, vectype));
1645 if (dr_misalignment (dr_info, vectype) % elmsize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 "data size does not divide the misalignment.\n");
1650 return false;
1654 if (!known_alignment_for_access_p (dr_info, vectype))
1656 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1657 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "Unknown misalignment, %snaturally aligned\n",
1661 is_packed ? "not " : "");
1662 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1665 return true;
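
/* Worked example: with 16-byte vectors of 4-byte ints and a known
   misalignment of 8 bytes, peeling (16 - 8) / 4 = 2 scalar iterations makes
   the access aligned, so alignment is reachable.  With a misalignment of
   6 bytes the element size does not divide the misalignment and the routine
   above returns false.  */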
1669 /* Calculate the cost of the memory access represented by DR_INFO. */
1671 static void
1672 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1673 dr_alignment_support alignment_support_scheme,
1674 int misalignment,
1675 unsigned int *inside_cost,
1676 unsigned int *outside_cost,
1677 stmt_vector_for_cost *body_cost_vec,
1678 stmt_vector_for_cost *prologue_cost_vec)
1680 stmt_vec_info stmt_info = dr_info->stmt;
1681 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1682 int ncopies;
1684 if (PURE_SLP_STMT (stmt_info))
1685 ncopies = 1;
1686 else
1687 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1689 if (DR_IS_READ (dr_info->dr))
1690 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1691 misalignment, true, inside_cost,
1692 outside_cost, prologue_cost_vec, body_cost_vec, false);
1693 else
1694 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1695 misalignment, inside_cost, body_cost_vec);
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_NOTE, vect_location,
1699 "vect_get_data_access_cost: inside_cost = %d, "
1700 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1704 typedef struct _vect_peel_info
1706 dr_vec_info *dr_info;
1707 int npeel;
1708 unsigned int count;
1709 } *vect_peel_info;
1711 typedef struct _vect_peel_extended_info
1713 vec_info *vinfo;
1714 struct _vect_peel_info peel_info;
1715 unsigned int inside_cost;
1716 unsigned int outside_cost;
1717 } *vect_peel_extended_info;
1720 /* Peeling hashtable helpers. */
1722 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1724 static inline hashval_t hash (const _vect_peel_info *);
1725 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1728 inline hashval_t
1729 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1731 return (hashval_t) peel_info->npeel;
1734 inline bool
1735 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1737 return (a->npeel == b->npeel);
1741 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1743 static void
1744 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1745 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1746 int npeel, bool supportable_if_not_aligned)
1748 struct _vect_peel_info elem, *slot;
1749 _vect_peel_info **new_slot;
1751 elem.npeel = npeel;
1752 slot = peeling_htab->find (&elem);
1753 if (slot)
1754 slot->count++;
1755 else
1757 slot = XNEW (struct _vect_peel_info);
1758 slot->npeel = npeel;
1759 slot->dr_info = dr_info;
1760 slot->count = 1;
1761 new_slot = peeling_htab->find_slot (slot, INSERT);
1762 *new_slot = slot;
1765 /* If this DR is not supported with unknown misalignment then bias
1766 this slot when the cost model is disabled. */
1767 if (!supportable_if_not_aligned
1768 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1769 slot->count += VECT_MAX_COST;
1773 /* Traverse the peeling hash table to find the peeling option that aligns
1774 the maximum number of data accesses. */
1777 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1778 _vect_peel_extended_info *max)
1780 vect_peel_info elem = *slot;
1782 if (elem->count > max->peel_info.count
1783 || (elem->count == max->peel_info.count
1784 && max->peel_info.npeel > elem->npeel))
1786 max->peel_info.npeel = elem->npeel;
1787 max->peel_info.count = elem->count;
1788 max->peel_info.dr_info = elem->dr_info;
1791 return 1;
1794 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1795 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1796 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1797 after peeling. */
1799 static void
1800 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1801 dr_vec_info *dr0_info,
1802 unsigned int *inside_cost,
1803 unsigned int *outside_cost,
1804 stmt_vector_for_cost *body_cost_vec,
1805 stmt_vector_for_cost *prologue_cost_vec,
1806 unsigned int npeel)
1808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1810 bool dr0_alignment_known_p
1811 = (dr0_info
1812 && known_alignment_for_access_p (dr0_info,
1813 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1815 for (data_reference *dr : datarefs)
1817 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1818 if (!vect_relevant_for_alignment_p (dr_info))
1819 continue;
1821 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1822 dr_alignment_support alignment_support_scheme;
1823 int misalignment;
1824 unsigned HOST_WIDE_INT alignment;
1826 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1827 size_zero_node) < 0;
1828 poly_int64 off = 0;
1829 if (negative)
1830 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1831 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1833 if (npeel == 0)
1834 misalignment = dr_misalignment (dr_info, vectype, off);
1835 else if (dr_info == dr0_info
1836 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1837 misalignment = 0;
1838 else if (!dr0_alignment_known_p
1839 || !known_alignment_for_access_p (dr_info, vectype)
1840 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1841 misalignment = DR_MISALIGNMENT_UNKNOWN;
1842 else
1844 misalignment = dr_misalignment (dr_info, vectype, off);
1845 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1846 misalignment &= alignment - 1;
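/* Worked example (hypothetical numbers): a DR with misalignment 8,
   step 4 and a 16-byte target alignment becomes (8 + 2*4) & 15 = 0,
   i.e. aligned, after peeling NPEEL = 2 iterations, but
   (8 + 1*4) & 15 = 12 after peeling only one. */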
1848 alignment_support_scheme
1849 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1850 misalignment);
1852 vect_get_data_access_cost (loop_vinfo, dr_info,
1853 alignment_support_scheme, misalignment,
1854 inside_cost, outside_cost,
1855 body_cost_vec, prologue_cost_vec);
1859 /* Traverse peeling hash table and calculate cost for each peeling option.
1860 Find the one with the lowest cost. */
1863 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1864 _vect_peel_extended_info *min)
1866 vect_peel_info elem = *slot;
1867 int dummy;
1868 unsigned int inside_cost = 0, outside_cost = 0;
1869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1870 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1871 epilogue_cost_vec;
1873 prologue_cost_vec.create (2);
1874 body_cost_vec.create (2);
1875 epilogue_cost_vec.create (2);
1877 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1878 &outside_cost, &body_cost_vec,
1879 &prologue_cost_vec, elem->npeel);
1881 body_cost_vec.release ();
1883 outside_cost += vect_get_known_peeling_cost
1884 (loop_vinfo, elem->npeel, &dummy,
1885 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1886 &prologue_cost_vec, &epilogue_cost_vec);
1888 /* Prologue and epilogue costs are added to the target model later.
1889 These costs depend only on the scalar iteration cost, the
1890 number of peeling iterations finally chosen, and the number of
1891 misaligned statements. So discard the information found here. */
1892 prologue_cost_vec.release ();
1893 epilogue_cost_vec.release ();
1895 if (inside_cost < min->inside_cost
1896 || (inside_cost == min->inside_cost
1897 && outside_cost < min->outside_cost))
1899 min->inside_cost = inside_cost;
1900 min->outside_cost = outside_cost;
1901 min->peel_info.dr_info = elem->dr_info;
1902 min->peel_info.npeel = elem->npeel;
1903 min->peel_info.count = elem->count;
1906 return 1;
1910 /* Choose best peeling option by traversing peeling hash table and either
1911 choosing an option with the lowest cost (if cost model is enabled) or the
1912 option that aligns as many accesses as possible. */
1914 static struct _vect_peel_extended_info
1915 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1916 loop_vec_info loop_vinfo)
1918 struct _vect_peel_extended_info res;
1920 res.peel_info.dr_info = NULL;
1921 res.vinfo = loop_vinfo;
1923 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1925 res.inside_cost = INT_MAX;
1926 res.outside_cost = INT_MAX;
1927 peeling_htab->traverse <_vect_peel_extended_info *,
1928 vect_peeling_hash_get_lowest_cost> (&res);
1930 else
1932 res.peel_info.count = 0;
1933 peeling_htab->traverse <_vect_peel_extended_info *,
1934 vect_peeling_hash_get_most_frequent> (&res);
1935 res.inside_cost = 0;
1936 res.outside_cost = 0;
1939 return res;
1942 /* Return true if the new peeling NPEEL is supported. */
1944 static bool
1945 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1946 unsigned npeel)
1948 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1949 enum dr_alignment_support supportable_dr_alignment;
1951 bool dr0_alignment_known_p
1952 = known_alignment_for_access_p (dr0_info,
1953 STMT_VINFO_VECTYPE (dr0_info->stmt));
1955 /* Ensure that all data refs can be vectorized after the peel. */
1956 for (data_reference *dr : datarefs)
1958 if (dr == dr0_info->dr)
1959 continue;
1961 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1962 if (!vect_relevant_for_alignment_p (dr_info)
1963 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1964 continue;
1966 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1967 int misalignment;
1968 unsigned HOST_WIDE_INT alignment;
1969 if (!dr0_alignment_known_p
1970 || !known_alignment_for_access_p (dr_info, vectype)
1971 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1972 misalignment = DR_MISALIGNMENT_UNKNOWN;
1973 else
1975 misalignment = dr_misalignment (dr_info, vectype);
1976 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1977 misalignment &= alignment - 1;
1979 supportable_dr_alignment
1980 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1981 misalignment);
1982 if (supportable_dr_alignment == dr_unaligned_unsupported)
1983 return false;
1986 return true;
1989 /* Compare two data-references DRA and DRB to group them into chunks
1990 with related alignment. */
1992 static int
1993 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1995 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1996 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1997 int cmp;
1999 /* Stabilize sort. */
2000 if (dra == drb)
2001 return 0;
2003 /* Ordering of DRs according to base. */
2004 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2005 DR_BASE_ADDRESS (drb));
2006 if (cmp != 0)
2007 return cmp;
2009 /* And according to DR_OFFSET. */
2010 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2011 if (cmp != 0)
2012 return cmp;
2014 /* And after step. */
2015 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2016 if (cmp != 0)
2017 return cmp;
2019 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2020 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2021 if (cmp == 0)
2022 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2023 return cmp;
2026 /* Function vect_enhance_data_refs_alignment
2028 This pass will use loop versioning and loop peeling in order to enhance
2029 the alignment of data references in the loop.
2031 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2032 original loop is to be vectorized. Any other loops that are created by
2033 the transformations performed in this pass are not supposed to be
2034 vectorized. This restriction will be relaxed.
2036 This pass will require a cost model to guide it on whether to apply peeling
2037 or versioning or a combination of the two. For example, the scheme that
2038 Intel uses, when given a loop with several memory accesses, is as follows:
2039 choose one memory access ('p') whose alignment you want to force by doing
2040 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2041 other accesses are not necessarily aligned, or (2) use loop versioning to
2042 generate one loop in which all accesses are aligned, and another loop in
2043 which only 'p' is necessarily aligned.
2045 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2046 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2047 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2049 Devising a cost model is the most critical aspect of this work. It will
2050 guide us on which access to peel for, whether to use loop versioning, how
2051 many versions to create, etc. The cost model will probably consist of
2052 generic considerations as well as target specific considerations (on
2053 powerpc for example, misaligned stores are more painful than misaligned
2054 loads).
2056 Here are the general steps involved in alignment enhancements:
2058 -- original loop, before alignment analysis:
2059 for (i=0; i<N; i++){
2060 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2061 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2064 -- After vect_compute_data_refs_alignment:
2065 for (i=0; i<N; i++){
2066 x = q[i]; # DR_MISALIGNMENT(q) = 3
2067 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2070 -- Possibility 1: we do loop versioning:
2071 if (p is aligned) {
2072 for (i=0; i<N; i++){ # loop 1A
2073 x = q[i]; # DR_MISALIGNMENT(q) = 3
2074 p[i] = y; # DR_MISALIGNMENT(p) = 0
2077 else {
2078 for (i=0; i<N; i++){ # loop 1B
2079 x = q[i]; # DR_MISALIGNMENT(q) = 3
2080 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2084 -- Possibility 2: we do loop peeling:
2085 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2086 x = q[i];
2087 p[i] = y;
2089 for (i = 3; i < N; i++){ # loop 2A
2090 x = q[i]; # DR_MISALIGNMENT(q) = 0
2091 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2094 -- Possibility 3: combination of loop peeling and versioning:
2095 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2096 x = q[i];
2097 p[i] = y;
2099 if (p is aligned) {
2100 for (i = 3; i<N; i++){ # loop 3A
2101 x = q[i]; # DR_MISALIGNMENT(q) = 0
2102 p[i] = y; # DR_MISALIGNMENT(p) = 0
2105 else {
2106 for (i = 3; i<N; i++){ # loop 3B
2107 x = q[i]; # DR_MISALIGNMENT(q) = 0
2108 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2112 These loops are later passed to loop_transform to be vectorized. The
2113 vectorizer will use the alignment information to guide the transformation
2114 (whether to generate regular loads/stores, or with special handling for
2115 misalignment). */
2117 opt_result
2118 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2120 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2121 dr_vec_info *first_store = NULL;
2122 dr_vec_info *dr0_info = NULL;
2123 struct data_reference *dr;
2124 unsigned int i;
2125 bool do_peeling = false;
2126 bool do_versioning = false;
2127 unsigned int npeel = 0;
2128 bool one_misalignment_known = false;
2129 bool one_misalignment_unknown = false;
2130 bool one_dr_unsupportable = false;
2131 dr_vec_info *unsupportable_dr_info = NULL;
2132 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2133 hash_table<peel_info_hasher> peeling_htab (1);
2135 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2137 /* Reset data so we can safely be called multiple times. */
2138 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2139 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2141 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2142 return opt_result::success ();
2144 /* Sort the vector of datarefs so DRs that have the same or dependent
2145 alignment are next to each other. */
2146 auto_vec<data_reference_p> datarefs
2147 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2148 datarefs.qsort (dr_align_group_sort_cmp);
2150 /* For each dataref, compute the number of other DRs that would become
2151 aligned if we peeled so that this dataref becomes aligned. */
2152 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2153 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2154 unsigned i0;
2155 for (i0 = 0; i0 < datarefs.length (); ++i0)
2156 if (DR_BASE_ADDRESS (datarefs[i0]))
2157 break;
2158 for (i = i0 + 1; i <= datarefs.length (); ++i)
2160 if (i == datarefs.length ()
2161 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2162 DR_BASE_ADDRESS (datarefs[i]), 0)
2163 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2164 DR_OFFSET (datarefs[i]), 0)
2165 || !operand_equal_p (DR_STEP (datarefs[i0]),
2166 DR_STEP (datarefs[i]), 0))
2168 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2169 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2170 will get known misalignment if we align one of the refs
2171 with the largest DR_TARGET_ALIGNMENT. */
2172 for (unsigned j = i0; j < i; ++j)
2174 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2175 for (unsigned k = i0; k < i; ++k)
2177 if (k == j)
2178 continue;
2179 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2180 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2181 dr_infoj))
2182 n_same_align_refs[j]++;
2185 i0 = i;
2189 /* While cost model enhancements are expected in the future, the high level
2190 view of the code at this time is as follows:
2192 A) If there is a misaligned access then see if peeling to align
2193 this access can make all data references satisfy
2194 vect_supportable_dr_alignment. If so, update data structures
2195 as needed and return true.
2197 B) If peeling wasn't possible and there is a data reference with an
2198 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2199 then see if loop versioning checks can be used to make all data
2200 references satisfy vect_supportable_dr_alignment. If so, update
2201 data structures as needed and return true.
2203 C) If neither peeling nor versioning were successful then return false if
2204 any data reference does not satisfy vect_supportable_dr_alignment.
2206 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2208 Note, Possibility 3 above (which is peeling and versioning together) is not
2209 being done at this time. */
2211 /* (1) Peeling to force alignment. */
2213 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2214 Considerations:
2215 + How many accesses will become aligned due to the peeling
2216 - How many accesses will become unaligned due to the peeling,
2217 and the cost of misaligned accesses.
2218 - The cost of peeling (the extra runtime checks, the increase
2219 in code size). */
2221 FOR_EACH_VEC_ELT (datarefs, i, dr)
2223 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2224 if (!vect_relevant_for_alignment_p (dr_info))
2225 continue;
2227 stmt_vec_info stmt_info = dr_info->stmt;
2228 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2229 do_peeling = vector_alignment_reachable_p (dr_info);
2230 if (do_peeling)
2232 if (known_alignment_for_access_p (dr_info, vectype))
2234 unsigned int npeel_tmp = 0;
2235 bool negative = tree_int_cst_compare (DR_STEP (dr),
2236 size_zero_node) < 0;
2238 /* If known_alignment_for_access_p then we have set
2239 DR_MISALIGNMENT, which is only done if we know it at compile
2240 time, so it is safe to assume the target alignment is constant. */
2242 unsigned int target_align =
2243 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2244 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2245 poly_int64 off = 0;
2246 if (negative)
2247 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2248 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 if (mis != 0)
2251 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2253 /* For multiple types, it is possible that the bigger type access
2254 will have more than one peeling option. E.g., a loop with two
2255 types: one of size (vector size / 4), and the other one of
2256 size (vector size / 8). Vectorization factor will be 8. If both
2257 accesses are misaligned by 3, the first one needs one scalar
2258 iteration to be aligned, and the second one needs 5. But the
2259 first one will be aligned also by peeling 5 scalar
2260 iterations, and in that case both accesses will be aligned.
2261 Hence, in addition to the immediate peeling amount, we also want
2262 to try adding a full vector size, as long as we don't exceed the
2263 vectorization factor.
2264 We do this automatically for the cost model, since we calculate
2265 cost for every peeling option. */
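/* Worked example (hypothetical numbers): with 4-byte elements, a
   16-byte target alignment and a misalignment of 4 bytes, the
   immediate peeling amount is (-4 & 15) / 4 = 12 / 4 = 3 iterations;
   with a vectorization factor of 8 we would also record 3 + 16/4 = 7
   as a peeling option, but stop before 11 since it exceeds VF. */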
2266 poly_uint64 nscalars = npeel_tmp;
2267 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2269 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2270 nscalars = (STMT_SLP_TYPE (stmt_info)
2271 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2274 /* Save info about DR in the hash table. Also include peeling
2275 amounts according to the explanation above. Indicate
2276 the alignment status when the ref is not aligned.
2277 ??? Rather than using unknown alignment here we should
2278 prune all entries from the peeling hashtable which cause
2279 DRs to be not supported. */
2280 bool supportable_if_not_aligned
2281 = vect_supportable_dr_alignment
2282 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2283 while (known_le (npeel_tmp, nscalars))
2285 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2286 dr_info, npeel_tmp,
2287 supportable_if_not_aligned);
2288 npeel_tmp += MAX (1, target_align / dr_size);
2291 one_misalignment_known = true;
2293 else
2295 /* If we don't know any misalignment values, we prefer
2296 peeling for data-ref that has the maximum number of data-refs
2297 with the same alignment, unless the target prefers to align
2298 stores over loads. */
2299 unsigned same_align_drs = n_same_align_refs[i];
2300 if (!dr0_info
2301 || dr0_same_align_drs < same_align_drs)
2303 dr0_same_align_drs = same_align_drs;
2304 dr0_info = dr_info;
2306 /* For data-refs with the same number of related
2307 accesses prefer the one where the misalign
2308 computation will be invariant in the outermost loop. */
2309 else if (dr0_same_align_drs == same_align_drs)
2311 class loop *ivloop0, *ivloop;
2312 ivloop0 = outermost_invariant_loop_for_expr
2313 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2314 ivloop = outermost_invariant_loop_for_expr
2315 (loop, DR_BASE_ADDRESS (dr));
2316 if ((ivloop && !ivloop0)
2317 || (ivloop && ivloop0
2318 && flow_loop_nested_p (ivloop, ivloop0)))
2319 dr0_info = dr_info;
2322 one_misalignment_unknown = true;
2324 /* Check for data refs with unsupportable alignment that
2325 can be peeled. */
2326 enum dr_alignment_support supportable_dr_alignment
2327 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2328 DR_MISALIGNMENT_UNKNOWN);
2329 if (supportable_dr_alignment == dr_unaligned_unsupported)
2331 one_dr_unsupportable = true;
2332 unsupportable_dr_info = dr_info;
2335 if (!first_store && DR_IS_WRITE (dr))
2337 first_store = dr_info;
2338 first_store_same_align_drs = same_align_drs;
2342 else
2344 if (!aligned_access_p (dr_info, vectype))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 "vector alignment may not be reachable\n");
2349 break;
2354 /* Check if we can possibly peel the loop. */
2355 if (!vect_can_advance_ivs_p (loop_vinfo)
2356 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2357 loop_preheader_edge (loop))
2358 || loop->inner
2359 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2360 do_peeling = false;
2362 struct _vect_peel_extended_info peel_for_known_alignment;
2363 struct _vect_peel_extended_info peel_for_unknown_alignment;
2364 struct _vect_peel_extended_info best_peel;
2366 peel_for_unknown_alignment.inside_cost = INT_MAX;
2367 peel_for_unknown_alignment.outside_cost = INT_MAX;
2368 peel_for_unknown_alignment.peel_info.count = 0;
2370 if (do_peeling
2371 && one_misalignment_unknown)
2373 /* Check if the target requires to prefer stores over loads, i.e., if
2374 misaligned stores are more expensive than misaligned loads (taking
2375 drs with same alignment into account). */
2376 unsigned int load_inside_cost = 0;
2377 unsigned int load_outside_cost = 0;
2378 unsigned int store_inside_cost = 0;
2379 unsigned int store_outside_cost = 0;
2380 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2382 stmt_vector_for_cost dummy;
2383 dummy.create (2);
2384 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2385 &load_inside_cost,
2386 &load_outside_cost,
2387 &dummy, &dummy, estimated_npeels);
2388 dummy.release ();
2390 if (first_store)
2392 dummy.create (2);
2393 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2394 &store_inside_cost,
2395 &store_outside_cost,
2396 &dummy, &dummy,
2397 estimated_npeels);
2398 dummy.release ();
2400 else
2402 store_inside_cost = INT_MAX;
2403 store_outside_cost = INT_MAX;
2406 if (load_inside_cost > store_inside_cost
2407 || (load_inside_cost == store_inside_cost
2408 && load_outside_cost > store_outside_cost))
2410 dr0_info = first_store;
2411 dr0_same_align_drs = first_store_same_align_drs;
2412 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2413 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2415 else
2417 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2418 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2421 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2422 prologue_cost_vec.create (2);
2423 epilogue_cost_vec.create (2);
2425 int dummy2;
2426 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2427 (loop_vinfo, estimated_npeels, &dummy2,
2428 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2429 &prologue_cost_vec, &epilogue_cost_vec);
2431 prologue_cost_vec.release ();
2432 epilogue_cost_vec.release ();
2434 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2437 peel_for_unknown_alignment.peel_info.npeel = 0;
2438 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2440 best_peel = peel_for_unknown_alignment;
2442 peel_for_known_alignment.inside_cost = INT_MAX;
2443 peel_for_known_alignment.outside_cost = INT_MAX;
2444 peel_for_known_alignment.peel_info.count = 0;
2445 peel_for_known_alignment.peel_info.dr_info = NULL;
2447 if (do_peeling && one_misalignment_known)
2449 /* Peeling is possible, but there is no data access that is not supported
2450 unless aligned. So we try to choose the best possible peeling from
2451 the hash table. */
2452 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2453 (&peeling_htab, loop_vinfo);
2456 /* Compare costs of peeling for known and unknown alignment. */
2457 if (peel_for_known_alignment.peel_info.dr_info != NULL
2458 && peel_for_unknown_alignment.inside_cost
2459 >= peel_for_known_alignment.inside_cost)
2461 best_peel = peel_for_known_alignment;
2463 /* If the best peeling for known alignment has NPEEL == 0, perform no
2464 peeling at all except if there is an unsupportable dr that we can
2465 align. */
2466 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2467 do_peeling = false;
2470 /* If there is an unsupportable data ref, prefer this over all choices so far
2471 since we'd have to discard a chosen peeling except when it accidentally
2472 aligned the unsupportable data ref. */
2473 if (one_dr_unsupportable)
2474 dr0_info = unsupportable_dr_info;
2475 else if (do_peeling)
2477 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2478 TODO: Use nopeel_outside_cost or get rid of it? */
2479 unsigned nopeel_inside_cost = 0;
2480 unsigned nopeel_outside_cost = 0;
2482 stmt_vector_for_cost dummy;
2483 dummy.create (2);
2484 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2485 &nopeel_outside_cost, &dummy, &dummy, 0);
2486 dummy.release ();
2488 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2489 costs will be recorded. */
2490 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2491 prologue_cost_vec.create (2);
2492 epilogue_cost_vec.create (2);
2494 int dummy2;
2495 nopeel_outside_cost += vect_get_known_peeling_cost
2496 (loop_vinfo, 0, &dummy2,
2497 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2498 &prologue_cost_vec, &epilogue_cost_vec);
2500 prologue_cost_vec.release ();
2501 epilogue_cost_vec.release ();
2503 npeel = best_peel.peel_info.npeel;
2504 dr0_info = best_peel.peel_info.dr_info;
2506 /* If not peeling at all is no more expensive than the best peeling we
2507 have found so far, don't perform any peeling. */
2508 if (nopeel_inside_cost <= best_peel.inside_cost)
2509 do_peeling = false;
2512 if (do_peeling)
2514 stmt_vec_info stmt_info = dr0_info->stmt;
2515 if (known_alignment_for_access_p (dr0_info,
2516 STMT_VINFO_VECTYPE (stmt_info)))
2518 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2519 size_zero_node) < 0;
2520 if (!npeel)
2522 /* Since it's known at compile time, compute the number of
2523 iterations in the peeled loop (the peeling factor) for use in
2524 updating DR_MISALIGNMENT values. The peeling factor is the
2525 vectorization factor minus the misalignment as an element
2526 count. */
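/* Worked example (hypothetical numbers): with 4-byte elements, a
   16-byte target alignment and a known misalignment of 12 bytes,
   NPEEL = (-12 & 15) / 4 = 1 scalar iteration; for a grouped access
   the result is further divided by the group size below. */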
2527 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2528 poly_int64 off = 0;
2529 if (negative)
2530 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2531 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2532 unsigned int mis
2533 = dr_misalignment (dr0_info, vectype, off);
2534 mis = negative ? mis : -mis;
2535 /* If known_alignment_for_access_p then we have set
2536 DR_MISALIGNMENT, which is only done if we know it at compile
2537 time, so it is safe to assume the target alignment is constant. */
2539 unsigned int target_align =
2540 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2541 npeel = ((mis & (target_align - 1))
2542 / vect_get_scalar_dr_size (dr0_info));
2545 /* For interleaved data access every iteration accesses all the
2546 members of the group, therefore we divide the number of iterations
2547 by the group size. */
2548 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2549 npeel /= DR_GROUP_SIZE (stmt_info);
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE, vect_location,
2553 "Try peeling by %d\n", npeel);
2556 /* Ensure that all datarefs can be vectorized after the peel. */
2557 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2558 do_peeling = false;
2560 /* Check if all datarefs are supportable and log. */
2561 if (do_peeling
2562 && npeel == 0
2563 && known_alignment_for_access_p (dr0_info,
2564 STMT_VINFO_VECTYPE (stmt_info)))
2565 return opt_result::success ();
2567 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2568 if (do_peeling)
2570 unsigned max_allowed_peel
2571 = param_vect_max_peeling_for_alignment;
2572 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2573 max_allowed_peel = 0;
2574 if (max_allowed_peel != (unsigned)-1)
2576 unsigned max_peel = npeel;
2577 if (max_peel == 0)
2579 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2580 unsigned HOST_WIDE_INT target_align_c;
2581 if (target_align.is_constant (&target_align_c))
2582 max_peel =
2583 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2584 else
2586 do_peeling = false;
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_NOTE, vect_location,
2589 "Disable peeling, max peels set and vector"
2590 " alignment unknown\n");
2593 if (max_peel > max_allowed_peel)
2595 do_peeling = false;
2596 if (dump_enabled_p ())
2597 dump_printf_loc (MSG_NOTE, vect_location,
2598 "Disable peeling, max peels reached: %d\n", max_peel);
2603 /* Cost model #2 - if peeling may result in a remaining loop not
2604 iterating enough to be vectorized then do not peel. Since this
2605 is a cost heuristic rather than a correctness decision, use the
2606 most likely runtime value for variable vectorization factors. */
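/* For instance (hypothetical numbers): with an assumed VF of 8 and a
   runtime-computed peel amount (NPEEL == 0, so MAX_PEEL = 7), a loop
   known to iterate only 12 < 8 + 7 times is not worth peeling. */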
2607 if (do_peeling
2608 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2611 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2612 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2613 < assumed_vf + max_peel)
2614 do_peeling = false;
2617 if (do_peeling)
2619 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2620 If the misalignment of DR_i is identical to that of dr0 then set
2621 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2622 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2623 by the peeling factor times the element size of DR_i (MOD the
2624 vectorization factor times the size). Otherwise, the
2625 misalignment of DR_i must be set to unknown. */
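/* Worked example (hypothetical numbers): peeling NPEEL = 2 iterations
   for a DR_i with 4-byte elements, old misalignment 8 and a 16-byte
   vector alignment gives (8 + 2*4) mod 16 = 0, so DR_i also becomes
   aligned. */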
2626 FOR_EACH_VEC_ELT (datarefs, i, dr)
2627 if (dr != dr0_info->dr)
2629 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2630 if (!vect_relevant_for_alignment_p (dr_info))
2631 continue;
2633 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2636 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2637 if (npeel)
2638 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2639 else
2640 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2641 SET_DR_MISALIGNMENT (dr0_info,
2642 vect_dr_misalign_for_aligned_access (dr0_info));
2643 if (dump_enabled_p ())
2645 dump_printf_loc (MSG_NOTE, vect_location,
2646 "Alignment of access forced using peeling.\n");
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "Peeling for alignment will be applied.\n");
2651 /* The inside-loop cost will be accounted for in vectorizable_load
2652 and vectorizable_store correctly with adjusted alignments.
2653 Drop the body_cst_vec on the floor here. */
2654 return opt_result::success ();
2658 /* (2) Versioning to force alignment. */
2660 /* Try versioning if:
2661 1) the loop is optimized for speed and the cost model is not cheap, and
2662 2) there is at least one unsupported misaligned data ref with an unknown
2663 misalignment, and
2664 3) all misaligned data refs with a known misalignment are supported, and
2665 4) the number of runtime alignment checks is within reason. */
2667 do_versioning
2668 = (optimize_loop_nest_for_speed_p (loop)
2669 && !loop->inner /* FORNOW */
2670 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2672 if (do_versioning)
2674 FOR_EACH_VEC_ELT (datarefs, i, dr)
2676 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2677 if (!vect_relevant_for_alignment_p (dr_info))
2678 continue;
2680 stmt_vec_info stmt_info = dr_info->stmt;
2681 if (STMT_VINFO_STRIDED_P (stmt_info))
2683 do_versioning = false;
2684 break;
2687 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2688 bool negative = tree_int_cst_compare (DR_STEP (dr),
2689 size_zero_node) < 0;
2690 poly_int64 off = 0;
2691 if (negative)
2692 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2693 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2694 int misalignment;
2695 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2696 continue;
2698 enum dr_alignment_support supportable_dr_alignment
2699 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2700 misalignment);
2701 if (supportable_dr_alignment == dr_unaligned_unsupported)
2703 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2704 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2705 >= (unsigned) param_vect_max_version_for_alignment_checks))
2707 do_versioning = false;
2708 break;
2711 /* At present we don't support versioning for alignment
2712 with variable VF, since there's no guarantee that the
2713 VF is a power of two. We could relax this if we added
2714 a way of enforcing a power-of-two size. */
2715 unsigned HOST_WIDE_INT size;
2716 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2718 do_versioning = false;
2719 break;
2722 /* Forcing alignment in the first iteration is no good if
2723 we don't keep it across iterations. For now, just disable
2724 versioning in this case.
2725 ??? We could actually unroll the loop to achieve the required
2726 overall step alignment, and forcing the alignment could be
2727 done by doing some iterations of the non-vectorized loop. */
2728 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2729 * DR_STEP_ALIGNMENT (dr),
2730 DR_TARGET_ALIGNMENT (dr_info)))
2732 do_versioning = false;
2733 break;
2736 /* The rightmost bits of an aligned address must be zeros.
2737 Construct the mask needed for this test. For example,
2738 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2739 mask must be 15 = 0xf. */
2740 int mask = size - 1;
2742 /* FORNOW: use the same mask to test all potentially unaligned
2743 references in the loop. */
2744 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2745 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2747 do_versioning = false;
2748 break;
2751 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2752 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2756 /* Versioning requires at least one misaligned data reference. */
2757 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2758 do_versioning = false;
2759 else if (!do_versioning)
2760 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2763 if (do_versioning)
2765 const vec<stmt_vec_info> &may_misalign_stmts
2766 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2767 stmt_vec_info stmt_info;
2769 /* It can now be assumed that the data references in the statements
2770 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2771 of the loop being vectorized. */
2772 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2774 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2775 SET_DR_MISALIGNMENT (dr_info,
2776 vect_dr_misalign_for_aligned_access (dr_info));
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_NOTE, vect_location,
2779 "Alignment of access forced using versioning.\n");
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_NOTE, vect_location,
2784 "Versioning for alignment will be applied.\n");
2786 /* Peeling and versioning can't be done together at this time. */
2787 gcc_assert (! (do_peeling && do_versioning));
2789 return opt_result::success ();
2792 /* This point is reached if neither peeling nor versioning is being done. */
2793 gcc_assert (! (do_peeling || do_versioning));
2795 return opt_result::success ();
2799 /* Function vect_analyze_data_refs_alignment
2801 Analyze the alignment of the data-references in the loop.
2802 Return FALSE if a data reference is found that cannot be vectorized. */
2804 opt_result
2805 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2807 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2809 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2810 struct data_reference *dr;
2811 unsigned int i;
2813 vect_record_base_alignments (loop_vinfo);
2814 FOR_EACH_VEC_ELT (datarefs, i, dr)
2816 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2817 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2819 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2820 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2821 continue;
2822 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2823 STMT_VINFO_VECTYPE (dr_info->stmt));
2827 return opt_result::success ();
2831 /* Analyze alignment of DRs of stmts in NODE. */
2833 static bool
2834 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2836 /* Alignment is maintained in the first element of the group. */
2837 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2838 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2839 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2840 tree vectype = SLP_TREE_VECTYPE (node);
2841 poly_uint64 vector_alignment
2842 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2843 BITS_PER_UNIT);
2844 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2845 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2846 /* Re-analyze alignment when we're facing a vectorization with a bigger
2847 alignment requirement. */
2848 else if (known_lt (dr_info->target_alignment, vector_alignment))
2850 poly_uint64 old_target_alignment = dr_info->target_alignment;
2851 int old_misalignment = dr_info->misalignment;
2852 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2853 /* But keep knowledge about a smaller alignment. */
2854 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2855 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2857 dr_info->target_alignment = old_target_alignment;
2858 dr_info->misalignment = old_misalignment;
2861 /* If we ever face unordered target alignments, the first one wins in terms
2862 of the analysis and the other will become unknown in dr_misalignment. */
2863 return true;
2866 /* Function vect_slp_analyze_instance_alignment
2868 Analyze the alignment of the data-references in the SLP instance.
2869 Return FALSE if a data reference is found that cannot be vectorized. */
2871 bool
2872 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2873 slp_instance instance)
2875 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2877 slp_tree node;
2878 unsigned i;
2879 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2880 if (! vect_slp_analyze_node_alignment (vinfo, node))
2881 return false;
2883 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2884 && ! vect_slp_analyze_node_alignment
2885 (vinfo, SLP_INSTANCE_TREE (instance)))
2886 return false;
2888 return true;
2892 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2893 accesses of legal size, step, etc. Detect gaps, single element
2894 interleaving, and other special cases. Set grouped access info.
2895 Collect groups of strided stores for further use in SLP analysis.
2896 Worker for vect_analyze_group_access. */
2898 static bool
2899 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2901 data_reference *dr = dr_info->dr;
2902 tree step = DR_STEP (dr);
2903 tree scalar_type = TREE_TYPE (DR_REF (dr));
2904 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2905 stmt_vec_info stmt_info = dr_info->stmt;
2906 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2907 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2908 HOST_WIDE_INT dr_step = -1;
2909 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2910 bool slp_impossible = false;
2912 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2913 size of the interleaving group (including gaps). */
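/* For example (hypothetical numbers): a step of 12 bytes over 4-byte
   elements gives GROUPSIZE = 3, i.e. each iteration covers three
   element slots, whether or not all of them are actually accessed. */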
2914 if (tree_fits_shwi_p (step))
2916 dr_step = tree_to_shwi (step);
2917 /* Check that STEP is a multiple of type size. Otherwise there is
2918 a non-element-sized gap at the end of the group which we
2919 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2920 ??? As we can handle non-constant step fine here we should
2921 simply remove uses of DR_GROUP_GAP between the last and first
2922 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2923 simply not include that gap. */
2924 if ((dr_step % type_size) != 0)
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE, vect_location,
2928 "Step %T is not a multiple of the element size"
2929 " for %T\n",
2930 step, DR_REF (dr));
2931 return false;
2933 groupsize = absu_hwi (dr_step) / type_size;
2935 else
2936 groupsize = 0;
2938 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2939 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2941 /* Check if this DR is part of an interleaving group, and is a single
2942 element of the group that is accessed in the loop. */
2944 /* Gaps are supported only for loads. STEP must be a multiple of the type
2945 size. */
2946 if (DR_IS_READ (dr)
2947 && (dr_step % type_size) == 0
2948 && groupsize > 0
2949 /* This could be UINT_MAX but as we are generating code in a very
2950 inefficient way we have to cap earlier.
2951 See PR91403 for example. */
2952 && groupsize <= 4096)
2954 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2955 DR_GROUP_SIZE (stmt_info) = groupsize;
2956 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2957 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_NOTE, vect_location,
2959 "Detected single element interleaving %T"
2960 " step %T\n",
2961 DR_REF (dr), step);
2963 return true;
2966 if (dump_enabled_p ())
2967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2968 "not consecutive access %G", stmt_info->stmt);
2970 if (bb_vinfo)
2972 /* Mark the statement as unvectorizable. */
2973 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2974 return true;
2977 if (dump_enabled_p ())
2978 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2979 STMT_VINFO_STRIDED_P (stmt_info) = true;
2980 return true;
2983 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2985 /* First stmt in the interleaving chain. Check the chain. */
2986 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2987 struct data_reference *data_ref = dr;
2988 unsigned int count = 1;
2989 tree prev_init = DR_INIT (data_ref);
2990 HOST_WIDE_INT diff, gaps = 0;
2992 /* By construction, all group members have INTEGER_CST DR_INITs. */
2993 while (next)
2995 /* We never have the same DR multiple times. */
2996 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2997 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2999 data_ref = STMT_VINFO_DATA_REF (next);
3001 /* All group members have the same STEP by construction. */
3002 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3004 /* Check that the distance between two accesses is equal to the type
3005 size. Otherwise, we have gaps. */
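/* For example (hypothetical numbers): with 4-byte elements, inits of
   0 and 8 for consecutive group members give DIFF = 2, i.e. a gap of
   one element between the two accesses. */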
3006 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3007 - TREE_INT_CST_LOW (prev_init)) / type_size;
3008 if (diff < 1 || diff > UINT_MAX)
3010 /* For artificial testcases with array accesses with large
3011 constant indices we can run into overflow issues which
3012 can end up fooling the groupsize constraint below so
3013 check the individual gaps (which are represented as
3014 unsigned int) as well. */
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "interleaved access with gap larger "
3018 "than representable\n");
3019 return false;
3021 if (diff != 1)
3023 /* FORNOW: SLP of accesses with gaps is not supported. */
3024 slp_impossible = true;
3025 if (DR_IS_WRITE (data_ref))
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "interleaved store with gaps\n");
3030 return false;
3033 gaps += diff - 1;
3036 last_accessed_element += diff;
3038 /* Store the gap from the previous member of the group. If there is no
3039 gap in the access, DR_GROUP_GAP is always 1. */
3040 DR_GROUP_GAP (next) = diff;
3042 prev_init = DR_INIT (data_ref);
3043 next = DR_GROUP_NEXT_ELEMENT (next);
3044 /* Count the number of data-refs in the chain. */
3045 count++;
3048 if (groupsize == 0)
3049 groupsize = count + gaps;
3051 /* This could be UINT_MAX but as we are generating code in a very
3052 inefficient way we have to cap earlier. See PR78699 for example. */
3053 if (groupsize > 4096)
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3057 "group is too large\n");
3058 return false;
3061 /* Check that the size of the interleaving is equal to count for stores,
3062 i.e., that there are no gaps. */
3063 if (groupsize != count
3064 && !DR_IS_READ (dr))
3066 groupsize = count;
3067 STMT_VINFO_STRIDED_P (stmt_info) = true;
3070 /* If there is a gap after the last load in the group it is the
3071 difference between the groupsize and the last accessed
3072 element.
3073 When there is no gap, this difference should be 0. */
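/* For example (hypothetical numbers): a group of 4-byte loads at
   inits 0, 4 and 8 under a 16-byte step has GROUPSIZE = 4 and
   LAST_ACCESSED_ELEMENT = 3, leaving DR_GROUP_GAP = 1 after the last
   load. */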
3074 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3076 DR_GROUP_SIZE (stmt_info) = groupsize;
3077 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 "Detected interleaving ");
3081 if (DR_IS_READ (dr))
3082 dump_printf (MSG_NOTE, "load ");
3083 else if (STMT_VINFO_STRIDED_P (stmt_info))
3084 dump_printf (MSG_NOTE, "strided store ");
3085 else
3086 dump_printf (MSG_NOTE, "store ");
3087 dump_printf (MSG_NOTE, "of size %u\n",
3088 (unsigned)groupsize);
3089 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3090 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3091 while (next)
3093 if (DR_GROUP_GAP (next) != 1)
3094 dump_printf_loc (MSG_NOTE, vect_location,
3095 "\t<gap of %d elements>\n",
3096 DR_GROUP_GAP (next) - 1);
3097 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3098 next = DR_GROUP_NEXT_ELEMENT (next);
3100 if (DR_GROUP_GAP (stmt_info) != 0)
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 "\t<gap of %d elements>\n",
3103 DR_GROUP_GAP (stmt_info));
3106 /* SLP: create an SLP data structure for every interleaving group of
3107 stores for further analysis in vect_analyze_slp. */
3108 if (DR_IS_WRITE (dr) && !slp_impossible)
3110 if (loop_vinfo)
3111 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3112 if (bb_vinfo)
3113 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3117 return true;
3120 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3121 accesses of legal size, step, etc. Detect gaps, single element
3122 interleaving, and other special cases. Set grouped access info.
3123 Collect groups of strided stores for further use in SLP analysis. */
3125 static bool
3126 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3128 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3130 /* Dissolve the group if present. */
3131 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3132 while (stmt_info)
3134 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3135 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3136 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3137 stmt_info = next;
3139 return false;
3141 return true;
3144 /* Analyze the access pattern of the data-reference DR_INFO.
3145 In case of non-consecutive accesses call vect_analyze_group_access() to
3146 analyze groups of accesses. */
3148 static bool
3149 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3151 data_reference *dr = dr_info->dr;
3152 tree step = DR_STEP (dr);
3153 tree scalar_type = TREE_TYPE (DR_REF (dr));
3154 stmt_vec_info stmt_info = dr_info->stmt;
3155 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3156 class loop *loop = NULL;
3158 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3159 return true;
3161 if (loop_vinfo)
3162 loop = LOOP_VINFO_LOOP (loop_vinfo);
3164 if (loop_vinfo && !step)
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3168 "bad data-ref access in loop\n");
3169 return false;
3172 /* Allow loads with zero step in inner-loop vectorization. */
3173 if (loop_vinfo && integer_zerop (step))
3175 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3176 if (!nested_in_vect_loop_p (loop, stmt_info))
3177 return DR_IS_READ (dr);
3178 /* Allow references with zero step for outer loops marked
3179 with pragma omp simd only - it guarantees absence of
3180 loop-carried dependencies between inner loop iterations. */
3181 if (loop->safelen < 2)
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "zero step in inner loop of nest\n");
3186 return false;
3190 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3192 /* Interleaved accesses are not yet supported within outer-loop
3193 vectorization for references in the inner-loop. */
3194 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3196 /* For the rest of the analysis we use the outer-loop step. */
3197 step = STMT_VINFO_DR_STEP (stmt_info);
3198 if (integer_zerop (step))
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_NOTE, vect_location,
3202 "zero step in outer loop.\n");
3203 return DR_IS_READ (dr);
3207 /* Consecutive? */
3208 if (TREE_CODE (step) == INTEGER_CST)
3210 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3211 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3212 || (dr_step < 0
3213 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3215 /* Mark that it is not interleaving. */
3216 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3217 return true;
3221 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE, vect_location,
3225 "grouped access in outer loop.\n");
3226 return false;
3230 /* Assume this is a DR handled by non-constant strided load case. */
3231 if (TREE_CODE (step) != INTEGER_CST)
3232 return (STMT_VINFO_STRIDED_P (stmt_info)
3233 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3234 || vect_analyze_group_access (vinfo, dr_info)));
3236 /* Not a consecutive access - check if it's part of an interleaving group. */
3237 return vect_analyze_group_access (vinfo, dr_info);
3240 /* Compare two data-references DRA and DRB to group them into chunks
3241 suitable for grouping. */
3243 static int
3244 dr_group_sort_cmp (const void *dra_, const void *drb_)
3246 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3247 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3248 data_reference_p dra = dra_info->dr;
3249 data_reference_p drb = drb_info->dr;
3250 int cmp;
3252 /* Stabilize sort. */
3253 if (dra == drb)
3254 return 0;
3256 /* DRs with different group IDs never belong to the same group. */
3257 if (dra_info->group != drb_info->group)
3258 return dra_info->group < drb_info->group ? -1 : 1;
3260 /* Ordering of DRs according to base. */
3261 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3262 DR_BASE_ADDRESS (drb));
3263 if (cmp != 0)
3264 return cmp;
3266 /* And according to DR_OFFSET. */
3267 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3268 if (cmp != 0)
3269 return cmp;
3271 /* Put reads before writes. */
3272 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3273 return DR_IS_READ (dra) ? -1 : 1;
3275 /* Then sort after access size. */
3276 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3277 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3278 if (cmp != 0)
3279 return cmp;
3281 /* And after step. */
3282 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3283 if (cmp != 0)
3284 return cmp;
3286 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3287 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3288 if (cmp == 0)
3289 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3290 return cmp;
3293 /* If OP is the result of a conversion, return the unconverted value,
3294 otherwise return null. */
3296 static tree
3297 strip_conversion (tree op)
3299 if (TREE_CODE (op) != SSA_NAME)
3300 return NULL_TREE;
3301 gimple *stmt = SSA_NAME_DEF_STMT (op);
3302 if (!is_gimple_assign (stmt)
3303 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3304 return NULL_TREE;
3305 return gimple_assign_rhs1 (stmt);
3308 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3309 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3310 be grouped in SLP mode. */
3312 static bool
3313 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3314 bool allow_slp_p)
3316 if (gimple_assign_single_p (stmt1_info->stmt))
3317 return gimple_assign_single_p (stmt2_info->stmt);
3319 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3320 if (call1 && gimple_call_internal_p (call1))
3322 /* Check for two masked loads or two masked stores. */
3323 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3324 if (!call2 || !gimple_call_internal_p (call2))
3325 return false;
3326 internal_fn ifn = gimple_call_internal_fn (call1);
3327 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3328 return false;
3329 if (ifn != gimple_call_internal_fn (call2))
3330 return false;
3332 /* Check that the masks are the same. Cope with casts of masks,
3333 like those created by build_mask_conversion. */
3334 tree mask1 = gimple_call_arg (call1, 2);
3335 tree mask2 = gimple_call_arg (call2, 2);
3336 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3338 mask1 = strip_conversion (mask1);
3339 if (!mask1)
3340 return false;
3341 mask2 = strip_conversion (mask2);
3342 if (!mask2)
3343 return false;
3344 if (!operand_equal_p (mask1, mask2, 0))
3345 return false;
3347 return true;
3350 return false;
3353 /* Function vect_analyze_data_ref_accesses.
3355 Analyze the access pattern of all the data references in the loop.
3357 FORNOW: the only access pattern that is considered vectorizable is a
3358 simple step 1 (consecutive) access.
3360 FORNOW: handle only arrays and pointer accesses. */
3362 opt_result
3363 vect_analyze_data_ref_accesses (vec_info *vinfo,
3364 vec<int> *dataref_groups)
3366 unsigned int i;
3367 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3369 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3371 if (datarefs.is_empty ())
3372 return opt_result::success ();
3374 /* Sort the array of datarefs to make building the interleaving chains
3375 linear. Don't modify the original vector's order, it is needed for
3376 determining what dependencies are reversed. */
3377 vec<dr_vec_info *> datarefs_copy;
3378 datarefs_copy.create (datarefs.length ());
3379 for (unsigned i = 0; i < datarefs.length (); i++)
3381 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3382 /* If the caller computed DR grouping use that, otherwise group by
3383 basic blocks. */
3384 if (dataref_groups)
3385 dr_info->group = (*dataref_groups)[i];
3386 else
3387 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3388 datarefs_copy.quick_push (dr_info);
3390 datarefs_copy.qsort (dr_group_sort_cmp);
3391 hash_set<stmt_vec_info> to_fixup;
3393 /* Build the interleaving chains. */
3394 for (i = 0; i < datarefs_copy.length () - 1;)
3396 dr_vec_info *dr_info_a = datarefs_copy[i];
3397 data_reference_p dra = dr_info_a->dr;
3398 int dra_group_id = dr_info_a->group;
3399 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3400 stmt_vec_info lastinfo = NULL;
3401 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3402 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3404 ++i;
3405 continue;
3407 for (i = i + 1; i < datarefs_copy.length (); ++i)
3409 dr_vec_info *dr_info_b = datarefs_copy[i];
3410 data_reference_p drb = dr_info_b->dr;
3411 int drb_group_id = dr_info_b->group;
3412 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3413 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3414 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3415 break;
3417 /* ??? Imperfect sorting (non-compatible types, non-modulo
3418 accesses, same accesses) can lead to a group being artificially
3419 split here as we don't just skip over those. If it really
3420 matters we can push those to a worklist and re-iterate
3421 over them. Then we can just skip ahead to the next DR here. */
3423 /* DRs in a different DR group should not be put into the same
3424 interleaving group. */
3425 if (dra_group_id != drb_group_id)
3426 break;
3428 /* Check that the data-refs have the same first location (except init)
3429 and they are both either store or load (not load and store,
3430 not masked loads or stores). */
3431 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3432 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3433 DR_BASE_ADDRESS (drb)) != 0
3434 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3435 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3436 break;
3438 /* Check that the data-refs have the same constant size. */
3439 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3440 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3441 if (!tree_fits_uhwi_p (sza)
3442 || !tree_fits_uhwi_p (szb)
3443 || !tree_int_cst_equal (sza, szb))
3444 break;
3446 /* Check that the data-refs have the same step. */
3447 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3448 break;
3450 /* Check the types are compatible.
3451 ??? We don't distinguish this during sorting. */
3452 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3453 TREE_TYPE (DR_REF (drb))))
3454 break;
3456 /* Check that the DR_INITs are compile-time constants. */
3457 if (!tree_fits_shwi_p (DR_INIT (dra))
3458 || !tree_fits_shwi_p (DR_INIT (drb)))
3459 break;
3461 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3462 just hold extra information. */
3463 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3464 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3465 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3466 break;
3468 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3469 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3470 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3471 HOST_WIDE_INT init_prev
3472 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3473 gcc_assert (init_a <= init_b
3474 && init_a <= init_prev
3475 && init_prev <= init_b);
3477 /* Do not place the same access in the interleaving chain twice. */
3478 if (init_b == init_prev)
3480 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3481 < gimple_uid (DR_STMT (drb)));
3482 /* Simply link in duplicates and fix up the chain below. */
3484 else
3486 /* If init_b == init_a + the size of the type * k, we have an
3487 interleaving, and DRA is accessed before DRB. */
3488 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3489 if (type_size_a == 0
3490 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3491 % type_size_a != 0))
3492 break;
3494 /* If we have a store, the accesses must be adjacent. This splits
3495 groups into chunks we support (we don't support vectorization
3496 of stores with gaps). */
3497 if (!DR_IS_READ (dra)
3498 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3499 != type_size_a))
3500 break;
3502 /* If the step (when constant and nonzero) is smaller than the
3503 difference between the data-refs' inits, this splits groups into
3504 suitable sizes. */
3505 if (tree_fits_shwi_p (DR_STEP (dra)))
3507 unsigned HOST_WIDE_INT step
3508 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3509 if (step != 0
3510 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3511 break;
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 DR_IS_READ (dra)
3518 ? "Detected interleaving load %T and %T\n"
3519 : "Detected interleaving store %T and %T\n",
3520 DR_REF (dra), DR_REF (drb));
3522 /* Link the found element into the group list. */
3523 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3525 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3526 lastinfo = stmtinfo_a;
3528 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3529 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3530 lastinfo = stmtinfo_b;
3532 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3533 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3535 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3536 dump_printf_loc (MSG_NOTE, vect_location,
3537 "Load suitable for SLP vectorization only.\n");
3539 if (init_b == init_prev
3540 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3541 && dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "Queuing group with duplicate access for fixup\n");
3547 /* Fix up groups with duplicate entries by splitting them. */
3548 while (1)
3550 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3551 if (!(it != to_fixup.end ()))
3552 break;
3553 stmt_vec_info grp = *it;
3554 to_fixup.remove (grp);
3556 /* Find the earliest duplicate group member. */
3557 unsigned first_duplicate = -1u;
3558 stmt_vec_info next, g = grp;
3559 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3561 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3562 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3563 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3564 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3565 g = next;
3567 if (first_duplicate == -1U)
3568 continue;
3570 /* Then move all stmts after the first duplicate to a new group.
3571 Note this is a heuristic, but one with the property that the
3572 group GRP is fixed up completely. */
3573 g = grp;
3574 stmt_vec_info newgroup = NULL, ng = grp;
3575 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3577 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3579 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3580 if (!newgroup)
3581 newgroup = next;
3582 else
3583 DR_GROUP_NEXT_ELEMENT (ng) = next;
3584 ng = next;
3585 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3587 else
3588 g = DR_GROUP_NEXT_ELEMENT (g);
3590 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3592 /* Fix up the new group, which may still contain duplicates. */
3593 to_fixup.add (newgroup);
3596 dr_vec_info *dr_info;
3597 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3599 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3600 && !vect_analyze_data_ref_access (vinfo, dr_info))
3602 if (dump_enabled_p ())
3603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3604 "not vectorized: complicated access pattern.\n");
3606 if (is_a <bb_vec_info> (vinfo))
3608 /* Mark the statement as not vectorizable. */
3609 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3610 continue;
3612 else
3614 datarefs_copy.release ();
3615 return opt_result::failure_at (dr_info->stmt->stmt,
3616 "not vectorized:"
3617 " complicated access pattern.\n");
3622 datarefs_copy.release ();
3623 return opt_result::success ();
3626 /* Function vect_vfa_segment_size.
3628 Input:
3629 DR_INFO: The data reference.
3630 LENGTH_FACTOR: segment length to consider.
3632 Return a value suitable for the dr_with_seg_len::seg_len field.
3633 This is the "distance travelled" by the pointer from the first
3634 iteration in the segment to the last. Note that it does not include
3635 the size of the access; in effect it only describes the first byte. */
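/* For example (illustrative numbers only): with DR_STEP equal to 4 bytes
   and LENGTH_FACTOR equal to 8, the function below returns (8 - 1) * 4 = 28,
   i.e. the pointer advances 28 bytes from the first to the last iteration
   of the segment; the bytes actually touched by each access are accounted
   for separately by vect_vfa_access_size.  */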
3637 static tree
3638 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3640 length_factor = size_binop (MINUS_EXPR,
3641 fold_convert (sizetype, length_factor),
3642 size_one_node);
3643 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3644 length_factor);
3647 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3648 gives the worst-case number of bytes covered by the segment. */
3650 static unsigned HOST_WIDE_INT
3651 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3653 stmt_vec_info stmt_vinfo = dr_info->stmt;
3654 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3655 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3656 unsigned HOST_WIDE_INT access_size = ref_size;
3657 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3659 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3660 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3662 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3663 int misalignment;
3664 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3665 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3666 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3667 == dr_explicit_realign_optimized))
3669 /* We might access a full vector's worth. */
3670 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3672 return access_size;
3675 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3676 describes. */
3678 static unsigned int
3679 vect_vfa_align (dr_vec_info *dr_info)
3681 return dr_alignment (dr_info->dr);
3684 /* Function vect_no_alias_p.
3686 Given data references A and B with equal base and offset, see whether
3687 the alias relation can be decided at compilation time. Return 1 if
3688 it can and the references alias, 0 if it can and the references do
3689 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3690 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3691 of dr_with_seg_len::{seg_len,access_size} for A and B. */
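/* Worked example (illustrative values only): for two references with the
   same base, DR_INITs 0 and 32, positive steps, segment lengths 16 and 16
   and access sizes 4 and 4, the covered ranges are [0, 20) and [32, 52);
   those can never overlap, so the function returns 0.  */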
3693 static int
3694 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3695 tree segment_length_a, tree segment_length_b,
3696 unsigned HOST_WIDE_INT access_size_a,
3697 unsigned HOST_WIDE_INT access_size_b)
3699 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3700 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3701 poly_uint64 const_length_a;
3702 poly_uint64 const_length_b;
3704 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3705 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3706 [a, a+12) */
3707 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3709 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3710 offset_a -= const_length_a;
3712 else
3713 const_length_a = tree_to_poly_uint64 (segment_length_a);
3714 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3716 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3717 offset_b -= const_length_b;
3719 else
3720 const_length_b = tree_to_poly_uint64 (segment_length_b);
3722 const_length_a += access_size_a;
3723 const_length_b += access_size_b;
3725 if (ranges_known_overlap_p (offset_a, const_length_a,
3726 offset_b, const_length_b))
3727 return 1;
3729 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3730 offset_b, const_length_b))
3731 return 0;
3733 return -1;
3736 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3737 in DDR is >= VF. */
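/* For instance (hypothetical situation): if the only recorded distance
   vector for a pair of possibly-overlapping references has distance 8 at
   LOOP_DEPTH and the chosen VF is 4, the dependence is still honoured by
   the vector loop, so the caller can drop the runtime alias check for
   this DDR.  */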
3739 static bool
3740 dependence_distance_ge_vf (data_dependence_relation *ddr,
3741 unsigned int loop_depth, poly_uint64 vf)
3743 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3744 || DDR_NUM_DIST_VECTS (ddr) == 0)
3745 return false;
3747 /* If the dependence is exact, we should have limited the VF instead. */
3748 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3750 unsigned int i;
3751 lambda_vector dist_v;
3752 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3754 HOST_WIDE_INT dist = dist_v[loop_depth];
3755 if (dist != 0
3756 && !(dist > 0 && DDR_REVERSED_P (ddr))
3757 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3758 return false;
3761 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_NOTE, vect_location,
3763 "dependence distance between %T and %T is >= VF\n",
3764 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3766 return true;
3769 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3771 static void
3772 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3774 dump_printf (dump_kind, "%s (%T) >= ",
3775 lower_bound.unsigned_p ? "unsigned" : "abs",
3776 lower_bound.expr);
3777 dump_dec (dump_kind, lower_bound.min_value);
3780 /* Record that the vectorized loop requires the vec_lower_bound described
3781 by EXPR, UNSIGNED_P and MIN_VALUE. */
3783 static void
3784 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3785 poly_uint64 min_value)
3787 vec<vec_lower_bound> &lower_bounds
3788 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3789 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3790 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3792 unsigned_p &= lower_bounds[i].unsigned_p;
3793 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3794 if (lower_bounds[i].unsigned_p != unsigned_p
3795 || maybe_lt (lower_bounds[i].min_value, min_value))
3797 lower_bounds[i].unsigned_p = unsigned_p;
3798 lower_bounds[i].min_value = min_value;
3799 if (dump_enabled_p ())
3801 dump_printf_loc (MSG_NOTE, vect_location,
3802 "updating run-time check to ");
3803 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3804 dump_printf (MSG_NOTE, "\n");
3807 return;
3810 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3811 if (dump_enabled_p ())
3813 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3814 dump_lower_bound (MSG_NOTE, lower_bound);
3815 dump_printf (MSG_NOTE, "\n");
3817 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3820 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3821 will span fewer than GAP bytes. */
3823 static bool
3824 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3825 poly_int64 gap)
3827 stmt_vec_info stmt_info = dr_info->stmt;
3828 HOST_WIDE_INT count
3829 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3830 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3831 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3832 return (estimated_poly_value (gap)
3833 <= count * vect_get_scalar_dr_size (dr_info));
3836 /* Return true if we know that there is no alias between DR_INFO_A and
3837 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3838 When returning true, set *LOWER_BOUND_OUT to this N. */
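/* Worked example (illustrative): for two int accesses with the same base,
   offset and step and with DR_INITs 0 and 4, the combined bytes spanned
   are 4 + 4 = 8, so *LOWER_BOUND_OUT is set to 8 and there is no alias
   whenever abs (DR_STEP) >= 8.  */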
3840 static bool
3841 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3842 poly_uint64 *lower_bound_out)
3844 /* Check that there is a constant gap of known sign between DR_A
3845 and DR_B. */
3846 data_reference *dr_a = dr_info_a->dr;
3847 data_reference *dr_b = dr_info_b->dr;
3848 poly_int64 init_a, init_b;
3849 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3850 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3851 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3852 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3853 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3854 || !ordered_p (init_a, init_b))
3855 return false;
3857 /* Sort DR_A and DR_B by the address they access. */
3858 if (maybe_lt (init_b, init_a))
3860 std::swap (init_a, init_b);
3861 std::swap (dr_info_a, dr_info_b);
3862 std::swap (dr_a, dr_b);
3865 /* If the two accesses could be dependent within a scalar iteration,
3866 make sure that we'd retain their order. */
3867 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3868 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3869 return false;
3871 /* There is no alias if abs (DR_STEP) is greater than or equal to
3872 the bytes spanned by the combination of the two accesses. */
3873 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3874 return true;
3877 /* Function vect_prune_runtime_alias_test_list.
3879 Prune a list of ddrs to be tested at run-time by versioning for alias.
3880 Merge several alias checks into one if possible.
3881 Return FALSE if the resulting list of ddrs is longer than allowed by
3882 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
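/* Sketch of the idea (hypothetical code, illustrative only): for

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   vectorization is only valid if the accessed segments of P and Q do not
   overlap, so a single segment-overlap check between the two is queued
   for loop versioning, subject to the cost-model and count limits handled
   below.  */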
3884 opt_result
3885 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3887 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3888 hash_set <tree_pair_hash> compared_objects;
3890 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3891 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3892 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3893 const vec<vec_object_pair> &check_unequal_addrs
3894 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3895 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3896 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3898 ddr_p ddr;
3899 unsigned int i;
3900 tree length_factor;
3902 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3904 /* Step values are irrelevant for aliasing if the number of vector
3905 iterations is equal to the number of scalar iterations (which can
3906 happen for fully-SLP loops). */
3907 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3909 if (!vf_one_p)
3911 /* Convert the checks for nonzero steps into bound tests. */
3912 tree value;
3913 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3914 vect_check_lower_bound (loop_vinfo, value, true, 1);
3917 if (may_alias_ddrs.is_empty ())
3918 return opt_result::success ();
3920 comp_alias_ddrs.create (may_alias_ddrs.length ());
3922 unsigned int loop_depth
3923 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3924 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3926 /* First, we collect all data ref pairs for aliasing checks. */
3927 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3929 poly_uint64 lower_bound;
3930 tree segment_length_a, segment_length_b;
3931 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3932 unsigned int align_a, align_b;
3934 /* Ignore the alias if the VF we chose ended up being no greater
3935 than the dependence distance. */
3936 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3937 continue;
3939 if (DDR_OBJECT_A (ddr))
3941 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3942 if (!compared_objects.add (new_pair))
3944 if (dump_enabled_p ())
3945 dump_printf_loc (MSG_NOTE, vect_location,
3946 "checking that %T and %T"
3947 " have different addresses\n",
3948 new_pair.first, new_pair.second);
3949 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3951 continue;
3954 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3955 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3957 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3958 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3960 bool preserves_scalar_order_p
3961 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3962 bool ignore_step_p
3963 = (vf_one_p
3964 && (preserves_scalar_order_p
3965 || operand_equal_p (DR_STEP (dr_info_a->dr),
3966 DR_STEP (dr_info_b->dr))));
3968 /* Skip the pair if inter-iteration dependencies are irrelevant
3969 and intra-iteration dependencies are guaranteed to be honored. */
3970 if (ignore_step_p
3971 && (preserves_scalar_order_p
3972 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3973 &lower_bound)))
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_NOTE, vect_location,
3977 "no need for alias check between "
3978 "%T and %T when VF is 1\n",
3979 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3980 continue;
3983 /* See whether we can handle the alias using a bounds check on
3984 the step, and whether that's likely to be the best approach.
3985 (It might not be, for example, if the minimum step is much larger
3986 than the number of bytes handled by one vector iteration.) */
3987 if (!ignore_step_p
3988 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3989 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3990 &lower_bound)
3991 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3992 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3994 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3995 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3998 "%T and %T when the step %T is outside ",
3999 DR_REF (dr_info_a->dr),
4000 DR_REF (dr_info_b->dr),
4001 DR_STEP (dr_info_a->dr));
4002 if (unsigned_p)
4003 dump_printf (MSG_NOTE, "[0");
4004 else
4006 dump_printf (MSG_NOTE, "(");
4007 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4009 dump_printf (MSG_NOTE, ", ");
4010 dump_dec (MSG_NOTE, lower_bound);
4011 dump_printf (MSG_NOTE, ")\n");
4013 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4014 unsigned_p, lower_bound);
4015 continue;
4018 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4019 if (dr_group_first_a)
4021 stmt_info_a = dr_group_first_a;
4022 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4025 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4026 if (dr_group_first_b)
4028 stmt_info_b = dr_group_first_b;
4029 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4032 if (ignore_step_p)
4034 segment_length_a = size_zero_node;
4035 segment_length_b = size_zero_node;
4037 else
4039 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4040 DR_STEP (dr_info_b->dr), 0))
4041 length_factor = scalar_loop_iters;
4042 else
4043 length_factor = size_int (vect_factor);
4044 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4045 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4047 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4048 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4049 align_a = vect_vfa_align (dr_info_a);
4050 align_b = vect_vfa_align (dr_info_b);
4052 /* See whether the alias is known at compilation time. */
4053 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4054 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4055 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4056 DR_OFFSET (dr_info_b->dr), 0)
4057 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4058 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4059 && poly_int_tree_p (segment_length_a)
4060 && poly_int_tree_p (segment_length_b))
4062 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4063 segment_length_a,
4064 segment_length_b,
4065 access_size_a,
4066 access_size_b);
4067 if (res >= 0 && dump_enabled_p ())
4069 dump_printf_loc (MSG_NOTE, vect_location,
4070 "can tell at compile time that %T and %T",
4071 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4072 if (res == 0)
4073 dump_printf (MSG_NOTE, " do not alias\n");
4074 else
4075 dump_printf (MSG_NOTE, " alias\n");
4078 if (res == 0)
4079 continue;
4081 if (res == 1)
4082 return opt_result::failure_at (stmt_info_b->stmt,
4083 "not vectorized:"
4084 " compilation time alias: %G%G",
4085 stmt_info_a->stmt,
4086 stmt_info_b->stmt);
4089 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4090 access_size_a, align_a);
4091 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4092 access_size_b, align_b);
4093 /* Canonicalize the order to be the one that's needed for accurate
4094 RAW, WAR and WAW flags, in cases where the data references are
4095 well-ordered. The order doesn't really matter otherwise,
4096 but we might as well be consistent. */
4097 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4098 std::swap (dr_a, dr_b);
4100 dr_with_seg_len_pair_t dr_with_seg_len_pair
4101 (dr_a, dr_b, (preserves_scalar_order_p
4102 ? dr_with_seg_len_pair_t::WELL_ORDERED
4103 : dr_with_seg_len_pair_t::REORDERED));
4105 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4108 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4110 unsigned int count = (comp_alias_ddrs.length ()
4111 + check_unequal_addrs.length ());
4113 if (count
4114 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4115 == VECT_COST_MODEL_VERY_CHEAP))
4116 return opt_result::failure_at
4117 (vect_location, "would need a runtime alias check\n");
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_NOTE, vect_location,
4121 "improved number of alias checks from %d to %d\n",
4122 may_alias_ddrs.length (), count);
4123 unsigned limit = param_vect_max_version_for_alias_checks;
4124 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4125 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4126 if (count > limit)
4127 return opt_result::failure_at
4128 (vect_location,
4129 "number of versioning for alias run-time tests exceeds %d "
4130 "(--param vect-max-version-for-alias-checks)\n", limit);
4132 return opt_result::success ();
4135 /* Check whether we can use an internal function for a gather load
4136 or scatter store. READ_P is true for loads and false for stores.
4137 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4138 the type of the memory elements being loaded or stored. OFFSET_TYPE
4139 is the type of the offset that is being applied to the invariant
4140 base address. SCALE is the amount by which the offset should
4141 be multiplied *after* it has been converted to address width.
4143 Return true if the function is supported, storing the function id in
4144 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
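/* For example (assumed target capabilities, illustration only): if the
   offset is a 16-bit integer but the target only provides gathers with
   32-bit or 64-bit offset vectors, the loop below widens OFFSET_TYPE step
   by step (16 -> 32 -> 64 bits) until a supported combination is found or
   the precision limits are reached.  */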
4146 bool
4147 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4148 tree vectype, tree memory_type, tree offset_type,
4149 int scale, internal_fn *ifn_out,
4150 tree *offset_vectype_out)
4152 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4153 unsigned int element_bits = vector_element_bits (vectype);
4154 if (element_bits != memory_bits)
4155 /* For now the vector elements must be the same width as the
4156 memory elements. */
4157 return false;
4159 /* Work out which function we need. */
4160 internal_fn ifn, alt_ifn, alt_ifn2;
4161 if (read_p)
4163 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4164 alt_ifn = IFN_MASK_GATHER_LOAD;
4165 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4166 use MASK_LEN_GATHER_LOAD regardless of whether len and
4167 mask are valid or not. */
4168 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4170 else
4172 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4173 alt_ifn = IFN_MASK_SCATTER_STORE;
4174 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4175 use MASK_LEN_SCATTER_STORE regardless of whether len and
4176 mask are valid or not. */
4177 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4180 for (;;)
4182 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4183 if (!offset_vectype)
4184 return false;
4186 /* Test whether the target supports this combination. */
4187 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4188 offset_vectype, scale))
4190 *ifn_out = ifn;
4191 *offset_vectype_out = offset_vectype;
4192 return true;
4194 else if (!masked_p
4195 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4196 memory_type,
4197 offset_vectype,
4198 scale))
4200 *ifn_out = alt_ifn;
4201 *offset_vectype_out = offset_vectype;
4202 return true;
4204 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4205 memory_type,
4206 offset_vectype, scale))
4208 *ifn_out = alt_ifn2;
4209 *offset_vectype_out = offset_vectype;
4210 return true;
4213 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4214 && TYPE_PRECISION (offset_type) >= element_bits)
4215 return false;
4217 offset_type = build_nonstandard_integer_type
4218 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4222 /* STMT_INFO is a call to an internal gather load or scatter store function.
4223 Describe the operation in INFO. */
4225 static void
4226 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4227 gather_scatter_info *info)
4229 gcall *call = as_a <gcall *> (stmt_info->stmt);
4230 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4231 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4233 info->ifn = gimple_call_internal_fn (call);
4234 info->decl = NULL_TREE;
4235 info->base = gimple_call_arg (call, 0);
4236 info->offset = gimple_call_arg (call, 1);
4237 info->offset_dt = vect_unknown_def_type;
4238 info->offset_vectype = NULL_TREE;
4239 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4240 info->element_type = TREE_TYPE (vectype);
4241 info->memory_type = TREE_TYPE (DR_REF (dr));
4244 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4245 gather load or scatter store. Describe the operation in *INFO if so. */
4247 bool
4248 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4249 gather_scatter_info *info)
4251 HOST_WIDE_INT scale = 1;
4252 poly_int64 pbitpos, pbitsize;
4253 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4254 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4255 tree offtype = NULL_TREE;
4256 tree decl = NULL_TREE, base, off;
4257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4258 tree memory_type = TREE_TYPE (DR_REF (dr));
4259 machine_mode pmode;
4260 int punsignedp, reversep, pvolatilep = 0;
4261 internal_fn ifn;
4262 tree offset_vectype;
4263 bool masked_p = false;
4265 /* See whether this is already a call to a gather/scatter internal function.
4266 If not, see whether it's a masked load or store. */
4267 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4268 if (call && gimple_call_internal_p (call))
4270 ifn = gimple_call_internal_fn (call);
4271 if (internal_gather_scatter_fn_p (ifn))
4273 vect_describe_gather_scatter_call (stmt_info, info);
4274 return true;
4276 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4279 /* True if we should aim to use internal functions rather than
4280 built-in functions. */
4281 bool use_ifn_p = (DR_IS_READ (dr)
4282 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4283 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4285 base = DR_REF (dr);
4286 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
4287 see if we can use the def stmt of the address. */
4288 if (masked_p
4289 && TREE_CODE (base) == MEM_REF
4290 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4291 && integer_zerop (TREE_OPERAND (base, 1))
4292 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4294 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4295 if (is_gimple_assign (def_stmt)
4296 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4297 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4300 /* The gather and scatter builtins need address of the form
4301 loop_invariant + vector * {1, 2, 4, 8}
4302 or
4303 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4304 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4305 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4306 multiplications and additions in it. To get a vector, we need
4307 a single SSA_NAME that will be defined in the loop and will
4308 contain everything that is not loop invariant and that can be
4309 vectorized. The following code attempts to find such a preexisting
4310 SSA_NAME OFF and put the loop invariants into a tree BASE
4311 that can be gimplified before the loop. */
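/* E.g. for a read such as a[idx[i]] (hypothetical names), the intent is to
   end up with BASE as &a plus any constant byte offset, OFF as the
   SSA_NAME holding the value of idx[i], and SCALE typically equal to the
   element size.  */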
4312 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4313 &punsignedp, &reversep, &pvolatilep);
4314 if (reversep)
4315 return false;
4317 /* PR 107346. Packed structs can have fields at offsets that are not
4318 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4319 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4320 return false;
4322 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4324 if (TREE_CODE (base) == MEM_REF)
4326 if (!integer_zerop (TREE_OPERAND (base, 1)))
4328 if (off == NULL_TREE)
4329 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4330 else
4331 off = size_binop (PLUS_EXPR, off,
4332 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4334 base = TREE_OPERAND (base, 0);
4336 else
4337 base = build_fold_addr_expr (base);
4339 if (off == NULL_TREE)
4340 off = size_zero_node;
4342 /* If base is not loop invariant, then if off is 0 we start with just
4343 the constant offset in the loop invariant BASE and continue with base
4344 as OFF; otherwise give up.
4345 We could handle that case by gimplifying the addition of base + off
4346 into some SSA_NAME and using that as off, but for now punt. */
4347 if (!expr_invariant_in_loop_p (loop, base))
4349 if (!integer_zerop (off))
4350 return false;
4351 off = base;
4352 base = size_int (pbytepos);
4354 /* Otherwise put base + constant offset into the loop invariant BASE
4355 and continue with OFF. */
4356 else
4358 base = fold_convert (sizetype, base);
4359 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4362 /* OFF at this point may be either a SSA_NAME or some tree expression
4363 from get_inner_reference. Try to peel off loop invariants from it
4364 into BASE as long as possible. */
4365 STRIP_NOPS (off);
4366 while (offtype == NULL_TREE)
4368 enum tree_code code;
4369 tree op0, op1, add = NULL_TREE;
4371 if (TREE_CODE (off) == SSA_NAME)
4373 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4375 if (expr_invariant_in_loop_p (loop, off))
4376 return false;
4378 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4379 break;
4381 op0 = gimple_assign_rhs1 (def_stmt);
4382 code = gimple_assign_rhs_code (def_stmt);
4383 op1 = gimple_assign_rhs2 (def_stmt);
4385 else
4387 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4388 return false;
4389 code = TREE_CODE (off);
4390 extract_ops_from_tree (off, &code, &op0, &op1);
4392 switch (code)
4394 case POINTER_PLUS_EXPR:
4395 case PLUS_EXPR:
4396 if (expr_invariant_in_loop_p (loop, op0))
4398 add = op0;
4399 off = op1;
4400 do_add:
4401 add = fold_convert (sizetype, add);
4402 if (scale != 1)
4403 add = size_binop (MULT_EXPR, add, size_int (scale));
4404 base = size_binop (PLUS_EXPR, base, add);
4405 continue;
4407 if (expr_invariant_in_loop_p (loop, op1))
4409 add = op1;
4410 off = op0;
4411 goto do_add;
4413 break;
4414 case MINUS_EXPR:
4415 if (expr_invariant_in_loop_p (loop, op1))
4417 add = fold_convert (sizetype, op1);
4418 add = size_binop (MINUS_EXPR, size_zero_node, add);
4419 off = op0;
4420 goto do_add;
4422 break;
4423 case MULT_EXPR:
4424 if (scale == 1 && tree_fits_shwi_p (op1))
4426 int new_scale = tree_to_shwi (op1);
4427 /* Only treat this as a scaling operation if the target
4428 supports it for at least some offset type. */
4429 if (use_ifn_p
4430 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4431 masked_p, vectype, memory_type,
4432 signed_char_type_node,
4433 new_scale, &ifn,
4434 &offset_vectype)
4435 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4436 masked_p, vectype, memory_type,
4437 unsigned_char_type_node,
4438 new_scale, &ifn,
4439 &offset_vectype))
4440 break;
4441 scale = new_scale;
4442 off = op0;
4443 continue;
4445 break;
4446 case SSA_NAME:
4447 off = op0;
4448 continue;
4449 CASE_CONVERT:
4450 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4451 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4452 break;
4454 /* Don't include the conversion if the target is happy with
4455 the current offset type. */
4456 if (use_ifn_p
4457 && TREE_CODE (off) == SSA_NAME
4458 && !POINTER_TYPE_P (TREE_TYPE (off))
4459 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4460 masked_p, vectype, memory_type,
4461 TREE_TYPE (off), scale, &ifn,
4462 &offset_vectype))
4463 break;
4465 if (TYPE_PRECISION (TREE_TYPE (op0))
4466 == TYPE_PRECISION (TREE_TYPE (off)))
4468 off = op0;
4469 continue;
4472 /* Include the conversion if it is widening and we're using
4473 the IFN path, or the target can handle the offset type we
4474 are converting from, or the current size is not already the
4475 same as the data vector element size. */
4476 if ((TYPE_PRECISION (TREE_TYPE (op0))
4477 < TYPE_PRECISION (TREE_TYPE (off)))
4478 && (use_ifn_p
4479 || (DR_IS_READ (dr)
4480 ? (targetm.vectorize.builtin_gather
4481 && targetm.vectorize.builtin_gather (vectype,
4482 TREE_TYPE (op0),
4483 scale))
4484 : (targetm.vectorize.builtin_scatter
4485 && targetm.vectorize.builtin_scatter (vectype,
4486 TREE_TYPE (op0),
4487 scale)))
4488 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4489 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4491 off = op0;
4492 offtype = TREE_TYPE (off);
4493 STRIP_NOPS (off);
4494 continue;
4496 break;
4497 default:
4498 break;
4500 break;
4503 /* If at the end OFF still isn't a SSA_NAME or isn't
4504 defined in the loop, punt. */
4505 if (TREE_CODE (off) != SSA_NAME
4506 || expr_invariant_in_loop_p (loop, off))
4507 return false;
4509 if (offtype == NULL_TREE)
4510 offtype = TREE_TYPE (off);
4512 if (use_ifn_p)
4514 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4515 vectype, memory_type, offtype, scale,
4516 &ifn, &offset_vectype))
4517 ifn = IFN_LAST;
4518 decl = NULL_TREE;
4520 else
4522 if (DR_IS_READ (dr))
4524 if (targetm.vectorize.builtin_gather)
4525 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4527 else
4529 if (targetm.vectorize.builtin_scatter)
4530 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4532 ifn = IFN_LAST;
4533 /* The offset vector type will be read from DECL when needed. */
4534 offset_vectype = NULL_TREE;
4537 info->ifn = ifn;
4538 info->decl = decl;
4539 info->base = base;
4540 info->offset = off;
4541 info->offset_dt = vect_unknown_def_type;
4542 info->offset_vectype = offset_vectype;
4543 info->scale = scale;
4544 info->element_type = TREE_TYPE (vectype);
4545 info->memory_type = memory_type;
4546 return true;
4549 /* Find the data references in STMT, analyze them with respect to LOOP and
4550 append them to DATAREFS. Return false if datarefs in this stmt cannot
4551 be handled. */
4553 opt_result
4554 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4555 vec<data_reference_p> *datarefs,
4556 vec<int> *dataref_groups, int group_id)
4558 /* We can ignore clobbers for dataref analysis - they are removed during
4559 loop vectorization and BB vectorization checks dependences with a
4560 stmt walk. */
4561 if (gimple_clobber_p (stmt))
4562 return opt_result::success ();
4564 if (gimple_has_volatile_ops (stmt))
4565 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4566 stmt);
4568 if (stmt_can_throw_internal (cfun, stmt))
4569 return opt_result::failure_at (stmt,
4570 "not vectorized:"
4571 " statement can throw an exception: %G",
4572 stmt);
4574 auto_vec<data_reference_p, 2> refs;
4575 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4576 if (!res)
4577 return res;
4579 if (refs.is_empty ())
4580 return opt_result::success ();
4582 if (refs.length () > 1)
4584 while (!refs.is_empty ())
4585 free_data_ref (refs.pop ());
4586 return opt_result::failure_at (stmt,
4587 "not vectorized: more than one "
4588 "data ref in stmt: %G", stmt);
4591 data_reference_p dr = refs.pop ();
4592 if (gcall *call = dyn_cast <gcall *> (stmt))
4593 if (!gimple_call_internal_p (call)
4594 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4595 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4597 free_data_ref (dr);
4598 return opt_result::failure_at (stmt,
4599 "not vectorized: dr in a call %G", stmt);
4602 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4603 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4605 free_data_ref (dr);
4606 return opt_result::failure_at (stmt,
4607 "not vectorized:"
4608 " statement is an unsupported"
4609 " bitfield access %G", stmt);
4612 if (DR_BASE_ADDRESS (dr)
4613 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4615 free_data_ref (dr);
4616 return opt_result::failure_at (stmt,
4617 "not vectorized:"
4618 " base addr of dr is a constant\n");
4621 /* Check whether this may be a SIMD lane access and adjust the
4622 DR to make it easier for us to handle it. */
4623 if (loop
4624 && loop->simduid
4625 && (!DR_BASE_ADDRESS (dr)
4626 || !DR_OFFSET (dr)
4627 || !DR_INIT (dr)
4628 || !DR_STEP (dr)))
4630 struct data_reference *newdr
4631 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4632 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4633 if (DR_BASE_ADDRESS (newdr)
4634 && DR_OFFSET (newdr)
4635 && DR_INIT (newdr)
4636 && DR_STEP (newdr)
4637 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4638 && integer_zerop (DR_STEP (newdr)))
4640 tree base_address = DR_BASE_ADDRESS (newdr);
4641 tree off = DR_OFFSET (newdr);
4642 tree step = ssize_int (1);
4643 if (integer_zerop (off)
4644 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4646 off = TREE_OPERAND (base_address, 1);
4647 base_address = TREE_OPERAND (base_address, 0);
4649 STRIP_NOPS (off);
4650 if (TREE_CODE (off) == MULT_EXPR
4651 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4653 step = TREE_OPERAND (off, 1);
4654 off = TREE_OPERAND (off, 0);
4655 STRIP_NOPS (off);
4657 if (CONVERT_EXPR_P (off)
4658 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4659 < TYPE_PRECISION (TREE_TYPE (off))))
4660 off = TREE_OPERAND (off, 0);
4661 if (TREE_CODE (off) == SSA_NAME)
4663 gimple *def = SSA_NAME_DEF_STMT (off);
4664 /* Look through widening conversion. */
4665 if (is_gimple_assign (def)
4666 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4668 tree rhs1 = gimple_assign_rhs1 (def);
4669 if (TREE_CODE (rhs1) == SSA_NAME
4670 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4671 && (TYPE_PRECISION (TREE_TYPE (off))
4672 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4673 def = SSA_NAME_DEF_STMT (rhs1);
4675 if (is_gimple_call (def)
4676 && gimple_call_internal_p (def)
4677 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4679 tree arg = gimple_call_arg (def, 0);
4680 tree reft = TREE_TYPE (DR_REF (newdr));
4681 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4682 arg = SSA_NAME_VAR (arg);
4683 if (arg == loop->simduid
4684 /* For now. */
4685 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4687 DR_BASE_ADDRESS (newdr) = base_address;
4688 DR_OFFSET (newdr) = ssize_int (0);
4689 DR_STEP (newdr) = step;
4690 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4691 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4692 /* Mark as simd-lane access. */
4693 tree arg2 = gimple_call_arg (def, 1);
4694 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4695 free_data_ref (dr);
4696 datarefs->safe_push (newdr);
4697 if (dataref_groups)
4698 dataref_groups->safe_push (group_id);
4699 return opt_result::success ();
4704 free_data_ref (newdr);
4707 datarefs->safe_push (dr);
4708 if (dataref_groups)
4709 dataref_groups->safe_push (group_id);
4710 return opt_result::success ();
4713 /* Function vect_analyze_data_refs.
4715 Find all the data references in the loop or basic block.
4717 The general structure of the analysis of data refs in the vectorizer is as
4718 follows:
4719 1- vect_analyze_data_refs(loop/bb): call
4720 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4721 in the loop/bb and their dependences.
4722 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
4723 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
4724 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
4728 opt_result
4729 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4731 class loop *loop = NULL;
4732 unsigned int i;
4733 struct data_reference *dr;
4734 tree scalar_type;
4736 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4738 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4739 loop = LOOP_VINFO_LOOP (loop_vinfo);
4741 /* Go through the data-refs, check that the analysis succeeded. Update
4742 pointer from stmt_vec_info struct to DR and vectype. */
4744 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4745 FOR_EACH_VEC_ELT (datarefs, i, dr)
4747 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4748 poly_uint64 vf;
4750 gcc_assert (DR_REF (dr));
4751 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4752 gcc_assert (!stmt_info->dr_aux.dr);
4753 stmt_info->dr_aux.dr = dr;
4754 stmt_info->dr_aux.stmt = stmt_info;
4756 /* Check that analysis of the data-ref succeeded. */
4757 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4758 || !DR_STEP (dr))
4760 bool maybe_gather
4761 = DR_IS_READ (dr)
4762 && !TREE_THIS_VOLATILE (DR_REF (dr));
4763 bool maybe_scatter
4764 = DR_IS_WRITE (dr)
4765 && !TREE_THIS_VOLATILE (DR_REF (dr));
4767 /* If the target supports vector gather loads or scatter stores,
4768 see if they can be used. */
4769 if (is_a <loop_vec_info> (vinfo)
4770 && !nested_in_vect_loop_p (loop, stmt_info))
4772 if (maybe_gather || maybe_scatter)
4774 if (maybe_gather)
4775 gatherscatter = GATHER;
4776 else
4777 gatherscatter = SCATTER;
4781 if (gatherscatter == SG_NONE)
4783 if (dump_enabled_p ())
4784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4785 "not vectorized: data ref analysis "
4786 "failed %G", stmt_info->stmt);
4787 if (is_a <bb_vec_info> (vinfo))
4789 /* In BB vectorization the ref can still participate
4790 in dependence analysis, we just can't vectorize it. */
4791 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4792 continue;
4794 return opt_result::failure_at (stmt_info->stmt,
4795 "not vectorized:"
4796 " data ref analysis failed: %G",
4797 stmt_info->stmt);
4801 /* See if this was detected as SIMD lane access. */
4802 if (dr->aux == (void *)-1
4803 || dr->aux == (void *)-2
4804 || dr->aux == (void *)-3
4805 || dr->aux == (void *)-4)
4807 if (nested_in_vect_loop_p (loop, stmt_info))
4808 return opt_result::failure_at (stmt_info->stmt,
4809 "not vectorized:"
4810 " data ref analysis failed: %G",
4811 stmt_info->stmt);
4812 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4813 = -(uintptr_t) dr->aux;
4816 tree base = get_base_address (DR_REF (dr));
4817 if (base && VAR_P (base) && DECL_NONALIASED (base))
4819 if (dump_enabled_p ())
4820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4821 "not vectorized: base object not addressable "
4822 "for stmt: %G", stmt_info->stmt);
4823 if (is_a <bb_vec_info> (vinfo))
4825 /* In BB vectorization the ref can still participate
4826 in dependence analysis, we just can't vectorize it. */
4827 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4828 continue;
4830 return opt_result::failure_at (stmt_info->stmt,
4831 "not vectorized: base object not"
4832 " addressable for stmt: %G",
4833 stmt_info->stmt);
4836 if (is_a <loop_vec_info> (vinfo)
4837 && DR_STEP (dr)
4838 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4840 if (nested_in_vect_loop_p (loop, stmt_info))
4841 return opt_result::failure_at (stmt_info->stmt,
4842 "not vectorized: "
4843 "not suitable for strided load %G",
4844 stmt_info->stmt);
4845 STMT_VINFO_STRIDED_P (stmt_info) = true;
4848 /* Update DR field in stmt_vec_info struct. */
4850 /* If the dataref is in an inner-loop of the loop that is considered
4851 for vectorization, we also want to analyze the access relative to
4852 the outer-loop (DR contains information only relative to the
4853 inner-most enclosing loop). We do that by building a reference to the
4854 first location accessed by the inner-loop, and analyze it relative to
4855 the outer-loop. */
4856 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4858 /* Build a reference to the first location accessed by the
4859 inner loop: *(BASE + INIT + OFFSET). By construction,
4860 this address must be invariant in the inner loop, so we
4861 can consider it as being used in the outer loop. */
4862 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4863 tree offset = unshare_expr (DR_OFFSET (dr));
4864 tree init = unshare_expr (DR_INIT (dr));
4865 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4866 init, offset);
4867 tree init_addr = fold_build_pointer_plus (base, init_offset);
4868 tree init_ref = build_fold_indirect_ref (init_addr);
4870 if (dump_enabled_p ())
4871 dump_printf_loc (MSG_NOTE, vect_location,
4872 "analyze in outer loop: %T\n", init_ref);
4874 opt_result res
4875 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4876 init_ref, loop, stmt_info->stmt);
4877 if (!res)
4878 /* dr_analyze_innermost already explained the failure. */
4879 return res;
4881 if (dump_enabled_p ())
4882 dump_printf_loc (MSG_NOTE, vect_location,
4883 "\touter base_address: %T\n"
4884 "\touter offset from base address: %T\n"
4885 "\touter constant offset from base address: %T\n"
4886 "\touter step: %T\n"
4887 "\touter base alignment: %d\n\n"
4888 "\touter base misalignment: %d\n"
4889 "\touter offset alignment: %d\n"
4890 "\touter step alignment: %d\n",
4891 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4892 STMT_VINFO_DR_OFFSET (stmt_info),
4893 STMT_VINFO_DR_INIT (stmt_info),
4894 STMT_VINFO_DR_STEP (stmt_info),
4895 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4896 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4897 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4898 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4901 /* Set vectype for STMT. */
4902 scalar_type = TREE_TYPE (DR_REF (dr));
4903 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4904 if (!vectype)
4906 if (dump_enabled_p ())
4908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4909 "not vectorized: no vectype for stmt: %G",
4910 stmt_info->stmt);
4911 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4912 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4913 scalar_type);
4914 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4917 if (is_a <bb_vec_info> (vinfo))
4919 /* No vector type is fine, the ref can still participate
4920 in dependence analysis, we just can't vectorize it. */
4921 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4922 continue;
4924 if (fatal)
4925 *fatal = false;
4926 return opt_result::failure_at (stmt_info->stmt,
4927 "not vectorized:"
4928 " no vectype for stmt: %G"
4929 " scalar_type: %T\n",
4930 stmt_info->stmt, scalar_type);
4932 else
4934 if (dump_enabled_p ())
4935 dump_printf_loc (MSG_NOTE, vect_location,
4936 "got vectype for stmt: %G%T\n",
4937 stmt_info->stmt, vectype);
4940 /* Adjust the minimal vectorization factor according to the
4941 vector type. */
4942 vf = TYPE_VECTOR_SUBPARTS (vectype);
4943 *min_vf = upper_bound (*min_vf, vf);
4945 /* Leave the BB vectorizer to pick the vector type later, based on
4946 the final dataref group size and SLP node size. */
4947 if (is_a <loop_vec_info> (vinfo))
4948 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4950 if (gatherscatter != SG_NONE)
4952 gather_scatter_info gs_info;
4953 if (!vect_check_gather_scatter (stmt_info,
4954 as_a <loop_vec_info> (vinfo),
4955 &gs_info)
4956 || !get_vectype_for_scalar_type (vinfo,
4957 TREE_TYPE (gs_info.offset)))
4959 if (fatal)
4960 *fatal = false;
4961 return opt_result::failure_at
4962 (stmt_info->stmt,
4963 (gatherscatter == GATHER)
4964 ? "not vectorized: not suitable for gather load %G"
4965 : "not vectorized: not suitable for scatter store %G",
4966 stmt_info->stmt);
4968 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4972 /* We used to stop processing and prune the list here. Verify we no
4973 longer need to. */
4974 gcc_assert (i == datarefs.length ());
4976 return opt_result::success ();
4980 /* Function vect_get_new_vect_var.
4982 Returns a name for a new variable. The current naming scheme appends the
4983 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4984 the name of vectorizer generated variables, and appends that to NAME if
4985 provided. */
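/* E.g. vect_get_new_vect_var (ptr_type, vect_pointer_var, "a") creates a
   temporary register whose name starts with "vectp_a" (illustrative call;
   any uniquifying suffix is up to create_tmp_reg).  */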
4987 tree
4988 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4990 const char *prefix;
4991 tree new_vect_var;
4993 switch (var_kind)
4995 case vect_simple_var:
4996 prefix = "vect";
4997 break;
4998 case vect_scalar_var:
4999 prefix = "stmp";
5000 break;
5001 case vect_mask_var:
5002 prefix = "mask";
5003 break;
5004 case vect_pointer_var:
5005 prefix = "vectp";
5006 break;
5007 default:
5008 gcc_unreachable ();
5011 if (name)
5013 char* tmp = concat (prefix, "_", name, NULL);
5014 new_vect_var = create_tmp_reg (type, tmp);
5015 free (tmp);
5017 else
5018 new_vect_var = create_tmp_reg (type, prefix);
5020 return new_vect_var;
5023 /* Like vect_get_new_vect_var but return an SSA name. */
5025 tree
5026 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5028 const char *prefix;
5029 tree new_vect_var;
5031 switch (var_kind)
5033 case vect_simple_var:
5034 prefix = "vect";
5035 break;
5036 case vect_scalar_var:
5037 prefix = "stmp";
5038 break;
5039 case vect_pointer_var:
5040 prefix = "vectp";
5041 break;
5042 default:
5043 gcc_unreachable ();
5046 if (name)
5048 char* tmp = concat (prefix, "_", name, NULL);
5049 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5050 free (tmp);
5052 else
5053 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5055 return new_vect_var;
5058 /* Duplicate points-to info on NAME from DR_INFO. */
5060 static void
5061 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5063 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5064 /* DR_PTR_INFO is for a base SSA name, not including constant or
5065 variable offsets in the ref so its alignment info does not apply. */
5066 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5069 /* Function vect_create_addr_base_for_vector_ref.
5071 Create an expression that computes the address of the first memory location
5072 that will be accessed for a data reference.
5074 Input:
5075 STMT_INFO: The statement containing the data reference.
5076 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5077 OFFSET: Optional. If supplied, it is added to the initial address.
5078 LOOP: Specify relative to which loop-nest the address should be computed.
5079 For example, when the dataref is in an inner-loop nested in an
5080 outer-loop that is now being vectorized, LOOP can be either the
5081 outer-loop, or the inner-loop. The first memory location accessed
5082 by the following dataref ('in' points to short):
5084 for (i=0; i<N; i++)
5085 for (j=0; j<M; j++)
5086 s += in[i+j]
5088 is as follows:
5089 if LOOP=i_loop: &in (relative to i_loop)
5090 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5092 Output:
5093 1. Return an SSA_NAME whose value is the address of the memory location of
5094 the first vector of the data reference.
5095 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5096 these statement(s) which define the returned SSA_NAME.
5098 FORNOW: We are only handling array accesses with step 1. */
5100 tree
5101 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5102 gimple_seq *new_stmt_list,
5103 tree offset)
5105 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5106 struct data_reference *dr = dr_info->dr;
5107 const char *base_name;
5108 tree addr_base;
5109 tree dest;
5110 gimple_seq seq = NULL;
5111 tree vect_ptr_type;
5112 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5113 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5115 tree data_ref_base = unshare_expr (drb->base_address);
5116 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5117 tree init = unshare_expr (drb->init);
5119 if (loop_vinfo)
5120 base_name = get_name (data_ref_base);
5121 else
5123 base_offset = ssize_int (0);
5124 init = ssize_int (0);
5125 base_name = get_name (DR_REF (dr));
5128 /* Create base_offset */
5129 base_offset = size_binop (PLUS_EXPR,
5130 fold_convert (sizetype, base_offset),
5131 fold_convert (sizetype, init));
5133 if (offset)
5135 offset = fold_convert (sizetype, offset);
5136 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5137 base_offset, offset);
5140 /* base + base_offset */
5141 if (loop_vinfo)
5142 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5143 else
5144 addr_base = build1 (ADDR_EXPR,
5145 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5146 /* Strip zero offset components since we don't need
5147 them and they can confuse late diagnostics if
5148 we CSE them wrongly. See PR106904 for example. */
5149 unshare_expr (strip_zero_offset_components
5150 (DR_REF (dr))));
5152 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5153 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5154 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5155 gimple_seq_add_seq (new_stmt_list, seq);
5157 if (DR_PTR_INFO (dr)
5158 && TREE_CODE (addr_base) == SSA_NAME
5159 /* We should only duplicate pointer info to newly created SSA names. */
5160 && SSA_NAME_VAR (addr_base) == dest)
5162 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5163 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5169 return addr_base;
5173 /* Function vect_create_data_ref_ptr.
5175 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5176 location accessed in the loop by STMT_INFO, along with the def-use update
5177 chain to appropriately advance the pointer through the loop iterations.
5178 Also set aliasing information for the pointer. This pointer is used by
5179 the callers to this function to create a memory reference expression for
5180 vector load/store access.
5182 Input:
5183 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5184 GIMPLE_ASSIGN <name, data-ref> or
5185 GIMPLE_ASSIGN <data-ref, name>.
5186 2. AGGR_TYPE: the type of the reference, which should be either a vector
5187 or an array.
5188 3. AT_LOOP: the loop where the vector memref is to be created.
5189 4. OFFSET (optional): a byte offset to be added to the initial address
5190 accessed by the data-ref in STMT_INFO.
5191 5. BSI: location where the new stmts are to be placed if there is no loop
5192 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5193 pointing to the initial address.
5194    7. IV_STEP (optional, defaults to NULL): the amount that should be added
5195 to the IV during each iteration of the loop. NULL says to move
5196 by one copy of AGGR_TYPE up or down, depending on the step of the
5197 data reference.
5199 Output:
5200 1. Declare a new ptr to vector_type, and have it point to the base of the
5201       data reference (initial address accessed by the data reference).
5202 For example, for vector of type V8HI, the following code is generated:
5204 v8hi *ap;
5205 ap = (v8hi *)initial_address;
5207 if OFFSET is not supplied:
5208 initial_address = &a[init];
5209 if OFFSET is supplied:
5210 initial_address = &a[init] + OFFSET;
5214 Return the initial_address in INITIAL_ADDRESS.
5216 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5217 update the pointer in each iteration of the loop.
5219 Return the increment stmt that updates the pointer in PTR_INCR.
5221 3. Return the pointer. */
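/* A rough caller sketch (argument values are placeholders, not taken from any
   particular call site):

     tree init_addr;
     gimple *ptr_incr;
     tree dataref_ptr
       = vect_create_data_ref_ptr (vinfo, stmt_info, vectype, loop,
				   NULL_TREE, &init_addr, gsi, &ptr_incr,
				   false, NULL_TREE);
     ...
     // Later copies of the access may advance the pointer explicitly:
     dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
				    stmt_info, bump);

   where VECTYPE, LOOP, GSI and BUMP stand for whatever the caller has at
   hand; the actual users are the load/store vectorization code.  */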
5223 tree
5224 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5225 tree aggr_type, class loop *at_loop, tree offset,
5226 tree *initial_address, gimple_stmt_iterator *gsi,
5227 gimple **ptr_incr, bool only_init,
5228 tree iv_step)
5230 const char *base_name;
5231 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5232 class loop *loop = NULL;
5233 bool nested_in_vect_loop = false;
5234 class loop *containing_loop = NULL;
5235 tree aggr_ptr_type;
5236 tree aggr_ptr;
5237 tree new_temp;
5238 gimple_seq new_stmt_list = NULL;
5239 edge pe = NULL;
5240 basic_block new_bb;
5241 tree aggr_ptr_init;
5242 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5243 struct data_reference *dr = dr_info->dr;
5244 tree aptr;
5245 gimple_stmt_iterator incr_gsi;
5246 bool insert_after;
5247 tree indx_before_incr, indx_after_incr;
5248 gimple *incr;
5249 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5251 gcc_assert (iv_step != NULL_TREE
5252 || TREE_CODE (aggr_type) == ARRAY_TYPE
5253 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5255 if (loop_vinfo)
5257 loop = LOOP_VINFO_LOOP (loop_vinfo);
5258 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5259 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5260 pe = loop_preheader_edge (loop);
5262 else
5264 gcc_assert (bb_vinfo);
5265 only_init = true;
5266 *ptr_incr = NULL;
5269 /* Create an expression for the first address accessed by this load
5270 in LOOP. */
5271 base_name = get_name (DR_BASE_ADDRESS (dr));
5273 if (dump_enabled_p ())
5275 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5276 dump_printf_loc (MSG_NOTE, vect_location,
5277 "create %s-pointer variable to type: %T",
5278 get_tree_code_name (TREE_CODE (aggr_type)),
5279 aggr_type);
5280 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5281 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5282 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5283 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5284 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5285 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5286 else
5287 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5288 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5291 /* (1) Create the new aggregate-pointer variable.
5292 Vector and array types inherit the alias set of their component
5293 type by default so we need to use a ref-all pointer if the data
5294      reference does not conflict with the created aggregate data
5295 reference because it is not addressable. */
5296 bool need_ref_all = false;
5297 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5298 get_alias_set (DR_REF (dr))))
5299 need_ref_all = true;
5300 /* Likewise for any of the data references in the stmt group. */
5301 else if (DR_GROUP_SIZE (stmt_info) > 1)
5303 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5306 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5307 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5308 get_alias_set (DR_REF (sdr))))
5310 need_ref_all = true;
5311 break;
5313 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5315 while (sinfo);
5317 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5318 need_ref_all);
5319 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5322 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5323 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5324 def-use update cycles for the pointer: one relative to the outer-loop
5325 (LOOP), which is what steps (3) and (4) below do. The other is relative
5326 to the inner-loop (which is the inner-most loop containing the dataref),
5327         and this is done by step (5) below.
5329 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5330 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5331 redundant. Steps (3),(4) create the following:
5333 vp0 = &base_addr;
5334 LOOP: vp1 = phi(vp0,vp2)
5337 vp2 = vp1 + step
5338 goto LOOP
5340 If there is an inner-loop nested in loop, then step (5) will also be
5341 applied, and an additional update in the inner-loop will be created:
5343 vp0 = &base_addr;
5344 LOOP: vp1 = phi(vp0,vp2)
5346 inner: vp3 = phi(vp1,vp4)
5347 vp4 = vp3 + inner_step
5348 if () goto inner
5350 vp2 = vp1 + step
5351 if () goto LOOP */
5353 /* (2) Calculate the initial address of the aggregate-pointer, and set
5354 the aggregate-pointer to point to it before the loop. */
5356   /* Create: &(base[init_val] + offset) in the loop preheader.  */
5358 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5359 stmt_info, &new_stmt_list,
5360 offset);
5361 if (new_stmt_list)
5363 if (pe)
5365 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5366 gcc_assert (!new_bb);
5368 else
5369 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5372 *initial_address = new_temp;
5373 aggr_ptr_init = new_temp;
5375 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5376 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5377 inner-loop nested in LOOP (during outer-loop vectorization). */
5379 /* No update in loop is required. */
5380 if (only_init && (!loop_vinfo || at_loop == loop))
5381 aptr = aggr_ptr_init;
5382 else
5384 /* Accesses to invariant addresses should be handled specially
5385 by the caller. */
5386 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5387 gcc_assert (!integer_zerop (step));
5389 if (iv_step == NULL_TREE)
5391 /* The step of the aggregate pointer is the type size,
5392 negated for downward accesses. */
5393 iv_step = TYPE_SIZE_UNIT (aggr_type);
5394 if (tree_int_cst_sgn (step) == -1)
5395 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5398 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5400 create_iv (aggr_ptr_init, PLUS_EXPR,
5401 fold_convert (aggr_ptr_type, iv_step),
5402 aggr_ptr, loop, &incr_gsi, insert_after,
5403 &indx_before_incr, &indx_after_incr);
5404 incr = gsi_stmt (incr_gsi);
5406 /* Copy the points-to information if it exists. */
5407 if (DR_PTR_INFO (dr))
5409 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5410 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5412 if (ptr_incr)
5413 *ptr_incr = incr;
5415 aptr = indx_before_incr;
5418 if (!nested_in_vect_loop || only_init)
5419 return aptr;
5422 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5423      nested in LOOP, if it exists.  */
5425 gcc_assert (nested_in_vect_loop);
5426 if (!only_init)
5428 standard_iv_increment_position (containing_loop, &incr_gsi,
5429 &insert_after);
5430 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5431 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5432 &indx_before_incr, &indx_after_incr);
5433 incr = gsi_stmt (incr_gsi);
5435 /* Copy the points-to information if it exists. */
5436 if (DR_PTR_INFO (dr))
5438 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5439 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5441 if (ptr_incr)
5442 *ptr_incr = incr;
5444 return indx_before_incr;
5446 else
5447 gcc_unreachable ();
5451 /* Function bump_vector_ptr
5453 Increment a pointer (to a vector type) by vector-size. If requested,
5454 i.e. if PTR-INCR is given, then also connect the new increment stmt
5455 to the existing def-use update-chain of the pointer, by modifying
5456 the PTR_INCR as illustrated below:
5458 The pointer def-use update-chain before this function:
5459 DATAREF_PTR = phi (p_0, p_2)
5460 ....
5461 PTR_INCR: p_2 = DATAREF_PTR + step
5463 The pointer def-use update-chain after this function:
5464 DATAREF_PTR = phi (p_0, p_2)
5465 ....
5466 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5467 ....
5468 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5470 Input:
5471 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5472 in the loop.
5473 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5474 the loop. The increment amount across iterations is expected
5475 to be vector_size.
5476 BSI - location where the new update stmt is to be placed.
5477 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5478 BUMP - optional. The offset by which to bump the pointer. If not given,
5479 the offset is assumed to be vector_size.
5481 Output: Return NEW_DATAREF_PTR as illustrated above.
5485 tree
5486 bump_vector_ptr (vec_info *vinfo,
5487 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5488 stmt_vec_info stmt_info, tree bump)
5490 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5491 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5492 tree update = TYPE_SIZE_UNIT (vectype);
5493 gimple *incr_stmt;
5494 ssa_op_iter iter;
5495 use_operand_p use_p;
5496 tree new_dataref_ptr;
5498 if (bump)
5499 update = bump;
5501 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5502 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5503 else if (is_gimple_min_invariant (dataref_ptr))
5504 /* When possible avoid emitting a separate increment stmt that will
5505        force the addressed object to become addressable.  */
5506 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5507 fold_build2 (MEM_REF,
5508 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5509 dataref_ptr,
5510 fold_convert (ptr_type_node, update)));
5511 else
5512 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5513 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5514 dataref_ptr, update);
5515 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5516   /* Fold the increment, avoiding excessive use-def chains that would
5517      otherwise cause compile-time issues for passes until the next
5518      forwprop pass, which would do this folding as well.  */
5519 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5520 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5522 incr_stmt = gsi_stmt (fold_gsi);
5523 update_stmt (incr_stmt);
5526 /* Copy the points-to information if it exists. */
5527 if (DR_PTR_INFO (dr))
5529 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5530 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5533 if (!ptr_incr)
5534 return new_dataref_ptr;
5536 /* Update the vector-pointer's cross-iteration increment. */
5537 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5539 tree use = USE_FROM_PTR (use_p);
5541 if (use == dataref_ptr)
5542 SET_USE (use_p, new_dataref_ptr);
5543 else
5544 gcc_assert (operand_equal_p (use, update, 0));
5547 return new_dataref_ptr;
5551 /* Copy memory reference info such as base/clique from the SRC reference
5552 to the DEST MEM_REF. */
5554 void
5555 vect_copy_ref_info (tree dest, tree src)
5557 if (TREE_CODE (dest) != MEM_REF)
5558 return;
5560 tree src_base = src;
5561 while (handled_component_p (src_base))
5562 src_base = TREE_OPERAND (src_base, 0);
5563 if (TREE_CODE (src_base) != MEM_REF
5564 && TREE_CODE (src_base) != TARGET_MEM_REF)
5565 return;
5567 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5568 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5572 /* Function vect_create_destination_var.
5574 Create a new temporary of type VECTYPE. */
5576 tree
5577 vect_create_destination_var (tree scalar_dest, tree vectype)
5579 tree vec_dest;
5580 const char *name;
5581 char *new_name;
5582 tree type;
5583 enum vect_var_kind kind;
5585 kind = vectype
5586 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5587 ? vect_mask_var
5588 : vect_simple_var
5589 : vect_scalar_var;
5590 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5592 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5594 name = get_name (scalar_dest);
5595 if (name)
5596 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5597 else
5598 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5599 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5600 free (new_name);
5602 return vec_dest;
5605 /* Function vect_grouped_store_supported.
5607 Returns TRUE if interleave high and interleave low permutations
5608 are supported, and FALSE otherwise. */
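/* For instance, for an 8-element vector the two permutations whose support is
   queried below correspond conceptually to the index vectors

     interleave "high": { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave "low" : { 4, 12, 5, 13, 6, 14, 7, 15 }

   encoded with two interleaved stepped patterns so that the same check also
   applies to variable-length vectors.  */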
5610 bool
5611 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5613 machine_mode mode = TYPE_MODE (vectype);
5615 /* vect_permute_store_chain requires the group size to be equal to 3 or
5616 be a power of two. */
5617 if (count != 3 && exact_log2 (count) == -1)
5619 if (dump_enabled_p ())
5620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5621 "the size of the group of accesses"
5622 " is not a power of 2 or not eqaul to 3\n");
5623 return false;
5626 /* Check that the permutation is supported. */
5627 if (VECTOR_MODE_P (mode))
5629 unsigned int i;
5630 if (count == 3)
5632 unsigned int j0 = 0, j1 = 0, j2 = 0;
5633 unsigned int i, j;
5635 unsigned int nelt;
5636 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5640 "cannot handle groups of 3 stores for"
5641 " variable-length vectors\n");
5642 return false;
5645 vec_perm_builder sel (nelt, nelt, 1);
5646 sel.quick_grow (nelt);
5647 vec_perm_indices indices;
5648 for (j = 0; j < 3; j++)
5650 int nelt0 = ((3 - j) * nelt) % 3;
5651 int nelt1 = ((3 - j) * nelt + 1) % 3;
5652 int nelt2 = ((3 - j) * nelt + 2) % 3;
5653 for (i = 0; i < nelt; i++)
5655 if (3 * i + nelt0 < nelt)
5656 sel[3 * i + nelt0] = j0++;
5657 if (3 * i + nelt1 < nelt)
5658 sel[3 * i + nelt1] = nelt + j1++;
5659 if (3 * i + nelt2 < nelt)
5660 sel[3 * i + nelt2] = 0;
5662 indices.new_vector (sel, 2, nelt);
5663 if (!can_vec_perm_const_p (mode, mode, indices))
5665 if (dump_enabled_p ())
5666 dump_printf (MSG_MISSED_OPTIMIZATION,
5667 "permutation op not supported by target.\n");
5668 return false;
5671 for (i = 0; i < nelt; i++)
5673 if (3 * i + nelt0 < nelt)
5674 sel[3 * i + nelt0] = 3 * i + nelt0;
5675 if (3 * i + nelt1 < nelt)
5676 sel[3 * i + nelt1] = 3 * i + nelt1;
5677 if (3 * i + nelt2 < nelt)
5678 sel[3 * i + nelt2] = nelt + j2++;
5680 indices.new_vector (sel, 2, nelt);
5681 if (!can_vec_perm_const_p (mode, mode, indices))
5683 if (dump_enabled_p ())
5684 dump_printf (MSG_MISSED_OPTIMIZATION,
5685 "permutation op not supported by target.\n");
5686 return false;
5689 return true;
5691 else
5693 /* If length is not equal to 3 then only power of 2 is supported. */
5694 gcc_assert (pow2p_hwi (count));
5695 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5697 /* The encoding has 2 interleaved stepped patterns. */
5698       if (!multiple_p (nelt, 2))
5699 return false;
5700 vec_perm_builder sel (nelt, 2, 3);
5701 sel.quick_grow (6);
5702 for (i = 0; i < 3; i++)
5704 sel[i * 2] = i;
5705 sel[i * 2 + 1] = i + nelt;
5707 vec_perm_indices indices (sel, 2, nelt);
5708 if (can_vec_perm_const_p (mode, mode, indices))
5710 for (i = 0; i < 6; i++)
5711 sel[i] += exact_div (nelt, 2);
5712 indices.new_vector (sel, 2, nelt);
5713 if (can_vec_perm_const_p (mode, mode, indices))
5714 return true;
5719 if (dump_enabled_p ())
5720 dump_printf (MSG_MISSED_OPTIMIZATION,
5721 "permutation op not supported by target.\n");
5722 return false;
5725 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5726 of type VECTYPE. MASKED_P says whether the masked form is needed. */
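/* Illustrative use only (the count of 4 is a placeholder):

     internal_fn ifn = vect_store_lanes_supported (vectype, 4, false);
     if (ifn != IFN_LAST)
       ...  // the group of 4 stores can be emitted as a single
	    // store-lanes internal function

   preferring the mask+len form, then the masked form when MASKED_P, then the
   plain form; on targets with structure store instructions these optabs are
   typically what backs them.  */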
5728 internal_fn
5729 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5730 bool masked_p)
5732 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5733 vec_mask_len_store_lanes_optab, vectype,
5734 count))
5735 return IFN_MASK_LEN_STORE_LANES;
5736 else if (masked_p)
5738 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5739 vec_mask_store_lanes_optab, vectype,
5740 count))
5741 return IFN_MASK_STORE_LANES;
5743 else
5745 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5746 vec_store_lanes_optab, vectype, count))
5747 return IFN_STORE_LANES;
5749 return IFN_LAST;
5753 /* Function vect_permute_store_chain.
5755 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5756 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5757 the data correctly for the stores. Return the final references for stores
5758 in RESULT_CHAIN.
5760 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5761 The input is 4 vectors each containing 8 elements. We assign a number to
5762 each element, the input sequence is:
5764 1st vec: 0 1 2 3 4 5 6 7
5765 2nd vec: 8 9 10 11 12 13 14 15
5766 3rd vec: 16 17 18 19 20 21 22 23
5767 4th vec: 24 25 26 27 28 29 30 31
5769 The output sequence should be:
5771 1st vec: 0 8 16 24 1 9 17 25
5772 2nd vec: 2 10 18 26 3 11 19 27
5773   3rd vec:  4 12 20 28  5 13 21 29
5774 4th vec: 6 14 22 30 7 15 23 31
5776 i.e., we interleave the contents of the four vectors in their order.
5778 We use interleave_high/low instructions to create such output. The input of
5779 each interleave_high/low operation is two vectors:
5780 1st vec 2nd vec
5781 0 1 2 3 4 5 6 7
5782 the even elements of the result vector are obtained left-to-right from the
5783 high/low elements of the first vector. The odd elements of the result are
5784 obtained left-to-right from the high/low elements of the second vector.
5785 The output of interleave_high will be: 0 4 1 5
5786 and of interleave_low: 2 6 3 7
5789 The permutation is done in log LENGTH stages. In each stage interleave_high
5790 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5791 where the first argument is taken from the first half of DR_CHAIN and the
5792   second argument from its second half.
5793 In our example,
5795 I1: interleave_high (1st vec, 3rd vec)
5796 I2: interleave_low (1st vec, 3rd vec)
5797 I3: interleave_high (2nd vec, 4th vec)
5798 I4: interleave_low (2nd vec, 4th vec)
5800 The output for the first stage is:
5802 I1: 0 16 1 17 2 18 3 19
5803 I2: 4 20 5 21 6 22 7 23
5804 I3: 8 24 9 25 10 26 11 27
5805 I4: 12 28 13 29 14 30 15 31
5807 The output of the second stage, i.e. the final result is:
5809 I1: 0 8 16 24 1 9 17 25
5810 I2: 2 10 18 26 3 11 19 27
5811   I3:  4 12 20 28  5 13 21 29
5812 I4: 6 14 22 30 7 15 23 31. */
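/* For LENGTH == 4 the power-of-two path below runs log2(4) == 2 stages; in
   each stage it emits, for every pair (dr_chain[j], dr_chain[j + length/2]),

     vect_inter_high = VEC_PERM_EXPR <vect1, vect2, perm_mask_high>;
     vect_inter_low  = VEC_PERM_EXPR <vect1, vect2, perm_mask_low>;

   storing them into result_chain[2*j] and result_chain[2*j + 1] and copying
   RESULT_CHAIN back into DR_CHAIN before the next stage (the names above are
   the local variables of this function).  */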
5814 void
5815 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5816 unsigned int length,
5817 stmt_vec_info stmt_info,
5818 gimple_stmt_iterator *gsi,
5819 vec<tree> *result_chain)
5821 tree vect1, vect2, high, low;
5822 gimple *perm_stmt;
5823 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5824 tree perm_mask_low, perm_mask_high;
5825 tree data_ref;
5826 tree perm3_mask_low, perm3_mask_high;
5827 unsigned int i, j, n, log_length = exact_log2 (length);
5829 result_chain->quick_grow (length);
5830 memcpy (result_chain->address (), dr_chain.address (),
5831 length * sizeof (tree));
5833 if (length == 3)
5835 /* vect_grouped_store_supported ensures that this is constant. */
5836 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5837 unsigned int j0 = 0, j1 = 0, j2 = 0;
5839 vec_perm_builder sel (nelt, nelt, 1);
5840 sel.quick_grow (nelt);
5841 vec_perm_indices indices;
5842 for (j = 0; j < 3; j++)
5844 int nelt0 = ((3 - j) * nelt) % 3;
5845 int nelt1 = ((3 - j) * nelt + 1) % 3;
5846 int nelt2 = ((3 - j) * nelt + 2) % 3;
5848 for (i = 0; i < nelt; i++)
5850 if (3 * i + nelt0 < nelt)
5851 sel[3 * i + nelt0] = j0++;
5852 if (3 * i + nelt1 < nelt)
5853 sel[3 * i + nelt1] = nelt + j1++;
5854 if (3 * i + nelt2 < nelt)
5855 sel[3 * i + nelt2] = 0;
5857 indices.new_vector (sel, 2, nelt);
5858 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5860 for (i = 0; i < nelt; i++)
5862 if (3 * i + nelt0 < nelt)
5863 sel[3 * i + nelt0] = 3 * i + nelt0;
5864 if (3 * i + nelt1 < nelt)
5865 sel[3 * i + nelt1] = 3 * i + nelt1;
5866 if (3 * i + nelt2 < nelt)
5867 sel[3 * i + nelt2] = nelt + j2++;
5869 indices.new_vector (sel, 2, nelt);
5870 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5872 vect1 = dr_chain[0];
5873 vect2 = dr_chain[1];
5875 /* Create interleaving stmt:
5876 low = VEC_PERM_EXPR <vect1, vect2,
5877 {j, nelt, *, j + 1, nelt + j + 1, *,
5878 j + 2, nelt + j + 2, *, ...}> */
5879 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5880 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5881 vect2, perm3_mask_low);
5882 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5884 vect1 = data_ref;
5885 vect2 = dr_chain[2];
5886 /* Create interleaving stmt:
5887        high = VEC_PERM_EXPR <vect1, vect2,
5888 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5889 6, 7, nelt + j + 2, ...}> */
5890 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5891 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5892 vect2, perm3_mask_high);
5893 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5894 (*result_chain)[j] = data_ref;
5897 else
5899 /* If length is not equal to 3 then only power of 2 is supported. */
5900 gcc_assert (pow2p_hwi (length));
5902 /* The encoding has 2 interleaved stepped patterns. */
5903 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5904 vec_perm_builder sel (nelt, 2, 3);
5905 sel.quick_grow (6);
5906 for (i = 0; i < 3; i++)
5908 sel[i * 2] = i;
5909 sel[i * 2 + 1] = i + nelt;
5911 vec_perm_indices indices (sel, 2, nelt);
5912 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5914 for (i = 0; i < 6; i++)
5915 sel[i] += exact_div (nelt, 2);
5916 indices.new_vector (sel, 2, nelt);
5917 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5919 for (i = 0, n = log_length; i < n; i++)
5921 for (j = 0; j < length/2; j++)
5923 vect1 = dr_chain[j];
5924 vect2 = dr_chain[j+length/2];
5926 /* Create interleaving stmt:
5927 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5928 ...}> */
5929 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5930 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5931 vect2, perm_mask_high);
5932 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5933 (*result_chain)[2*j] = high;
5935 /* Create interleaving stmt:
5936 low = VEC_PERM_EXPR <vect1, vect2,
5937 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5938 ...}> */
5939 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5940 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5941 vect2, perm_mask_low);
5942 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5943 (*result_chain)[2*j+1] = low;
5945 memcpy (dr_chain.address (), result_chain->address (),
5946 length * sizeof (tree));
5951 /* Function vect_setup_realignment
5953 This function is called when vectorizing an unaligned load using
5954 the dr_explicit_realign[_optimized] scheme.
5955 This function generates the following code at the loop prolog:
5957 p = initial_addr;
5958 x msq_init = *(floor(p)); # prolog load
5959 realignment_token = call target_builtin;
5960 loop:
5961 x msq = phi (msq_init, ---)
5963 The stmts marked with x are generated only for the case of
5964 dr_explicit_realign_optimized.
5966 The code above sets up a new (vector) pointer, pointing to the first
5967 location accessed by STMT_INFO, and a "floor-aligned" load using that
5968 pointer. It also generates code to compute the "realignment-token"
5969 (if the relevant target hook was defined), and creates a phi-node at the
5970 loop-header bb whose arguments are the result of the prolog-load (created
5971 by this function) and the result of a load that takes place in the loop
5972 (to be created by the caller to this function).
5974 For the case of dr_explicit_realign_optimized:
5975 The caller to this function uses the phi-result (msq) to create the
5976 realignment code inside the loop, and sets up the missing phi argument,
5977 as follows:
5978 loop:
5979 msq = phi (msq_init, lsq)
5980 lsq = *(floor(p')); # load in loop
5981 result = realign_load (msq, lsq, realignment_token);
5983 For the case of dr_explicit_realign:
5984 loop:
5985 msq = *(floor(p)); # load in loop
5986 p' = p + (VS-1);
5987 lsq = *(floor(p')); # load in loop
5988 result = realign_load (msq, lsq, realignment_token);
5990 Input:
5991 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5992 a memory location that may be unaligned.
5993 BSI - place where new code is to be inserted.
5994 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5995 is used.
5997 Output:
5998 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5999 target hook, if defined.
6000 Return value - the result of the loop-header phi node. */
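/* As a sketch only (SSA names invented, target alignment assumed to be
   16 bytes), the prolog created for dr_explicit_realign_optimized looks
   roughly like

     vectp_1 = <initial address>;
     floor_2 = vectp_1 & -16;               // BIT_AND_EXPR with -DR_TARGET_ALIGNMENT
     msq_init_3 = MEM[(vectype *)floor_2];  // prolog load
     realignment_token_4 = <target mask_for_load builtin> (vectp_1);

   with the last statement present only if the target defines the
   builtin_mask_for_load hook.  */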
6002 tree
6003 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6004 gimple_stmt_iterator *gsi, tree *realignment_token,
6005 enum dr_alignment_support alignment_support_scheme,
6006 tree init_addr,
6007 class loop **at_loop)
6009 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6010 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6011 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6012 struct data_reference *dr = dr_info->dr;
6013 class loop *loop = NULL;
6014 edge pe = NULL;
6015 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6016 tree vec_dest;
6017 gimple *inc;
6018 tree ptr;
6019 tree data_ref;
6020 basic_block new_bb;
6021 tree msq_init = NULL_TREE;
6022 tree new_temp;
6023 gphi *phi_stmt;
6024 tree msq = NULL_TREE;
6025 gimple_seq stmts = NULL;
6026 bool compute_in_loop = false;
6027 bool nested_in_vect_loop = false;
6028 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6029 class loop *loop_for_initial_load = NULL;
6031 if (loop_vinfo)
6033 loop = LOOP_VINFO_LOOP (loop_vinfo);
6034 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6037 gcc_assert (alignment_support_scheme == dr_explicit_realign
6038 || alignment_support_scheme == dr_explicit_realign_optimized);
6040 /* We need to generate three things:
6041 1. the misalignment computation
6042 2. the extra vector load (for the optimized realignment scheme).
6043 3. the phi node for the two vectors from which the realignment is
6044 done (for the optimized realignment scheme). */
6046 /* 1. Determine where to generate the misalignment computation.
6048 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6049 calculation will be generated by this function, outside the loop (in the
6050 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6051 caller, inside the loop.
6053 Background: If the misalignment remains fixed throughout the iterations of
6054 the loop, then both realignment schemes are applicable, and also the
6055 misalignment computation can be done outside LOOP. This is because we are
6056 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6057 are a multiple of VS (the Vector Size), and therefore the misalignment in
6058 different vectorized LOOP iterations is always the same.
6059 The problem arises only if the memory access is in an inner-loop nested
6060 inside LOOP, which is now being vectorized using outer-loop vectorization.
6061 This is the only case when the misalignment of the memory access may not
6062 remain fixed throughout the iterations of the inner-loop (as explained in
6063 detail in vect_supportable_dr_alignment). In this case, not only is the
6064 optimized realignment scheme not applicable, but also the misalignment
6065 computation (and generation of the realignment token that is passed to
6066 REALIGN_LOAD) have to be done inside the loop.
6068 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6069 or not, which in turn determines if the misalignment is computed inside
6070 the inner-loop, or outside LOOP. */
6072 if (init_addr != NULL_TREE || !loop_vinfo)
6074 compute_in_loop = true;
6075 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6079 /* 2. Determine where to generate the extra vector load.
6081 For the optimized realignment scheme, instead of generating two vector
6082 loads in each iteration, we generate a single extra vector load in the
6083 preheader of the loop, and in each iteration reuse the result of the
6084 vector load from the previous iteration. In case the memory access is in
6085 an inner-loop nested inside LOOP, which is now being vectorized using
6086 outer-loop vectorization, we need to determine whether this initial vector
6087 load should be generated at the preheader of the inner-loop, or can be
6088 generated at the preheader of LOOP. If the memory access has no evolution
6089 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6090 to be generated inside LOOP (in the preheader of the inner-loop). */
6092 if (nested_in_vect_loop)
6094 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6095 bool invariant_in_outerloop =
6096 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6097 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6099 else
6100 loop_for_initial_load = loop;
6101 if (at_loop)
6102 *at_loop = loop_for_initial_load;
6104 tree vuse = NULL_TREE;
6105 if (loop_for_initial_load)
6107 pe = loop_preheader_edge (loop_for_initial_load);
6108 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6109 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6111 if (!vuse)
6112 vuse = gimple_vuse (gsi_stmt (*gsi));
6114 /* 3. For the case of the optimized realignment, create the first vector
6115 load at the loop preheader. */
6117 if (alignment_support_scheme == dr_explicit_realign_optimized)
6119 /* Create msq_init = *(floor(p1)) in the loop preheader */
6120 gassign *new_stmt;
6122 gcc_assert (!compute_in_loop);
6123 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6124 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6125 loop_for_initial_load, NULL_TREE,
6126 &init_addr, NULL, &inc, true);
6127 if (TREE_CODE (ptr) == SSA_NAME)
6128 new_temp = copy_ssa_name (ptr);
6129 else
6130 new_temp = make_ssa_name (TREE_TYPE (ptr));
6131 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6132 tree type = TREE_TYPE (ptr);
6133 new_stmt = gimple_build_assign
6134 (new_temp, BIT_AND_EXPR, ptr,
6135 fold_build2 (MINUS_EXPR, type,
6136 build_int_cst (type, 0),
6137 build_int_cst (type, align)));
6138 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6139 gcc_assert (!new_bb);
6140 data_ref
6141 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6142 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6143 vect_copy_ref_info (data_ref, DR_REF (dr));
6144 new_stmt = gimple_build_assign (vec_dest, data_ref);
6145 new_temp = make_ssa_name (vec_dest, new_stmt);
6146 gimple_assign_set_lhs (new_stmt, new_temp);
6147 gimple_set_vuse (new_stmt, vuse);
6148 if (pe)
6150 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6151 gcc_assert (!new_bb);
6153 else
6154 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6156 msq_init = gimple_assign_lhs (new_stmt);
6159 /* 4. Create realignment token using a target builtin, if available.
6160 It is done either inside the containing loop, or before LOOP (as
6161 determined above). */
6163 if (targetm.vectorize.builtin_mask_for_load)
6165 gcall *new_stmt;
6166 tree builtin_decl;
6168       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
6169 if (!init_addr)
6171 /* Generate the INIT_ADDR computation outside LOOP. */
6172 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6173 stmt_info, &stmts,
6174 NULL_TREE);
6175 if (loop)
6177 pe = loop_preheader_edge (loop);
6178 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6179 gcc_assert (!new_bb);
6181 else
6182 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6185 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6186 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6187 vec_dest =
6188 vect_create_destination_var (scalar_dest,
6189 gimple_call_return_type (new_stmt));
6190 new_temp = make_ssa_name (vec_dest, new_stmt);
6191 gimple_call_set_lhs (new_stmt, new_temp);
6193 if (compute_in_loop)
6194 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6195 else
6197 /* Generate the misalignment computation outside LOOP. */
6198 pe = loop_preheader_edge (loop);
6199 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6200 gcc_assert (!new_bb);
6203 *realignment_token = gimple_call_lhs (new_stmt);
6205 /* The result of the CALL_EXPR to this builtin is determined from
6206 the value of the parameter and no global variables are touched
6207 which makes the builtin a "const" function. Requiring the
6208 builtin to have the "const" attribute makes it unnecessary
6209 to call mark_call_clobbered. */
6210 gcc_assert (TREE_READONLY (builtin_decl));
6213 if (alignment_support_scheme == dr_explicit_realign)
6214 return msq;
6216 gcc_assert (!compute_in_loop);
6217 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6220 /* 5. Create msq = phi <msq_init, lsq> in loop */
6222 pe = loop_preheader_edge (containing_loop);
6223 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6224 msq = make_ssa_name (vec_dest);
6225 phi_stmt = create_phi_node (msq, containing_loop->header);
6226 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6228 return msq;
6232 /* Function vect_grouped_load_supported.
6234 COUNT is the size of the load group (the number of statements plus the
6235 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6236 only one statement, with a gap of COUNT - 1.
6238 Returns true if a suitable permute exists. */
6240 bool
6241 vect_grouped_load_supported (tree vectype, bool single_element_p,
6242 unsigned HOST_WIDE_INT count)
6244 machine_mode mode = TYPE_MODE (vectype);
6246 /* If this is single-element interleaving with an element distance
6247      that leaves unused vector loads around, punt - we at least create
6248 very sub-optimal code in that case (and blow up memory,
6249 see PR65518). */
6250 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "single-element interleaving not supported "
6255 "for not adjacent vector loads\n");
6256 return false;
6259 /* vect_permute_load_chain requires the group size to be equal to 3 or
6260 be a power of two. */
6261 if (count != 3 && exact_log2 (count) == -1)
6263 if (dump_enabled_p ())
6264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265 "the size of the group of accesses"
6266 " is not a power of 2 or not equal to 3\n");
6267 return false;
6270 /* Check that the permutation is supported. */
6271 if (VECTOR_MODE_P (mode))
6273 unsigned int i, j;
6274 if (count == 3)
6276 unsigned int nelt;
6277 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6279 if (dump_enabled_p ())
6280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6281 "cannot handle groups of 3 loads for"
6282 " variable-length vectors\n");
6283 return false;
6286 vec_perm_builder sel (nelt, nelt, 1);
6287 sel.quick_grow (nelt);
6288 vec_perm_indices indices;
6289 unsigned int k;
6290 for (k = 0; k < 3; k++)
6292 for (i = 0; i < nelt; i++)
6293 if (3 * i + k < 2 * nelt)
6294 sel[i] = 3 * i + k;
6295 else
6296 sel[i] = 0;
6297 indices.new_vector (sel, 2, nelt);
6298 if (!can_vec_perm_const_p (mode, mode, indices))
6300 if (dump_enabled_p ())
6301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6302 "shuffle of 3 loads is not supported by"
6303 " target\n");
6304 return false;
6306 for (i = 0, j = 0; i < nelt; i++)
6307 if (3 * i + k < 2 * nelt)
6308 sel[i] = i;
6309 else
6310 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6311 indices.new_vector (sel, 2, nelt);
6312 if (!can_vec_perm_const_p (mode, mode, indices))
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6316 "shuffle of 3 loads is not supported by"
6317 " target\n");
6318 return false;
6321 return true;
6323 else
6325 /* If length is not equal to 3 then only power of 2 is supported. */
6326 gcc_assert (pow2p_hwi (count));
6327 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6329 /* The encoding has a single stepped pattern. */
6330 vec_perm_builder sel (nelt, 1, 3);
6331 sel.quick_grow (3);
6332 for (i = 0; i < 3; i++)
6333 sel[i] = i * 2;
6334 vec_perm_indices indices (sel, 2, nelt);
6335 if (can_vec_perm_const_p (mode, mode, indices))
6337 for (i = 0; i < 3; i++)
6338 sel[i] = i * 2 + 1;
6339 indices.new_vector (sel, 2, nelt);
6340 if (can_vec_perm_const_p (mode, mode, indices))
6341 return true;
6346 if (dump_enabled_p ())
6347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6348 "extract even/odd not supported by target\n");
6349 return false;
6352 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6353 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6355 internal_fn
6356 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6357 bool masked_p)
6359 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6360 vec_mask_len_load_lanes_optab, vectype,
6361 count))
6362 return IFN_MASK_LEN_LOAD_LANES;
6363 else if (masked_p)
6365 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6366 vec_mask_load_lanes_optab, vectype,
6367 count))
6368 return IFN_MASK_LOAD_LANES;
6370 else
6372 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6373 vectype, count))
6374 return IFN_LOAD_LANES;
6376 return IFN_LAST;
6379 /* Function vect_permute_load_chain.
6381 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6382 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6383 the input data correctly. Return the final references for loads in
6384 RESULT_CHAIN.
6386 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6387 The input is 4 vectors each containing 8 elements. We assign a number to each
6388 element, the input sequence is:
6390 1st vec: 0 1 2 3 4 5 6 7
6391 2nd vec: 8 9 10 11 12 13 14 15
6392 3rd vec: 16 17 18 19 20 21 22 23
6393 4th vec: 24 25 26 27 28 29 30 31
6395 The output sequence should be:
6397 1st vec: 0 4 8 12 16 20 24 28
6398 2nd vec: 1 5 9 13 17 21 25 29
6399 3rd vec: 2 6 10 14 18 22 26 30
6400 4th vec: 3 7 11 15 19 23 27 31
6402 i.e., the first output vector should contain the first elements of each
6403 interleaving group, etc.
6405 We use extract_even/odd instructions to create such output. The input of
6406 each extract_even/odd operation is two vectors
6407 1st vec 2nd vec
6408 0 1 2 3 4 5 6 7
6410 and the output is the vector of extracted even/odd elements. The output of
6411 extract_even will be: 0 2 4 6
6412 and of extract_odd: 1 3 5 7
6415 The permutation is done in log LENGTH stages. In each stage extract_even
6416 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6417 their order. In our example,
6419 E1: extract_even (1st vec, 2nd vec)
6420 E2: extract_odd (1st vec, 2nd vec)
6421 E3: extract_even (3rd vec, 4th vec)
6422 E4: extract_odd (3rd vec, 4th vec)
6424 The output for the first stage will be:
6426 E1: 0 2 4 6 8 10 12 14
6427 E2: 1 3 5 7 9 11 13 15
6428 E3: 16 18 20 22 24 26 28 30
6429 E4: 17 19 21 23 25 27 29 31
6431 In order to proceed and create the correct sequence for the next stage (or
6432 for the correct output, if the second stage is the last one, as in our
6433 example), we first put the output of extract_even operation and then the
6434 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6435 The input for the second stage is:
6437 1st vec (E1): 0 2 4 6 8 10 12 14
6438 2nd vec (E3): 16 18 20 22 24 26 28 30
6439 3rd vec (E2): 1 3 5 7 9 11 13 15
6440 4th vec (E4): 17 19 21 23 25 27 29 31
6442 The output of the second stage:
6444 E1: 0 4 8 12 16 20 24 28
6445 E2: 2 6 10 14 18 22 26 30
6446 E3: 1 5 9 13 17 21 25 29
6447 E4: 3 7 11 15 19 23 27 31
6449 And RESULT_CHAIN after reordering:
6451 1st vec (E1): 0 4 8 12 16 20 24 28
6452 2nd vec (E3): 1 5 9 13 17 21 25 29
6453 3rd vec (E2): 2 6 10 14 18 22 26 30
6454 4th vec (E4): 3 7 11 15 19 23 27 31. */
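/* In GIMPLE terms, one stage of the power-of-two path below emits, per pair
   of input vectors (shown for 8-element vectors, SSA names illustrative):

     vect_perm_even_1 = VEC_PERM_EXPR <first_vect, second_vect,
				       { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd_2  = VEC_PERM_EXPR <first_vect, second_vect,
				       { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   the even results going to the first half of RESULT_CHAIN and the odd
   results to the second half.  */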
6456 static void
6457 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6458 unsigned int length,
6459 stmt_vec_info stmt_info,
6460 gimple_stmt_iterator *gsi,
6461 vec<tree> *result_chain)
6463 tree data_ref, first_vect, second_vect;
6464 tree perm_mask_even, perm_mask_odd;
6465 tree perm3_mask_low, perm3_mask_high;
6466 gimple *perm_stmt;
6467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6468 unsigned int i, j, log_length = exact_log2 (length);
6470 result_chain->quick_grow (length);
6471 memcpy (result_chain->address (), dr_chain.address (),
6472 length * sizeof (tree));
6474 if (length == 3)
6476 /* vect_grouped_load_supported ensures that this is constant. */
6477 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6478 unsigned int k;
6480 vec_perm_builder sel (nelt, nelt, 1);
6481 sel.quick_grow (nelt);
6482 vec_perm_indices indices;
6483 for (k = 0; k < 3; k++)
6485 for (i = 0; i < nelt; i++)
6486 if (3 * i + k < 2 * nelt)
6487 sel[i] = 3 * i + k;
6488 else
6489 sel[i] = 0;
6490 indices.new_vector (sel, 2, nelt);
6491 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6493 for (i = 0, j = 0; i < nelt; i++)
6494 if (3 * i + k < 2 * nelt)
6495 sel[i] = i;
6496 else
6497 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6498 indices.new_vector (sel, 2, nelt);
6499 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6501 first_vect = dr_chain[0];
6502 second_vect = dr_chain[1];
6504 /* Create interleaving stmt (low part of):
6505         low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6506 ...}> */
6507 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6508 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6509 second_vect, perm3_mask_low);
6510 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6512 /* Create interleaving stmt (high part of):
6513         high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6514 ...}> */
6515 first_vect = data_ref;
6516 second_vect = dr_chain[2];
6517 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6518 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6519 second_vect, perm3_mask_high);
6520 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6521 (*result_chain)[k] = data_ref;
6524 else
6526 /* If length is not equal to 3 then only power of 2 is supported. */
6527 gcc_assert (pow2p_hwi (length));
6529 /* The encoding has a single stepped pattern. */
6530 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6531 vec_perm_builder sel (nelt, 1, 3);
6532 sel.quick_grow (3);
6533 for (i = 0; i < 3; ++i)
6534 sel[i] = i * 2;
6535 vec_perm_indices indices (sel, 2, nelt);
6536 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6538 for (i = 0; i < 3; ++i)
6539 sel[i] = i * 2 + 1;
6540 indices.new_vector (sel, 2, nelt);
6541 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6543 for (i = 0; i < log_length; i++)
6545 for (j = 0; j < length; j += 2)
6547 first_vect = dr_chain[j];
6548 second_vect = dr_chain[j+1];
6550 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6551 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6552 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6553 first_vect, second_vect,
6554 perm_mask_even);
6555 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6556 (*result_chain)[j/2] = data_ref;
6558 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6559 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6560 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6561 first_vect, second_vect,
6562 perm_mask_odd);
6563 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6564 (*result_chain)[j/2+length/2] = data_ref;
6566 memcpy (dr_chain.address (), result_chain->address (),
6567 length * sizeof (tree));
6572 /* Function vect_shift_permute_load_chain.
6574 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6575 sequence of stmts to reorder the input data accordingly.
6576 Return the final references for loads in RESULT_CHAIN.
6577    Return true if successful, false otherwise.
6579 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6580 The input is 3 vectors each containing 8 elements. We assign a
6581 number to each element, the input sequence is:
6583 1st vec: 0 1 2 3 4 5 6 7
6584 2nd vec: 8 9 10 11 12 13 14 15
6585 3rd vec: 16 17 18 19 20 21 22 23
6587 The output sequence should be:
6589 1st vec: 0 3 6 9 12 15 18 21
6590 2nd vec: 1 4 7 10 13 16 19 22
6591 3rd vec: 2 5 8 11 14 17 20 23
6593 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6595    First we shuffle all 3 vectors to get the correct element order:
6597 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6598 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6599 3rd vec: (16 19 22) (17 20 23) (18 21)
6601 Next we unite and shift vector 3 times:
6603 1st step:
6604 shift right by 6 the concatenation of:
6605 "1st vec" and "2nd vec"
6606 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6607 "2nd vec" and "3rd vec"
6608 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6609 "3rd vec" and "1st vec"
6610 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6611 | New vectors |
6613 So that now new vectors are:
6615 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6616 2nd vec: (10 13) (16 19 22) (17 20 23)
6617 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6619 2nd step:
6620 shift right by 5 the concatenation of:
6621 "1st vec" and "3rd vec"
6622 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6623 "2nd vec" and "1st vec"
6624 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6625 "3rd vec" and "2nd vec"
6626 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6627 | New vectors |
6629 So that now new vectors are:
6631 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6632 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6633 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6635 3rd step:
6636 shift right by 5 the concatenation of:
6637 "1st vec" and "1st vec"
6638 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6639 shift right by 3 the concatenation of:
6640 "2nd vec" and "2nd vec"
6641 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6642 | New vectors |
6644 So that now all vectors are READY:
6645 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6646 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6647 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6649 This algorithm is faster than one in vect_permute_load_chain if:
6650 1. "shift of a concatination" is faster than general permutation.
6651 This is usually so.
6652 2. The TARGET machine can't execute vector instructions in parallel.
6653       This is because each step of the algorithm depends on the previous one.
6654 The algorithm in vect_permute_load_chain is much more parallel.
6656 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6659 static bool
6660 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6661 unsigned int length,
6662 stmt_vec_info stmt_info,
6663 gimple_stmt_iterator *gsi,
6664 vec<tree> *result_chain)
6666 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6667 tree perm2_mask1, perm2_mask2, perm3_mask;
6668 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6669 gimple *perm_stmt;
6671 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6672 machine_mode vmode = TYPE_MODE (vectype);
6673 unsigned int i;
6674 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6676 unsigned HOST_WIDE_INT nelt, vf;
6677 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6678 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6679 /* Not supported for variable-length vectors. */
6680 return false;
6682 vec_perm_builder sel (nelt, nelt, 1);
6683 sel.quick_grow (nelt);
6685 result_chain->quick_grow (length);
6686 memcpy (result_chain->address (), dr_chain.address (),
6687 length * sizeof (tree));
6689 if (pow2p_hwi (length) && vf > 4)
6691 unsigned int j, log_length = exact_log2 (length);
6692 for (i = 0; i < nelt / 2; ++i)
6693 sel[i] = i * 2;
6694 for (i = 0; i < nelt / 2; ++i)
6695 sel[nelt / 2 + i] = i * 2 + 1;
6696 vec_perm_indices indices (sel, 2, nelt);
6697 if (!can_vec_perm_const_p (vmode, vmode, indices))
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6701 "shuffle of 2 fields structure is not \
6702 supported by target\n");
6703 return false;
6705 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6707 for (i = 0; i < nelt / 2; ++i)
6708 sel[i] = i * 2 + 1;
6709 for (i = 0; i < nelt / 2; ++i)
6710 sel[nelt / 2 + i] = i * 2;
6711 indices.new_vector (sel, 2, nelt);
6712 if (!can_vec_perm_const_p (vmode, vmode, indices))
6714 if (dump_enabled_p ())
6715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6716 "shuffle of 2 fields structure is not \
6717 supported by target\n");
6718 return false;
6720 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6722 /* Generating permutation constant to shift all elements.
6723 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6724 for (i = 0; i < nelt; i++)
6725 sel[i] = nelt / 2 + i;
6726 indices.new_vector (sel, 2, nelt);
6727 if (!can_vec_perm_const_p (vmode, vmode, indices))
6729 if (dump_enabled_p ())
6730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6731 "shift permutation is not supported by target\n");
6732 return false;
6734 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6736 /* Generating permutation constant to select vector from 2.
6737 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6738 for (i = 0; i < nelt / 2; i++)
6739 sel[i] = i;
6740 for (i = nelt / 2; i < nelt; i++)
6741 sel[i] = nelt + i;
6742 indices.new_vector (sel, 2, nelt);
6743 if (!can_vec_perm_const_p (vmode, vmode, indices))
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6747 "select is not supported by target\n");
6748 return false;
6750 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6752 for (i = 0; i < log_length; i++)
6754 for (j = 0; j < length; j += 2)
6756 first_vect = dr_chain[j];
6757 second_vect = dr_chain[j + 1];
6759 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6760 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6761 first_vect, first_vect,
6762 perm2_mask1);
6763 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6764 vect[0] = data_ref;
6766 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6767 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6768 second_vect, second_vect,
6769 perm2_mask2);
6770 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6771 vect[1] = data_ref;
6773 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6774 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6775 vect[0], vect[1], shift1_mask);
6776 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6777 (*result_chain)[j/2 + length/2] = data_ref;
6779 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6780 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6781 vect[0], vect[1], select_mask);
6782 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6783 (*result_chain)[j/2] = data_ref;
6785 memcpy (dr_chain.address (), result_chain->address (),
6786 length * sizeof (tree));
6788 return true;
6790 if (length == 3 && vf > 2)
6792 unsigned int k = 0, l = 0;
6794       /* Generating permutation constant to get all elements in right order.
6795 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6796 for (i = 0; i < nelt; i++)
6798 if (3 * k + (l % 3) >= nelt)
6800 k = 0;
6801 l += (3 - (nelt % 3));
6803 sel[i] = 3 * k + (l % 3);
6804 k++;
6806 vec_perm_indices indices (sel, 2, nelt);
6807 if (!can_vec_perm_const_p (vmode, vmode, indices))
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "shuffle of 3 fields structure is not \
6812 supported by target\n");
6813 return false;
6815 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6817 /* Generating permutation constant to shift all elements.
6818 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6819 for (i = 0; i < nelt; i++)
6820 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6821 indices.new_vector (sel, 2, nelt);
6822 if (!can_vec_perm_const_p (vmode, vmode, indices))
6824 if (dump_enabled_p ())
6825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6826 "shift permutation is not supported by target\n");
6827 return false;
6829 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6831 /* Generating permutation constant to shift all elements.
6832 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6833 for (i = 0; i < nelt; i++)
6834 sel[i] = 2 * (nelt / 3) + 1 + i;
6835 indices.new_vector (sel, 2, nelt);
6836 if (!can_vec_perm_const_p (vmode, vmode, indices))
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6840 "shift permutation is not supported by target\n");
6841 return false;
6843 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6845 /* Generating permutation constant to shift all elements.
6846 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6847 for (i = 0; i < nelt; i++)
6848 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6849 indices.new_vector (sel, 2, nelt);
6850 if (!can_vec_perm_const_p (vmode, vmode, indices))
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "shift permutation is not supported by target\n");
6855 return false;
6857 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6859 /* Generating permutation constant to shift all elements.
6860 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6861 for (i = 0; i < nelt; i++)
6862 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6863 indices.new_vector (sel, 2, nelt);
6864 if (!can_vec_perm_const_p (vmode, vmode, indices))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "shift permutation is not supported by target\n");
6869 return false;
6871 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6873 for (k = 0; k < 3; k++)
6875 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6876 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6877 dr_chain[k], dr_chain[k],
6878 perm3_mask);
6879 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6880 vect[k] = data_ref;
6883 for (k = 0; k < 3; k++)
6885 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6886 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6887 vect[k % 3], vect[(k + 1) % 3],
6888 shift1_mask);
6889 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6890 vect_shift[k] = data_ref;
6893 for (k = 0; k < 3; k++)
6895 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6896 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6897 vect_shift[(4 - k) % 3],
6898 vect_shift[(3 - k) % 3],
6899 shift2_mask);
6900 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6901 vect[k] = data_ref;
6904 (*result_chain)[3 - (nelt % 3)] = vect[2];
6906 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6907 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6908 vect[0], shift3_mask);
6909 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6910 (*result_chain)[nelt % 3] = data_ref;
6912 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6913 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6914 vect[1], shift4_mask);
6915 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6916 (*result_chain)[0] = data_ref;
6917 return true;
6919 return false;
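/* Worked example for the shift selectors built above (illustrative only;
   the helper below is hypothetical and not used by the vectorizer).  For
   NELT == 8 it reproduces the values quoted in the comments:
   shift1 = {6 7 8 9 10 11 12 13}, shift2 = {5 .. 12},
   shift3 = {3 .. 10} and shift4 = {5 .. 12}.  */

static void
example_shift_selectors_for_nelt_8 (unsigned (&shift1)[8],
				    unsigned (&shift2)[8],
				    unsigned (&shift3)[8],
				    unsigned (&shift4)[8])
{
  const unsigned nelt = 8;
  for (unsigned i = 0; i < nelt; i++)
    {
      shift1[i] = 2 * (nelt / 3) + (nelt % 3) + i;	/* 6 + i  */
      shift2[i] = 2 * (nelt / 3) + 1 + i;		/* 5 + i  */
      shift3[i] = (nelt / 3) + (nelt % 3) / 2 + i;	/* 3 + i  */
      shift4[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;	/* 5 + i  */
    }
}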
6922 /* Function vect_transform_grouped_load.
6924 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6925 to perform their permutation and assign the resulting vectorized statements
6926 to the scalar statements.
6929 void
6930 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6931 vec<tree> dr_chain,
6932 int size, gimple_stmt_iterator *gsi)
6934 machine_mode mode;
6935 vec<tree> result_chain = vNULL;
6937 /* DR_CHAIN contains the input data-refs that are part of the interleaving.
6938 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6939 permuted vectors, which are ready for vector computation. */
6940 result_chain.create (size);
6942 /* If the reassociation width for the vector type is 2 or greater, the target
6943 machine can execute 2 or more vector instructions in parallel; in that case
6944 use vect_permute_load_chain, otherwise try vect_shift_permute_load_chain. */
6945 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6946 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6947 || pow2p_hwi (size)
6948 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6949 gsi, &result_chain))
6950 vect_permute_load_chain (vinfo, dr_chain,
6951 size, stmt_info, gsi, &result_chain);
6952 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6953 result_chain.release ();
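/* A minimal sketch of the selection heuristic above (the helper name is
   hypothetical and exists only for illustration): the generic permute
   chain is preferred whenever the target can issue more than one
   VEC_PERM_EXPR in parallel or SIZE is a power of two; only otherwise is
   the shift-based chain attempted, with the generic chain as fallback.  */

static bool
example_prefer_generic_permute_chain (int reassoc_width, int size)
{
  return reassoc_width > 1 || pow2p_hwi (size);
}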
6956 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6957 generated as part of the vectorization of STMT_INFO. Assign the statement
6958 for each vector to the associated scalar statement. */
6960 void
6961 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6962 vec<tree> result_chain)
6964 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6965 unsigned int i, gap_count;
6966 tree tmp_data_ref;
6968 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6969 Since we scan the chain starting from its first node, the order of the
6970 statements corresponds to the order of data-refs in RESULT_CHAIN. */
6971 stmt_vec_info next_stmt_info = first_stmt_info;
6972 gap_count = 1;
6973 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6975 if (!next_stmt_info)
6976 break;
6978 /* Skip the gaps. Loads created for the gaps will be removed later by the
6979 dead code elimination pass. No need to check for the first stmt in
6980 the group, since it always exists.
6981 DR_GROUP_GAP is the number of steps in elements from the previous
6982 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6983 correspond to the gaps. */
6984 if (next_stmt_info != first_stmt_info
6985 && gap_count < DR_GROUP_GAP (next_stmt_info))
6987 gap_count++;
6988 continue;
6991 /* ??? The following needs cleanup after the removal of
6992 DR_GROUP_SAME_DR_STMT. */
6993 if (next_stmt_info)
6995 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6996 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6997 copies, and we put the new vector statement last. */
6998 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7000 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7001 gap_count = 1;
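/* Worked example of the gap handling above (the scalar accesses are
   hypothetical): for a group that loads a[3*i] and a[3*i+2], the element
   a[3*i+1] is a gap, so DR_GROUP_GAP of the second statement is 2.
   RESULT_CHAIN still holds a vector for every position: the loop assigns
   result_chain[0] to the first statement, skips result_chain[1] because
   gap_count (1) < DR_GROUP_GAP (2), and then assigns result_chain[2] to
   the second statement.  */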
7006 /* Function vect_can_force_dr_alignment_p.
7008 Returns whether the alignment of a DECL can be forced to be aligned
7009 on an ALIGNMENT-bit boundary. */
7011 bool
7012 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7014 if (!VAR_P (decl))
7015 return false;
7017 if (decl_in_symtab_p (decl)
7018 && !symtab_node::get (decl)->can_increase_alignment_p ())
7019 return false;
7021 if (TREE_STATIC (decl))
7022 return (known_le (alignment,
7023 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7024 else
7025 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
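/* Minimal usage sketch (the caller below is hypothetical): a pass that
   wants to peel or version for alignment would first check that the base
   declaration of the access may be over-aligned to the vector type's
   alignment.  TYPE_ALIGN is in bits, matching the ALIGNMENT argument.  */

static bool
example_can_overalign_base (tree base_decl, tree vectype)
{
  return vect_can_force_dr_alignment_p (base_decl, TYPE_ALIGN (vectype));
}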
7028 /* Return whether the data reference DR_INFO is supported with respect to its
7029 alignment when accessed using vector type VECTYPE.
7030 MISALIGNMENT is the known misalignment of the access: 0 if the access is
7031 known to be aligned, DR_MISALIGNMENT_UNKNOWN if the misalignment is not
7032 known at compile time. */
7034 enum dr_alignment_support
7035 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7036 tree vectype, int misalignment)
7038 data_reference *dr = dr_info->dr;
7039 stmt_vec_info stmt_info = dr_info->stmt;
7040 machine_mode mode = TYPE_MODE (vectype);
7041 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7042 class loop *vect_loop = NULL;
7043 bool nested_in_vect_loop = false;
7045 if (misalignment == 0)
7046 return dr_aligned;
7048 /* For now assume all conditional loads/stores support unaligned
7049 access without any special code. */
7050 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7051 if (gimple_call_internal_p (stmt)
7052 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7053 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7054 return dr_unaligned_supported;
7056 if (loop_vinfo)
7058 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7059 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7062 /* Possibly unaligned access. */
7064 /* We can choose between using the implicit realignment scheme (generating
7065 a misaligned_move stmt) and the explicit realignment scheme (generating
7066 aligned loads with a REALIGN_LOAD). There are two variants of the
7067 explicit realignment scheme: optimized and unoptimized.
7068 We can optimize the realignment only if the step between consecutive
7069 vector loads is equal to the vector size. Since the vector memory
7070 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7071 is guaranteed that the misalignment amount remains the same throughout the
7072 execution of the vectorized loop. Therefore, we can create the
7073 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7074 at the loop preheader.
7076 However, in the case of outer-loop vectorization, when vectorizing a
7077 memory access in the inner-loop nested within the LOOP that is now being
7078 vectorized, while it is guaranteed that the misalignment of the
7079 vectorized memory access will remain the same in different outer-loop
7080 iterations, it is *not* guaranteed that it will remain the same throughout
7081 the execution of the inner-loop. This is because the inner-loop advances
7082 with the original scalar step (and not in steps of VS). If the inner-loop
7083 step happens to be a multiple of VS, then the misalignment remains fixed
7084 and we can use the optimized realignment scheme. For example:
7086 for (i=0; i<N; i++)
7087 for (j=0; j<M; j++)
7088 s += a[i+j];
7090 When vectorizing the i-loop in the above example, the step between
7091 consecutive vector loads is 1, and so the misalignment does not remain
7092 fixed across the execution of the inner-loop, and the realignment cannot
7093 be optimized (as illustrated in the following pseudo vectorized loop):
7095 for (i=0; i<N; i+=4)
7096 for (j=0; j<M; j++){
7097 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7098 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7099 // (assuming that we start from an aligned address).
7102 We therefore have to use the unoptimized realignment scheme:
7104 for (i=0; i<N; i+=4)
7105 for (j=k; j<M; j+=4)
7106 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7107 // that the misalignment of the initial address is
7108 // 0).
7110 The loop can then be vectorized as follows:
7112 for (k=0; k<4; k++){
7113 rt = get_realignment_token (&vp[k]);
7114 for (i=0; i<N; i+=4){
7115 v1 = vp[i+k];
7116 for (j=k; j<M; j+=4){
7117 v2 = vp[i+j+VS-1];
7118 va = REALIGN_LOAD <v1,v2,rt>;
7119 vs += va;
7120 v1 = v2;
7123 } */
7125 if (DR_IS_READ (dr))
7127 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7128 && (!targetm.vectorize.builtin_mask_for_load
7129 || targetm.vectorize.builtin_mask_for_load ()))
7131 /* If we are doing SLP then the accesses need not have the
7132 same alignment; instead it depends on the SLP group size. */
7133 if (loop_vinfo
7134 && STMT_SLP_TYPE (stmt_info)
7135 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7136 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7137 * (DR_GROUP_SIZE
7138 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7139 TYPE_VECTOR_SUBPARTS (vectype))))
7141 else if (!loop_vinfo
7142 || (nested_in_vect_loop
7143 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7144 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7145 return dr_explicit_realign;
7146 else
7147 return dr_explicit_realign_optimized;
7151 bool is_packed = false;
7152 tree type = TREE_TYPE (DR_REF (dr));
7153 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7154 is_packed = not_size_aligned (DR_REF (dr));
7155 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7156 is_packed))
7157 return dr_unaligned_supported;
7159 /* Unsupported. */
7160 return dr_unaligned_unsupported;
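/* Usage sketch (hypothetical helper, for illustration only): most callers
   of vect_supportable_dr_alignment only need to know whether the access
   can be vectorized at all with its current (mis)alignment.  */

static bool
example_dr_alignment_is_vectorizable (vec_info *vinfo, dr_vec_info *dr_info,
				      tree vectype, int misalignment)
{
  return (vect_supportable_dr_alignment (vinfo, dr_info, vectype,
					 misalignment)
	  != dr_unaligned_unsupported);
}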