gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
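/* For example, with COUNT == 2 and vector mode V4SImode this asks whether
   the target provides a vec_load_lanes/vec_store_lanes pattern for the
   corresponding two-vector tuple mode (on AArch64 that is OImode, mapping
   to the LD2/ST2 instructions).  */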
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
 204 	      /* For stores use the type of the stored value.  */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* If there is a loop invariant read involved we might vectorize it in
 286      the prologue, breaking scalar order with respect to the in-loop store.  */
287 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
288 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
289 return false;
291 /* STMT_A and STMT_B belong to overlapping groups. All loads are
292 emitted at the position of the first scalar load.
293 Stores in a group are emitted at the position of the last scalar store.
294 Compute that position and check whether the resulting order matches
295 the current one. */
296 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
297 if (il_a)
299 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
300 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
301 s = DR_GROUP_NEXT_ELEMENT (s))
302 il_a = get_later_stmt (il_a, s);
303 else /* DR_IS_READ */
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 if (get_later_stmt (il_a, s) == il_a)
307 il_a = s;
309 else
310 il_a = stmtinfo_a;
311 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
312 if (il_b)
314 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
315 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
316 s = DR_GROUP_NEXT_ELEMENT (s))
317 il_b = get_later_stmt (il_b, s);
318 else /* DR_IS_READ */
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 if (get_later_stmt (il_b, s) == il_b)
322 il_b = s;
324 else
325 il_b = stmtinfo_b;
326 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
327 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
330 /* A subroutine of vect_analyze_data_ref_dependence. Handle
331 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
332 distances. These distances are conservatively correct but they don't
333 reflect a guaranteed dependence.
335 Return true if this function does all the work necessary to avoid
336 an alias or false if the caller should use the dependence distances
337 to limit the vectorization factor in the usual way. LOOP_DEPTH is
338 the depth of the loop described by LOOP_VINFO and the other arguments
339 are as for vect_analyze_data_ref_dependence. */
341 static bool
342 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 int loop_depth, unsigned int *max_vf)
346 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
347 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
349 int dist = dist_v[loop_depth];
350 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
352 /* If the user asserted safelen >= DIST consecutive iterations
353 can be executed concurrently, assume independence.
355 ??? An alternative would be to add the alias check even
356 in this case, and vectorize the fallback loop with the
357 maximum VF set to safelen. However, if the user has
358 explicitly given a length, it's less likely that that
359 would be a win. */
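	  /* loop->safelen typically comes from a '#pragma omp simd safelen(N)'
	     clause; '#pragma GCC ivdep' sets it to INT_MAX.  */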
360 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
362 if ((unsigned int) loop->safelen < *max_vf)
363 *max_vf = loop->safelen;
364 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
365 continue;
368 /* For dependence distances of 2 or more, we have the option
369 of limiting VF or checking for an alias at runtime.
370 Prefer to check at runtime if we can, to avoid limiting
371 the VF unnecessarily when the bases are in fact independent.
373 Note that the alias checks will be removed if the VF ends up
374 being small enough. */
375 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
376 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
377 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
378 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
379 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
382 return true;
386 /* Function vect_analyze_data_ref_dependence.
388 FIXME: I needed to change the sense of the returned flag.
390 Return FALSE if there (might) exist a dependence between a memory-reference
391 DRA and a memory-reference DRB. When versioning for alias may check a
392 dependence at run-time, return TRUE. Adjust *MAX_VF according to
393 the data dependence. */
395 static opt_result
396 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
397 loop_vec_info loop_vinfo,
398 unsigned int *max_vf)
400 unsigned int i;
401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
402 struct data_reference *dra = DDR_A (ddr);
403 struct data_reference *drb = DDR_B (ddr);
404 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
405 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
406 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
407 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
408 lambda_vector dist_v;
409 unsigned int loop_depth;
411 /* If user asserted safelen consecutive iterations can be
412 executed concurrently, assume independence. */
413 auto apply_safelen = [&]()
415 if (loop->safelen >= 2)
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return true;
422 return false;
425 /* In loop analysis all data references should be vectorizable. */
426 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
427 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
428 gcc_unreachable ();
430 /* Independent data accesses. */
431 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
432 return opt_result::success ();
434 if (dra == drb
435 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
436 return opt_result::success ();
438 /* We do not have to consider dependences between accesses that belong
439 to the same group, unless the stride could be smaller than the
440 group size. */
441 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
442 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
443 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
444 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
445 return opt_result::success ();
447 /* Even if we have an anti-dependence then, as the vectorized loop covers at
448 least two scalar iterations, there is always also a true dependence.
449 As the vectorizer does not re-order loads and stores we can ignore
450 the anti-dependence if TBAA can disambiguate both DRs similar to the
451 case with known negative distance anti-dependences (positive
452 distance anti-dependences would violate TBAA constraints). */
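  /* For example, under strict aliasing a read through a 'float *' and a
     write through a 'long *' have non-conflicting alias sets, so such an
     anti-dependence can be ignored.  */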
453 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
454 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
455 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
456 get_alias_set (DR_REF (drb))))
457 return opt_result::success ();
459 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
460 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
462 if (apply_safelen ())
463 return opt_result::success ();
465 return opt_result::failure_at
466 (stmtinfo_a->stmt,
467 "possible alias involving gather/scatter between %T and %T\n",
468 DR_REF (dra), DR_REF (drb));
471 /* Unknown data dependence. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
474 if (apply_safelen ())
475 return opt_result::success ();
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
479 "versioning for alias required: "
480 "can't determine dependence between %T and %T\n",
481 DR_REF (dra), DR_REF (drb));
483 /* Add to list of ddrs that need to be tested at run-time. */
484 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
487 /* Known data dependence. */
488 if (DDR_NUM_DIST_VECTS (ddr) == 0)
490 if (apply_safelen ())
491 return opt_result::success ();
493 if (dump_enabled_p ())
494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 "versioning for alias required: "
496 "bad dist vector for %T and %T\n",
497 DR_REF (dra), DR_REF (drb));
498 /* Add to list of ddrs that need to be tested at run-time. */
499 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
502 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
504 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
505 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
506 loop_depth, max_vf))
507 return opt_result::success ();
509 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
511 int dist = dist_v[loop_depth];
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance = %d.\n", dist);
517 if (dist == 0)
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance == 0 between %T and %T\n",
522 DR_REF (dra), DR_REF (drb));
524 /* When we perform grouped accesses and perform implicit CSE
525 by detecting equal accesses and doing disambiguation with
526 runtime alias tests like for
527 .. = a[i];
528 .. = a[i+1];
529 a[i] = ..;
530 a[i+1] = ..;
531 *p = ..;
532 .. = a[i];
533 .. = a[i+1];
534 where we will end up loading { a[i], a[i+1] } once, make
535 sure that inserting group loads before the first load and
536 stores after the last store will do the right thing.
537 Similar for groups like
538 a[i] = ...;
539 ... = a[i];
540 a[i+1] = ...;
541 where loads from the group interleave with the store. */
542 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
543 return opt_result::failure_at (stmtinfo_a->stmt,
544 "READ_WRITE dependence"
545 " in interleaving.\n");
547 if (loop->safelen < 2)
549 tree indicator = dr_zero_step_indicator (dra);
550 if (!indicator || integer_zerop (indicator))
551 return opt_result::failure_at (stmtinfo_a->stmt,
552 "access also has a zero step\n");
553 else if (TREE_CODE (indicator) != INTEGER_CST)
554 vect_check_nonzero_value (loop_vinfo, indicator);
556 continue;
559 if (dist > 0 && DDR_REVERSED_P (ddr))
561 /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 reversed (to make distance vector positive), and the actual
563 distance is negative. */
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "dependence distance negative.\n");
567 /* When doing outer loop vectorization, we need to check if there is
568 a backward dependence at the inner loop level if the dependence
569 at the outer loop is reversed. See PR81740. */
570 if (nested_in_vect_loop_p (loop, stmtinfo_a)
571 || nested_in_vect_loop_p (loop, stmtinfo_b))
573 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 DDR_LOOP_NEST (ddr));
575 if (dist_v[inner_depth] < 0)
576 return opt_result::failure_at (stmtinfo_a->stmt,
577 "not vectorized, dependence "
578 "between data-refs %T and %T\n",
579 DR_REF (dra), DR_REF (drb));
581 /* Record a negative dependence distance to later limit the
582 amount of stmt copying / unrolling we can perform.
583 Only need to handle read-after-write dependence. */
584 if (DR_IS_READ (drb)
585 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
586 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
587 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
588 continue;
591 unsigned int abs_dist = abs (dist);
592 if (abs_dist >= 2 && abs_dist < *max_vf)
594 /* The dependence distance requires reduction of the maximal
595 vectorization factor. */
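	  /* For example, for a loop containing a[i] = a[i - 3] + 1 the
	     dependence distance is 3: only three consecutive iterations are
	     independent, so the maximal VF is clamped to 3 here.  */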
596 *max_vf = abs_dist;
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "adjusting maximal vectorization factor to %i\n",
600 *max_vf);
603 if (abs_dist >= *max_vf)
605 /* Dependence distance does not create dependence, as far as
606 vectorization is concerned, in this case. */
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "dependence distance >= VF.\n");
610 continue;
613 return opt_result::failure_at (stmtinfo_a->stmt,
614 "not vectorized, possible dependence "
615 "between data-refs %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
619 return opt_result::success ();
 622 /* Function vect_analyze_early_break_dependences.
 624    Examine all the data references in the loop and make sure that if we have
 625    multiple exits that we are able to safely move stores such that they become
626 safe for vectorization. The function also calculates the place where to move
627 the instructions to and computes what the new vUSE chain should be.
629 This works in tandem with the CFG that will be produced by
630 slpeel_tree_duplicate_loop_to_edge_cfg later on.
632 This function tries to validate whether an early break vectorization
 633    is possible for the current instruction sequence.  Returns True if
 634    possible, otherwise False.
636 Requirements:
637 - Any memory access must be to a fixed size buffer.
638 - There must not be any loads and stores to the same object.
639 - Multiple loads are allowed as long as they don't alias.
641 NOTE:
 642      This implementation is very conservative.  Any overlapping loads/stores
 643      that take place before the early break statement get rejected, aside from
644 WAR dependencies.
646 i.e.:
648 a[i] = 8
649 c = a[i]
650 if (b[i])
653 is not allowed, but
655 c = a[i]
656 a[i] = 8
657 if (b[i])
 660    is, which is the common case.  */
662 static opt_result
663 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
665 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
667 /* List of all load data references found during traversal. */
668 auto_vec<data_reference *> bases;
669 basic_block dest_bb = NULL;
671 hash_set <gimple *> visited;
672 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
673 class loop *loop_nest = loop_outer (loop);
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "loop contains multiple exits, analyzing"
678 " statement dependencies.\n");
680 /* Since we don't support general control flow, the location we'll move the
681 side-effects to is always the latch connected exit. When we support
682 general control flow we can do better but for now this is fine. */
683 dest_bb = single_pred (loop->latch);
684 basic_block bb = dest_bb;
688 /* If the destination block is also the header then we have nothing to do. */
689 if (!single_pred_p (bb))
690 continue;
692 bb = single_pred (bb);
693 gimple_stmt_iterator gsi = gsi_last_bb (bb);
695 /* Now analyze all the remaining statements and try to determine which
696 instructions are allowed/needed to be moved. */
697 while (!gsi_end_p (gsi))
699 gimple *stmt = gsi_stmt (gsi);
700 gsi_prev (&gsi);
701 if (!gimple_has_ops (stmt)
702 || is_gimple_debug (stmt))
703 continue;
705 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
706 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
707 if (!dr_ref)
708 continue;
710 /* We currently only support statically allocated objects due to
711 not having first-faulting loads support or peeling for
712 alignment support. Compute the size of the referenced object
713 (it could be dynamically allocated). */
714 tree obj = DR_BASE_ADDRESS (dr_ref);
715 if (!obj || TREE_CODE (obj) != ADDR_EXPR)
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "early breaks only supported on statically"
720 " allocated objects.\n");
721 return opt_result::failure_at (stmt,
722 "can't safely apply code motion to "
723 "dependencies of %G to vectorize "
724 "the early exit.\n", stmt);
727 tree refop = TREE_OPERAND (obj, 0);
728 tree refbase = get_base_address (refop);
729 if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
730 || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
732 if (dump_enabled_p ())
733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
734 "early breaks only supported on"
735 " statically allocated objects.\n");
736 return opt_result::failure_at (stmt,
737 "can't safely apply code motion to "
738 "dependencies of %G to vectorize "
739 "the early exit.\n", stmt);
 742       /* Check if vector accesses to the object will be within bounds.
 743 	 The object size must be a constant, or we must assume the loop will be
 744 	 versioned or niters bounded by VF so that accesses are within range.  */
745 if (!ref_within_array_bound (stmt, DR_REF (dr_ref)))
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
749 "early breaks not supported: vectorization "
750 "would %s beyond size of obj.",
751 DR_IS_READ (dr_ref) ? "read" : "write");
752 return opt_result::failure_at (stmt,
753 "can't safely apply code motion to "
754 "dependencies of %G to vectorize "
755 "the early exit.\n", stmt);
758 if (DR_IS_READ (dr_ref))
759 bases.safe_push (dr_ref);
760 else if (DR_IS_WRITE (dr_ref))
762 /* We are moving writes down in the CFG. To be sure that this
763 is valid after vectorization we have to check all the loads
764 we are sinking the stores past to see if any of them may
765 alias or are the same object.
767 Same objects will not be an issue because unless the store
768 is marked volatile the value can be forwarded. If the
769 store is marked volatile we don't vectorize the loop
770 anyway.
772 That leaves the check for aliasing. We don't really need
773 to care about the stores aliasing with each other since the
774 stores are moved in order so the effects are still observed
775 correctly. This leaves the check for WAR dependencies
776 which we would be introducing here if the DR can alias.
777 The check is quadratic in loads/stores but I have not found
778 a better API to do this. I believe all loads and stores
779 must be checked. We also must check them when we
780 encountered the store, since we don't care about loads past
781 the store. */
783 for (auto dr_read : bases)
784 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
786 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
788 vect_location,
789 "early breaks not supported: "
790 "overlapping loads and stores "
791 "found before the break "
792 "statement.\n");
794 return opt_result::failure_at (stmt,
795 "can't safely apply code motion to dependencies"
796 " to vectorize the early exit. %G may alias with"
797 " %G\n", stmt, dr_read->stmt);
801 if (gimple_vdef (stmt))
803 if (dump_enabled_p ())
804 dump_printf_loc (MSG_NOTE, vect_location,
805 "==> recording stmt %G", stmt);
807 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
809 else if (gimple_vuse (stmt))
811 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
812 if (dump_enabled_p ())
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "marked statement for vUSE update: %G", stmt);
818 while (bb != loop->header);
820 /* We don't allow outer -> inner loop transitions which should have been
821 trapped already during loop form analysis. */
822 gcc_assert (dest_bb->loop_father == loop);
824 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
826 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
828 /* All uses shall be updated to that of the first load. Entries are
829 stored in reverse order. */
830 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
831 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE, vect_location,
835 "will update use: %T, mem_ref: %G", vuse, g);
839 if (dump_enabled_p ())
840 dump_printf_loc (MSG_NOTE, vect_location,
841 "recorded statements to be moved to BB %d\n",
842 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
844 return opt_result::success ();
847 /* Function vect_analyze_data_ref_dependences.
849 Examine all the data references in the loop, and make sure there do not
850 exist any data dependences between them. Set *MAX_VF according to
851 the maximum vectorization factor the data dependences allow. */
853 opt_result
854 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
855 unsigned int *max_vf)
857 unsigned int i;
858 struct data_dependence_relation *ddr;
860 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
862 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
864 LOOP_VINFO_DDRS (loop_vinfo)
865 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
866 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
867 /* We do not need read-read dependences. */
868 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
869 &LOOP_VINFO_DDRS (loop_vinfo),
870 LOOP_VINFO_LOOP_NEST (loop_vinfo),
871 false);
872 gcc_assert (res);
875 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
877 /* For epilogues we either have no aliases or alias versioning
878 was applied to original loop. Therefore we may just get max_vf
879 using VF of original loop. */
880 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
881 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
882 else
883 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
885 opt_result res
886 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
887 if (!res)
888 return res;
 891   /* If we have early break statements in the loop, check to see if they
 892      are of a form we can vectorize.  */
893 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
894 return vect_analyze_early_break_dependences (loop_vinfo);
896 return opt_result::success ();
900 /* Function vect_slp_analyze_data_ref_dependence.
902 Return TRUE if there (might) exist a dependence between a memory-reference
903 DRA and a memory-reference DRB for VINFO. When versioning for alias
904 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
905 according to the data dependence. */
907 static bool
908 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
909 struct data_dependence_relation *ddr)
911 struct data_reference *dra = DDR_A (ddr);
912 struct data_reference *drb = DDR_B (ddr);
913 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
914 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
916 /* We need to check dependences of statements marked as unvectorizable
917 as well, they still can prohibit vectorization. */
919 /* Independent data accesses. */
920 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
921 return false;
923 if (dra == drb)
924 return false;
926 /* Read-read is OK. */
927 if (DR_IS_READ (dra) && DR_IS_READ (drb))
928 return false;
930 /* If dra and drb are part of the same interleaving chain consider
931 them independent. */
932 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
933 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
934 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
935 return false;
937 /* Unknown data dependence. */
938 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
940 if (dump_enabled_p ())
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "can't determine dependence between %T and %T\n",
943 DR_REF (dra), DR_REF (drb));
945 else if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "determined dependence between %T and %T\n",
948 DR_REF (dra), DR_REF (drb));
950 return true;
954 /* Analyze dependences involved in the transform of a store SLP NODE. */
956 static bool
957 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
959 /* This walks over all stmts involved in the SLP store done
960 in NODE verifying we can sink them up to the last stmt in the
961 group. */
962 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
963 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
965 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
967 stmt_vec_info access_info
968 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
969 if (access_info == last_access_info)
970 continue;
971 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
972 ao_ref ref;
973 bool ref_initialized_p = false;
974 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
975 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
977 gimple *stmt = gsi_stmt (gsi);
978 if (! gimple_vuse (stmt))
979 continue;
981 /* If we couldn't record a (single) data reference for this
982 stmt we have to resort to the alias oracle. */
983 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
984 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
985 if (!dr_b)
987 /* We are moving a store - this means
988 we cannot use TBAA for disambiguation. */
989 if (!ref_initialized_p)
990 ao_ref_init (&ref, DR_REF (dr_a));
991 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
992 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
993 return false;
994 continue;
997 gcc_assert (!gimple_visited_p (stmt));
999 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1000 dr_b, vNULL);
1001 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1002 free_dependence_relation (ddr);
1003 if (dependent)
1004 return false;
1007 return true;
1010 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1011 contain the vector of scalar stores of this instance if we are
1012 disambiguating the loads. */
1014 static bool
1015 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1016 vec<stmt_vec_info> stores,
1017 stmt_vec_info last_store_info)
1019 /* This walks over all stmts involved in the SLP load done
1020 in NODE verifying we can hoist them up to the first stmt in the
1021 group. */
1022 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1023 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1025 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1027 stmt_vec_info access_info
1028 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1029 if (access_info == first_access_info)
1030 continue;
1031 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1032 ao_ref ref;
1033 bool ref_initialized_p = false;
1034 hash_set<stmt_vec_info> grp_visited;
1035 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1036 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1038 gimple *stmt = gsi_stmt (gsi);
1039 if (! gimple_vdef (stmt))
1040 continue;
1042 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1044 /* If we run into a store of this same instance (we've just
1045 marked those) then delay dependence checking until we run
1046 into the last store because this is where it will have
1047 been sunk to (and we verified that we can do that already). */
1048 if (gimple_visited_p (stmt))
1050 if (stmt_info != last_store_info)
1051 continue;
1053 for (stmt_vec_info &store_info : stores)
1055 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1056 ddr_p ddr = initialize_data_dependence_relation
1057 (dr_a, store_dr, vNULL);
1058 bool dependent
1059 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1060 free_dependence_relation (ddr);
1061 if (dependent)
1062 return false;
1064 continue;
1067 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1069 /* We are hoisting a load - this means we can use TBAA for
1070 disambiguation. */
1071 if (!ref_initialized_p)
1072 ao_ref_init (&ref, DR_REF (dr_a));
1073 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1075 /* If we couldn't record a (single) data reference for this
1076 stmt we have to give up now. */
1077 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1078 if (!dr_b)
1079 return false;
1080 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1081 dr_b, vNULL);
1082 bool dependent
1083 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1084 free_dependence_relation (ddr);
1085 if (dependent)
1086 return false;
1088 /* No dependence. */
1089 return true;
1091 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1093 /* When we run into a store group we have to honor
1094 that earlier stores might be moved here. We don't
1095 know exactly which and where to since we lack a
1096 back-mapping from DR to SLP node, so assume all
1097 earlier stores are sunk here. It's enough to
1098 consider the last stmt of a group for this.
1099 ??? Both this and the fact that we disregard that
1100 the conflicting instance might be removed later
1101 is overly conservative. */
1102 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1103 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1104 store_info != NULL;
1105 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1106 if ((store_info == stmt_info
1107 || get_later_stmt (store_info, stmt_info) == stmt_info)
1108 && !check_hoist (store_info))
1109 return false;
1111 else
1113 if (!check_hoist (stmt_info))
1114 return false;
1118 return true;
1122 /* Function vect_slp_analyze_instance_dependence.
1124    Examine all the data references in SLP instance INSTANCE and make sure
1125    there do not exist any data dependences that would prevent sinking its
1126    stores or hoisting its loads to the vectorized stmt insert location.  */
1128 bool
1129 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1131 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1133 /* The stores of this instance are at the root of the SLP tree. */
1134 slp_tree store = NULL;
1135 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1136 store = SLP_INSTANCE_TREE (instance);
1138 /* Verify we can sink stores to the vectorized stmt insert location. */
1139 stmt_vec_info last_store_info = NULL;
1140 if (store)
1142 if (! vect_slp_analyze_store_dependences (vinfo, store))
1143 return false;
1145 /* Mark stores in this instance and remember the last one. */
1146 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1147 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1148 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1151 bool res = true;
1153 /* Verify we can sink loads to the vectorized stmt insert location,
1154 special-casing stores of this instance. */
1155 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1156 if (! vect_slp_analyze_load_dependences (vinfo, load,
1157 store
1158 ? SLP_TREE_SCALAR_STMTS (store)
1159 : vNULL, last_store_info))
1161 res = false;
1162 break;
1165 /* Unset the visited flag. */
1166 if (store)
1167 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1168 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1170 return res;
1173 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1174 applied. */
1177 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1179 HOST_WIDE_INT diff = 0;
1180 /* Alignment is only analyzed for the first element of a DR group,
1181 use that but adjust misalignment by the offset of the access. */
1182 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1184 dr_vec_info *first_dr
1185 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1186 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1187 INTEGER_CSTs and the first element in the group has the lowest
1188 address. */
1189 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1190 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1191 gcc_assert (diff >= 0);
1192 dr_info = first_dr;
1195 int misalign = dr_info->misalignment;
1196 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1197 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1198 return misalign;
1200 /* If the access is only aligned for a vector type with smaller alignment
1201 requirement the access has unknown misalignment. */
1202 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1203 targetm.vectorize.preferred_vector_alignment (vectype)))
1204 return DR_MISALIGNMENT_UNKNOWN;
1206 /* Apply the offset from the DR group start and the externally supplied
1207 offset which can for example result from a negative stride access. */
1208 poly_int64 misalignment = misalign + diff + offset;
1210 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1211 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1212 unsigned HOST_WIDE_INT target_alignment_c
1213 = dr_info->target_alignment.to_constant ();
1214 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1215 return DR_MISALIGNMENT_UNKNOWN;
1216 return misalign;
1219 /* Record the base alignment guarantee given by DRB, which occurs
1220 in STMT_INFO. */
1222 static void
1223 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1224 innermost_loop_behavior *drb)
1226 bool existed;
1227 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1228 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1229 if (!existed || entry.second->base_alignment < drb->base_alignment)
1231 entry = std::make_pair (stmt_info, drb);
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_NOTE, vect_location,
1234 "recording new base alignment for %T\n"
1235 " alignment: %d\n"
1236 " misalignment: %d\n"
1237 " based on: %G",
1238 drb->base_address,
1239 drb->base_alignment,
1240 drb->base_misalignment,
1241 stmt_info->stmt);
1245 /* If the region we're going to vectorize is reached, all unconditional
1246 data references occur at least once. We can therefore pool the base
1247 alignment guarantees from each unconditional reference. Do this by
1248 going through all the data references in VINFO and checking whether
1249 the containing statement makes the reference unconditionally. If so,
1250 record the alignment of the base address in VINFO so that it can be
1251 used for all other references with the same base. */
1253 void
1254 vect_record_base_alignments (vec_info *vinfo)
1256 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1257 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1258 for (data_reference *dr : vinfo->shared->datarefs)
1260 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1261 stmt_vec_info stmt_info = dr_info->stmt;
1262 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1263 && STMT_VINFO_VECTORIZABLE (stmt_info)
1264 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1266 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1268 /* If DR is nested in the loop that is being vectorized, we can also
1269 record the alignment of the base wrt the outer loop. */
1270 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1271 vect_record_base_alignment
1272 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1277 /* Function vect_compute_data_ref_alignment
1279 Compute the misalignment of the data reference DR_INFO when vectorizing
1280 with VECTYPE.
1282 Output:
1283 1. initialized misalignment info for DR_INFO
1285 FOR NOW: No analysis is actually performed. Misalignment is calculated
1286 only for trivial cases. TODO. */
1288 static void
1289 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1290 tree vectype)
1292 stmt_vec_info stmt_info = dr_info->stmt;
1293 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1294 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1295 class loop *loop = NULL;
1296 tree ref = DR_REF (dr_info->dr);
1298 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_NOTE, vect_location,
1300 "vect_compute_data_ref_alignment:\n");
1302 if (loop_vinfo)
1303 loop = LOOP_VINFO_LOOP (loop_vinfo);
1305 /* Initialize misalignment to unknown. */
1306 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1308 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1309 return;
1311 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1312 bool step_preserves_misalignment_p;
1314 poly_uint64 vector_alignment
1315 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1316 BITS_PER_UNIT);
1317 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1319 /* If the main loop has peeled for alignment we have no way of knowing
1320 whether the data accesses in the epilogues are aligned. We can't at
1321 compile time answer the question whether we have entered the main loop or
1322 not. Fixes PR 92351. */
1323 if (loop_vinfo)
1325 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1326 if (orig_loop_vinfo
1327 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1328 return;
1331 unsigned HOST_WIDE_INT vect_align_c;
1332 if (!vector_alignment.is_constant (&vect_align_c))
1333 return;
1335 /* No step for BB vectorization. */
1336 if (!loop)
1338 gcc_assert (integer_zerop (drb->step));
1339 step_preserves_misalignment_p = true;
1342 /* In case the dataref is in an inner-loop of the loop that is being
1343 vectorized (LOOP), we use the base and misalignment information
1344 relative to the outer-loop (LOOP). This is ok only if the misalignment
1345 stays the same throughout the execution of the inner-loop, which is why
1346 we have to check that the stride of the dataref in the inner-loop evenly
1347 divides by the vector alignment. */
1348 else if (nested_in_vect_loop_p (loop, stmt_info))
1350 step_preserves_misalignment_p
1351 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1353 if (dump_enabled_p ())
1355 if (step_preserves_misalignment_p)
1356 dump_printf_loc (MSG_NOTE, vect_location,
1357 "inner step divides the vector alignment.\n");
1358 else
1359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1360 "inner step doesn't divide the vector"
1361 " alignment.\n");
1365 /* Similarly we can only use base and misalignment information relative to
1366 an innermost loop if the misalignment stays the same throughout the
1367 execution of the loop. As above, this is the case if the stride of
1368 the dataref evenly divides by the alignment. */
1369 else
1371 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1372 step_preserves_misalignment_p
1373 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1375 if (!step_preserves_misalignment_p && dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "step doesn't divide the vector alignment.\n");
1380 unsigned int base_alignment = drb->base_alignment;
1381 unsigned int base_misalignment = drb->base_misalignment;
1383 /* Calculate the maximum of the pooled base address alignment and the
1384 alignment that we can compute for DR itself. */
1385 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1386 = base_alignments->get (drb->base_address);
1387 if (entry
1388 && base_alignment < (*entry).second->base_alignment
1389 && (loop_vinfo
1390 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1391 gimple_bb (entry->first->stmt))
1392 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1393 || (entry->first->dr_aux.group <= dr_info->group)))))
1395 base_alignment = entry->second->base_alignment;
1396 base_misalignment = entry->second->base_misalignment;
1399 if (drb->offset_alignment < vect_align_c
1400 || !step_preserves_misalignment_p
1401 /* We need to know whether the step wrt the vectorized loop is
1402 negative when computing the starting misalignment below. */
1403 || TREE_CODE (drb->step) != INTEGER_CST)
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407 "Unknown alignment for access: %T\n", ref);
1408 return;
1411 if (base_alignment < vect_align_c)
1413 unsigned int max_alignment;
1414 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1415 if (max_alignment < vect_align_c
1416 || !vect_can_force_dr_alignment_p (base,
1417 vect_align_c * BITS_PER_UNIT))
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_NOTE, vect_location,
1421 "can't force alignment of ref: %T\n", ref);
1422 return;
1425 /* Force the alignment of the decl.
1426 NOTE: This is the only change to the code we make during
1427 the analysis phase, before deciding to vectorize the loop. */
1428 if (dump_enabled_p ())
1429 dump_printf_loc (MSG_NOTE, vect_location,
1430 "force alignment of %T\n", ref);
1432 dr_info->base_decl = base;
1433 dr_info->base_misaligned = true;
1434 base_misalignment = 0;
1436 poly_int64 misalignment
1437 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1439 unsigned int const_misalignment;
1440 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1442 if (dump_enabled_p ())
1443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1444 "Non-constant misalignment for access: %T\n", ref);
1445 return;
1448 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452 "misalign = %d bytes of ref %T\n",
1453 const_misalignment, ref);
1455 return;
1458 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1459 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1460 is made aligned via peeling. */
1462 static bool
1463 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1464 dr_vec_info *dr_peel_info)
1466 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1467 DR_TARGET_ALIGNMENT (dr_info)))
1469 poly_offset_int diff
1470 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1471 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1472 if (known_eq (diff, 0)
1473 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1474 return true;
1476 return false;
1479 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1480 aligned via peeling. */
1482 static bool
1483 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1484 dr_vec_info *dr_peel_info)
1486 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1487 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1488 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1489 DR_OFFSET (dr_peel_info->dr), 0)
1490 || !operand_equal_p (DR_STEP (dr_info->dr),
1491 DR_STEP (dr_peel_info->dr), 0))
1492 return false;
1494 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1497 /* Compute the value for dr_info->misalign so that the access appears
1498 aligned. This is used by peeling to compensate for dr_misalignment
1499 applying the offset for negative step. */
1502 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1504 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1505 return 0;
1507 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1508 poly_int64 misalignment
1509 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1510 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1512 unsigned HOST_WIDE_INT target_alignment_c;
1513 int misalign;
1514 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1515 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1516 return DR_MISALIGNMENT_UNKNOWN;
1517 return misalign;
1520 /* Function vect_update_misalignment_for_peel.
1521 Sets DR_INFO's misalignment
1522 - to 0 if it has the same alignment as DR_PEEL_INFO,
1523    - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1524 - to -1 (unknown) otherwise.
1526 DR_INFO - the data reference whose misalignment is to be adjusted.
1527 DR_PEEL_INFO - the data reference whose misalignment is being made
1528 zero in the vector loop by the peel.
1529 NPEEL - the number of iterations in the peel loop if the misalignment
1530 of DR_PEEL_INFO is known at compile time. */
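/* For example, with a known step of 4 bytes, a misalignment of 4 and a
   target alignment of 16 bytes, peeling NPEEL == 3 iterations gives
   (4 + 3 * 4) & 15 == 0, i.e. the access becomes aligned.  */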
1532 static void
1533 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1534 dr_vec_info *dr_peel_info, int npeel)
1536   /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
1537 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1539 SET_DR_MISALIGNMENT (dr_info,
1540 vect_dr_misalign_for_aligned_access (dr_peel_info));
1541 return;
1544 unsigned HOST_WIDE_INT alignment;
1545 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1546 && known_alignment_for_access_p (dr_info,
1547 STMT_VINFO_VECTYPE (dr_info->stmt))
1548 && known_alignment_for_access_p (dr_peel_info,
1549 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1551 int misal = dr_info->misalignment;
1552 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1553 misal &= alignment - 1;
1554 set_dr_misalignment (dr_info, misal);
1555 return;
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1560 "to unknown (-1).\n");
1561 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1564 /* Return true if alignment is relevant for DR_INFO. */
1566 static bool
1567 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1569 stmt_vec_info stmt_info = dr_info->stmt;
1571 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1572 return false;
1574 /* For interleaving, only the alignment of the first access matters. */
1575 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1576 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1577 return false;
1579 /* Scatter-gather and invariant accesses continue to address individual
1580 scalars, so vector-level alignment is irrelevant. */
1581 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1582 || integer_zerop (DR_STEP (dr_info->dr)))
1583 return false;
1585 /* Strided accesses perform only component accesses, alignment is
1586 irrelevant for them. */
1587 if (STMT_VINFO_STRIDED_P (stmt_info)
1588 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1589 return false;
1591 return true;
1594 /* Given a memory reference EXP return whether its alignment is less
1595 than its size. */
1597 static bool
1598 not_size_aligned (tree exp)
1600 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1601 return true;
1603 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1604 > get_object_alignment (exp));
1607 /* Function vector_alignment_reachable_p
1609 Return true if vector alignment for DR_INFO is reachable by peeling
1610 a few loop iterations. Return false otherwise. */
1612 static bool
1613 vector_alignment_reachable_p (dr_vec_info *dr_info)
1615 stmt_vec_info stmt_info = dr_info->stmt;
1616 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1618 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1620 /* For interleaved access we peel only if number of iterations in
1621 the prolog loop ({VF - misalignment}), is a multiple of the
1622 number of the interleaved accesses. */
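      /* For example, with 8 elements per vector, a misalignment of 3 elements
	 and a group size of 2 the prolog would need 5 scalar iterations, which
	 is not a multiple of the group size, so peeling cannot reach
	 alignment.  */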
1623 int elem_size, mis_in_elements;
1625 /* FORNOW: handle only known alignment. */
1626 if (!known_alignment_for_access_p (dr_info, vectype))
1627 return false;
1629 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1630 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1631 elem_size = vector_element_size (vector_size, nelements);
1632 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1634 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1635 return false;
1638 /* If misalignment is known at the compile time then allow peeling
1639 only if natural alignment is reachable through peeling. */
1640 if (known_alignment_for_access_p (dr_info, vectype)
1641 && !aligned_access_p (dr_info, vectype))
1643 HOST_WIDE_INT elmsize =
1644 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1645 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_NOTE, vect_location,
1648 "data size = %wd. misalignment = %d.\n", elmsize,
1649 dr_misalignment (dr_info, vectype));
1651 if (dr_misalignment (dr_info, vectype) % elmsize)
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "data size does not divide the misalignment.\n");
1656 return false;
1660 if (!known_alignment_for_access_p (dr_info, vectype))
1662 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1663 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "Unknown misalignment, %snaturally aligned\n",
1667 is_packed ? "not " : "");
1668 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1671 return true;
1675 /* Calculate the cost of the memory access represented by DR_INFO. */
1677 static void
1678 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1679 dr_alignment_support alignment_support_scheme,
1680 int misalignment,
1681 unsigned int *inside_cost,
1682 unsigned int *outside_cost,
1683 stmt_vector_for_cost *body_cost_vec,
1684 stmt_vector_for_cost *prologue_cost_vec)
1686 stmt_vec_info stmt_info = dr_info->stmt;
1687 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1688 int ncopies;
1690 if (PURE_SLP_STMT (stmt_info))
1691 ncopies = 1;
1692 else
1693 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1695 if (DR_IS_READ (dr_info->dr))
1696 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1697 misalignment, true, inside_cost,
1698 outside_cost, prologue_cost_vec, body_cost_vec, false);
1699 else
1700 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1701 misalignment, inside_cost, body_cost_vec);
1703 if (dump_enabled_p ())
1704 dump_printf_loc (MSG_NOTE, vect_location,
1705 "vect_get_data_access_cost: inside_cost = %d, "
1706 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1710 typedef struct _vect_peel_info
1712 dr_vec_info *dr_info;
1713 int npeel;
1714 unsigned int count;
1715 } *vect_peel_info;
1717 typedef struct _vect_peel_extended_info
1719 vec_info *vinfo;
1720 struct _vect_peel_info peel_info;
1721 unsigned int inside_cost;
1722 unsigned int outside_cost;
1723 } *vect_peel_extended_info;
1726 /* Peeling hashtable helpers. */
1728 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1730 static inline hashval_t hash (const _vect_peel_info *);
1731 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1734 inline hashval_t
1735 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1737 return (hashval_t) peel_info->npeel;
1740 inline bool
1741 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1743 return (a->npeel == b->npeel);
1747 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1749 static void
1750 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1751 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1752 int npeel, bool supportable_if_not_aligned)
1754 struct _vect_peel_info elem, *slot;
1755 _vect_peel_info **new_slot;
1757 elem.npeel = npeel;
1758 slot = peeling_htab->find (&elem);
1759 if (slot)
1760 slot->count++;
1761 else
1763 slot = XNEW (struct _vect_peel_info);
1764 slot->npeel = npeel;
1765 slot->dr_info = dr_info;
1766 slot->count = 1;
1767 new_slot = peeling_htab->find_slot (slot, INSERT);
1768 *new_slot = slot;
1771 /* If this DR is not supported with unknown misalignment then bias
1772 this slot when the cost model is disabled. */
1773 if (!supportable_if_not_aligned
1774 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1775 slot->count += VECT_MAX_COST;
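 
 /* A sketch of how the table ends up populated (the dr_a/dr_b/dr_c names
    are hypothetical): inserting npeel values 2, 2 and 5 yields two slots,
    { npeel = 2, count = 2 } and { npeel = 5, count = 1 }, so the
    "most frequent" traversal below would pick npeel = 2.
 
      hash_table<peel_info_hasher> htab (1);
      vect_peeling_hash_insert (&htab, loop_vinfo, dr_a, 2, true);
      vect_peeling_hash_insert (&htab, loop_vinfo, dr_b, 2, true);
      vect_peeling_hash_insert (&htab, loop_vinfo, dr_c, 5, true);  */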
1779 /* Traverse peeling hash table to find peeling option that aligns maximum
1780 number of data accesses. */
1783 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1784 _vect_peel_extended_info *max)
1786 vect_peel_info elem = *slot;
1788 if (elem->count > max->peel_info.count
1789 || (elem->count == max->peel_info.count
1790 && max->peel_info.npeel > elem->npeel))
1792 max->peel_info.npeel = elem->npeel;
1793 max->peel_info.count = elem->count;
1794 max->peel_info.dr_info = elem->dr_info;
1797 return 1;
1800 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1801 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1802 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1803 after peeling. */
1805 static void
1806 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1807 dr_vec_info *dr0_info,
1808 unsigned int *inside_cost,
1809 unsigned int *outside_cost,
1810 stmt_vector_for_cost *body_cost_vec,
1811 stmt_vector_for_cost *prologue_cost_vec,
1812 unsigned int npeel)
1814 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1816 bool dr0_alignment_known_p
1817 = (dr0_info
1818 && known_alignment_for_access_p (dr0_info,
1819 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1821 for (data_reference *dr : datarefs)
1823 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1824 if (!vect_relevant_for_alignment_p (dr_info))
1825 continue;
1827 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1828 dr_alignment_support alignment_support_scheme;
1829 int misalignment;
1830 unsigned HOST_WIDE_INT alignment;
1832 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1833 size_zero_node) < 0;
1834 poly_int64 off = 0;
1835 if (negative)
1836 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1837 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1839 if (npeel == 0)
1840 misalignment = dr_misalignment (dr_info, vectype, off);
1841 else if (dr_info == dr0_info
1842 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1843 misalignment = 0;
1844 else if (!dr0_alignment_known_p
1845 || !known_alignment_for_access_p (dr_info, vectype)
1846 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1847 misalignment = DR_MISALIGNMENT_UNKNOWN;
1848 else
1850 misalignment = dr_misalignment (dr_info, vectype, off);
1851 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1852 misalignment &= alignment - 1;
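 /* Worked example with hypothetical numbers: a misalignment of 4, npeel of
    3, DR_STEP of 4 and a target alignment of 16 gives
    (4 + 3 * 4) & (16 - 1) == 0, i.e. this DR also becomes aligned after
    peeling.  */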
1854 alignment_support_scheme
1855 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1856 misalignment);
1858 vect_get_data_access_cost (loop_vinfo, dr_info,
1859 alignment_support_scheme, misalignment,
1860 inside_cost, outside_cost,
1861 body_cost_vec, prologue_cost_vec);
1865 /* Traverse peeling hash table and calculate cost for each peeling option.
1866 Find the one with the lowest cost. */
1869 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1870 _vect_peel_extended_info *min)
1872 vect_peel_info elem = *slot;
1873 int dummy;
1874 unsigned int inside_cost = 0, outside_cost = 0;
1875 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1876 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1877 epilogue_cost_vec;
1879 prologue_cost_vec.create (2);
1880 body_cost_vec.create (2);
1881 epilogue_cost_vec.create (2);
1883 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1884 &outside_cost, &body_cost_vec,
1885 &prologue_cost_vec, elem->npeel);
1887 body_cost_vec.release ();
1889 outside_cost += vect_get_known_peeling_cost
1890 (loop_vinfo, elem->npeel, &dummy,
1891 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1892 &prologue_cost_vec, &epilogue_cost_vec);
1894 /* Prologue and epilogue costs are added to the target model later.
1895 These costs depend only on the scalar iteration cost, the
1896 number of peeling iterations finally chosen, and the number of
1897 misaligned statements. So discard the information found here. */
1898 prologue_cost_vec.release ();
1899 epilogue_cost_vec.release ();
1901 if (inside_cost < min->inside_cost
1902 || (inside_cost == min->inside_cost
1903 && outside_cost < min->outside_cost))
1905 min->inside_cost = inside_cost;
1906 min->outside_cost = outside_cost;
1907 min->peel_info.dr_info = elem->dr_info;
1908 min->peel_info.npeel = elem->npeel;
1909 min->peel_info.count = elem->count;
1912 return 1;
1916 /* Choose best peeling option by traversing peeling hash table and either
1917 choosing an option with the lowest cost (if cost model is enabled) or the
1918 option that aligns as many accesses as possible. */
1920 static struct _vect_peel_extended_info
1921 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1922 loop_vec_info loop_vinfo)
1924 struct _vect_peel_extended_info res;
1926 res.peel_info.dr_info = NULL;
1927 res.vinfo = loop_vinfo;
1929 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1931 res.inside_cost = INT_MAX;
1932 res.outside_cost = INT_MAX;
1933 peeling_htab->traverse <_vect_peel_extended_info *,
1934 vect_peeling_hash_get_lowest_cost> (&res);
1936 else
1938 res.peel_info.count = 0;
1939 peeling_htab->traverse <_vect_peel_extended_info *,
1940 vect_peeling_hash_get_most_frequent> (&res);
1941 res.inside_cost = 0;
1942 res.outside_cost = 0;
1945 return res;
1948 /* Return true if the new peeling NPEEL is supported. */
1950 static bool
1951 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1952 unsigned npeel)
1954 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1955 enum dr_alignment_support supportable_dr_alignment;
1957 bool dr0_alignment_known_p
1958 = known_alignment_for_access_p (dr0_info,
1959 STMT_VINFO_VECTYPE (dr0_info->stmt));
1961 /* Ensure that all data refs can be vectorized after the peel. */
1962 for (data_reference *dr : datarefs)
1964 if (dr == dr0_info->dr)
1965 continue;
1967 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1968 if (!vect_relevant_for_alignment_p (dr_info)
1969 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1970 continue;
1972 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1973 int misalignment;
1974 unsigned HOST_WIDE_INT alignment;
1975 if (!dr0_alignment_known_p
1976 || !known_alignment_for_access_p (dr_info, vectype)
1977 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1978 misalignment = DR_MISALIGNMENT_UNKNOWN;
1979 else
1981 misalignment = dr_misalignment (dr_info, vectype);
1982 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1983 misalignment &= alignment - 1;
1985 supportable_dr_alignment
1986 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1987 misalignment);
1988 if (supportable_dr_alignment == dr_unaligned_unsupported)
1989 return false;
1992 return true;
1995 /* Compare two data-references DRA and DRB to group them into chunks
1996 with related alignment. */
1998 static int
1999 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2001 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2002 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2003 int cmp;
2005 /* Stabilize sort. */
2006 if (dra == drb)
2007 return 0;
2009 /* Ordering of DRs according to base. */
2010 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2011 DR_BASE_ADDRESS (drb));
2012 if (cmp != 0)
2013 return cmp;
2015 /* And according to DR_OFFSET. */
2016 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2017 if (cmp != 0)
2018 return cmp;
2020 /* And after step. */
2021 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2022 if (cmp != 0)
2023 return cmp;
2025 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2026 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2027 if (cmp == 0)
2028 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2029 return cmp;
2032 /* Function vect_enhance_data_refs_alignment
2034 This pass will use loop versioning and loop peeling in order to enhance
2035 the alignment of data references in the loop.
2037 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2038 original loop is to be vectorized. Any other loops that are created by
2039 the transformations performed in this pass are not supposed to be
2040 vectorized. This restriction will be relaxed.
2042 This pass will require a cost model to guide it whether to apply peeling
2043 or versioning or a combination of the two. For example, the scheme that
2044 Intel uses when given a loop with several memory accesses is as follows:
2045 choose one memory access ('p') whose alignment you want to force by doing
2046 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2047 other accesses are not necessarily aligned, or (2) use loop versioning to
2048 generate one loop in which all accesses are aligned, and another loop in
2049 which only 'p' is necessarily aligned.
2051 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2052 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2053 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2055 Devising a cost model is the most critical aspect of this work. It will
2056 guide us on which access to peel for, whether to use loop versioning, how
2057 many versions to create, etc. The cost model will probably consist of
2058 generic considerations as well as target specific considerations (on
2059 powerpc for example, misaligned stores are more painful than misaligned
2060 loads).
2062 Here are the general steps involved in alignment enhancements:
2064 -- original loop, before alignment analysis:
2065 for (i=0; i<N; i++){
2066 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2067 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2070 -- After vect_compute_data_refs_alignment:
2071 for (i=0; i<N; i++){
2072 x = q[i]; # DR_MISALIGNMENT(q) = 3
2073 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2076 -- Possibility 1: we do loop versioning:
2077 if (p is aligned) {
2078 for (i=0; i<N; i++){ # loop 1A
2079 x = q[i]; # DR_MISALIGNMENT(q) = 3
2080 p[i] = y; # DR_MISALIGNMENT(p) = 0
2083 else {
2084 for (i=0; i<N; i++){ # loop 1B
2085 x = q[i]; # DR_MISALIGNMENT(q) = 3
2086 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2090 -- Possibility 2: we do loop peeling:
2091 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2092 x = q[i];
2093 p[i] = y;
2095 for (i = 3; i < N; i++){ # loop 2A
2096 x = q[i]; # DR_MISALIGNMENT(q) = 0
2097 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2100 -- Possibility 3: combination of loop peeling and versioning:
2101 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2102 x = q[i];
2103 p[i] = y;
2105 if (p is aligned) {
2106 for (i = 3; i<N; i++){ # loop 3A
2107 x = q[i]; # DR_MISALIGNMENT(q) = 0
2108 p[i] = y; # DR_MISALIGNMENT(p) = 0
2111 else {
2112 for (i = 3; i<N; i++){ # loop 3B
2113 x = q[i]; # DR_MISALIGNMENT(q) = 0
2114 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2118 These loops are later passed to loop_transform to be vectorized. The
2119 vectorizer will use the alignment information to guide the transformation
2120 (whether to generate regular loads/stores, or with special handling for
2121 misalignment). */
2123 opt_result
2124 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2126 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2127 dr_vec_info *first_store = NULL;
2128 dr_vec_info *dr0_info = NULL;
2129 struct data_reference *dr;
2130 unsigned int i;
2131 bool do_peeling = false;
2132 bool do_versioning = false;
2133 unsigned int npeel = 0;
2134 bool one_misalignment_known = false;
2135 bool one_misalignment_unknown = false;
2136 bool one_dr_unsupportable = false;
2137 dr_vec_info *unsupportable_dr_info = NULL;
2138 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2139 hash_table<peel_info_hasher> peeling_htab (1);
2141 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2143 /* Reset data so we can safely be called multiple times. */
2144 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2145 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2147 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2148 return opt_result::success ();
2150 /* Sort the vector of datarefs so DRs that have the same or dependent
2151 alignment are next to each other. */
2152 auto_vec<data_reference_p> datarefs
2153 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2154 datarefs.qsort (dr_align_group_sort_cmp);
2156 /* For each dataref, compute the number of other DRs that become aligned
2157 when we peel so that this dataref becomes aligned. */
2158 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2159 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2160 unsigned i0;
2161 for (i0 = 0; i0 < datarefs.length (); ++i0)
2162 if (DR_BASE_ADDRESS (datarefs[i0]))
2163 break;
2164 for (i = i0 + 1; i <= datarefs.length (); ++i)
2166 if (i == datarefs.length ()
2167 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2168 DR_BASE_ADDRESS (datarefs[i]), 0)
2169 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2170 DR_OFFSET (datarefs[i]), 0)
2171 || !operand_equal_p (DR_STEP (datarefs[i0]),
2172 DR_STEP (datarefs[i]), 0))
2174 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2175 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2176 will get known misalignment if we align one of the refs
2177 with the largest DR_TARGET_ALIGNMENT. */
2178 for (unsigned j = i0; j < i; ++j)
2180 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2181 for (unsigned k = i0; k < i; ++k)
2183 if (k == j)
2184 continue;
2185 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2186 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2187 dr_infoj))
2188 n_same_align_refs[j]++;
2191 i0 = i;
2195 /* While cost model enhancements are expected in the future, the high level
2196 view of the code at this time is as follows:
2198 A) If there is a misaligned access then see if peeling to align
2199 this access can make all data references satisfy
2200 vect_supportable_dr_alignment. If so, update data structures
2201 as needed and return true.
2203 B) If peeling wasn't possible and there is a data reference with an
2204 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2205 then see if loop versioning checks can be used to make all data
2206 references satisfy vect_supportable_dr_alignment. If so, update
2207 data structures as needed and return true.
2209 C) If neither peeling nor versioning were successful then return false if
2210 any data reference does not satisfy vect_supportable_dr_alignment.
2212 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2214 Note, Possibility 3 above (which is peeling and versioning together) is not
2215 being done at this time. */
2217 /* (1) Peeling to force alignment. */
2219 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2220 Considerations:
2221 + How many accesses will become aligned due to the peeling
2222 - How many accesses will become unaligned due to the peeling,
2223 and the cost of misaligned accesses.
2224 - The cost of peeling (the extra runtime checks, the increase
2225 in code size). */
2227 FOR_EACH_VEC_ELT (datarefs, i, dr)
2229 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2230 if (!vect_relevant_for_alignment_p (dr_info))
2231 continue;
2233 stmt_vec_info stmt_info = dr_info->stmt;
2234 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2235 do_peeling = vector_alignment_reachable_p (dr_info);
2236 if (do_peeling)
2238 if (known_alignment_for_access_p (dr_info, vectype))
2240 unsigned int npeel_tmp = 0;
2241 bool negative = tree_int_cst_compare (DR_STEP (dr),
2242 size_zero_node) < 0;
2244 /* If known_alignment_for_access_p then we have set
2245 DR_MISALIGNMENT which is only done if we know it at compile
2246 time, so it is safe to assume target alignment is constant.
2248 unsigned int target_align =
2249 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2250 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2251 poly_int64 off = 0;
2252 if (negative)
2253 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2254 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2255 mis = negative ? mis : -mis;
2256 if (mis != 0)
2257 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2259 /* For multiple types, it is possible that the bigger type access
2260 will have more than one peeling option. E.g., a loop with two
2261 types: one of size (vector size / 4), and the other one of
2262 size (vector size / 8). The vectorization factor will be 8. If both
2263 accesses are misaligned by 3, the first one needs one scalar
2264 iteration to be aligned, and the second one needs 5. But the
2265 first one will be aligned also by peeling 5 scalar
2266 iterations, and in that case both accesses will be aligned.
2267 Hence, except for the immediate peeling amount, we also want
2268 to try to add full vector size, while we don't exceed
2269 vectorization factor.
2270 We do this automatically for cost model, since we calculate
2271 cost for every peeling option. */
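 /* Continuing the example above (all numbers hypothetical): if the
    immediate peeling amount is 1 and target_align / dr_size is 4, the
    candidate npeel values recorded below are 1, 5, 9, ... up to the
    vectorization factor (times the group size for SLP).  */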
2272 poly_uint64 nscalars = npeel_tmp;
2273 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2275 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2276 nscalars = (STMT_SLP_TYPE (stmt_info)
2277 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2280 /* Save info about DR in the hash table. Also include peeling
2281 amounts according to the explanation above. Indicate
2282 the alignment status when the ref is not aligned.
2283 ??? Rather than using unknown alignment here we should
2284 prune all entries from the peeling hashtable which cause
2285 DRs to be not supported. */
2286 bool supportable_if_not_aligned
2287 = vect_supportable_dr_alignment
2288 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2289 while (known_le (npeel_tmp, nscalars))
2291 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2292 dr_info, npeel_tmp,
2293 supportable_if_not_aligned);
2294 npeel_tmp += MAX (1, target_align / dr_size);
2297 one_misalignment_known = true;
2299 else
2301 /* If we don't know any misalignment values, we prefer
2302 peeling for the data-ref that has the maximum number of data-refs
2303 with the same alignment, unless the target prefers to align
2304 stores over loads. */
2305 unsigned same_align_drs = n_same_align_refs[i];
2306 if (!dr0_info
2307 || dr0_same_align_drs < same_align_drs)
2309 dr0_same_align_drs = same_align_drs;
2310 dr0_info = dr_info;
2312 /* For data-refs with the same number of related
2313 accesses prefer the one where the misalign
2314 computation will be invariant in the outermost loop. */
2315 else if (dr0_same_align_drs == same_align_drs)
2317 class loop *ivloop0, *ivloop;
2318 ivloop0 = outermost_invariant_loop_for_expr
2319 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2320 ivloop = outermost_invariant_loop_for_expr
2321 (loop, DR_BASE_ADDRESS (dr));
2322 if ((ivloop && !ivloop0)
2323 || (ivloop && ivloop0
2324 && flow_loop_nested_p (ivloop, ivloop0)))
2325 dr0_info = dr_info;
2328 one_misalignment_unknown = true;
2330 /* Check for data refs with unsupportable alignment that
2331 can be peeled. */
2332 enum dr_alignment_support supportable_dr_alignment
2333 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2334 DR_MISALIGNMENT_UNKNOWN);
2335 if (supportable_dr_alignment == dr_unaligned_unsupported)
2337 one_dr_unsupportable = true;
2338 unsupportable_dr_info = dr_info;
2341 if (!first_store && DR_IS_WRITE (dr))
2343 first_store = dr_info;
2344 first_store_same_align_drs = same_align_drs;
2348 else
2350 if (!aligned_access_p (dr_info, vectype))
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "vector alignment may not be reachable\n");
2355 break;
2360 /* Check if we can possibly peel the loop. */
2361 if (!vect_can_advance_ivs_p (loop_vinfo)
2362 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2363 loop_preheader_edge (loop))
2364 || loop->inner
2365 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2366 do_peeling = false;
2368 struct _vect_peel_extended_info peel_for_known_alignment;
2369 struct _vect_peel_extended_info peel_for_unknown_alignment;
2370 struct _vect_peel_extended_info best_peel;
2372 peel_for_unknown_alignment.inside_cost = INT_MAX;
2373 peel_for_unknown_alignment.outside_cost = INT_MAX;
2374 peel_for_unknown_alignment.peel_info.count = 0;
2376 if (do_peeling
2377 && one_misalignment_unknown)
2379 /* Check if the target requires us to prefer stores over loads, i.e., if
2380 misaligned stores are more expensive than misaligned loads (taking
2381 drs with same alignment into account). */
2382 unsigned int load_inside_cost = 0;
2383 unsigned int load_outside_cost = 0;
2384 unsigned int store_inside_cost = 0;
2385 unsigned int store_outside_cost = 0;
2386 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2388 stmt_vector_for_cost dummy;
2389 dummy.create (2);
2390 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2391 &load_inside_cost,
2392 &load_outside_cost,
2393 &dummy, &dummy, estimated_npeels);
2394 dummy.release ();
2396 if (first_store)
2398 dummy.create (2);
2399 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2400 &store_inside_cost,
2401 &store_outside_cost,
2402 &dummy, &dummy,
2403 estimated_npeels);
2404 dummy.release ();
2406 else
2408 store_inside_cost = INT_MAX;
2409 store_outside_cost = INT_MAX;
2412 if (load_inside_cost > store_inside_cost
2413 || (load_inside_cost == store_inside_cost
2414 && load_outside_cost > store_outside_cost))
2416 dr0_info = first_store;
2417 dr0_same_align_drs = first_store_same_align_drs;
2418 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2419 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2421 else
2423 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2424 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2427 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2428 prologue_cost_vec.create (2);
2429 epilogue_cost_vec.create (2);
2431 int dummy2;
2432 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2433 (loop_vinfo, estimated_npeels, &dummy2,
2434 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2435 &prologue_cost_vec, &epilogue_cost_vec);
2437 prologue_cost_vec.release ();
2438 epilogue_cost_vec.release ();
2440 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2443 peel_for_unknown_alignment.peel_info.npeel = 0;
2444 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2446 best_peel = peel_for_unknown_alignment;
2448 peel_for_known_alignment.inside_cost = INT_MAX;
2449 peel_for_known_alignment.outside_cost = INT_MAX;
2450 peel_for_known_alignment.peel_info.count = 0;
2451 peel_for_known_alignment.peel_info.dr_info = NULL;
2453 if (do_peeling && one_misalignment_known)
2455 /* Peeling is possible, but there is no data access that is not supported
2456 unless aligned. So we try to choose the best possible peeling from
2457 the hash table. */
2458 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2459 (&peeling_htab, loop_vinfo);
2462 /* Compare costs of peeling for known and unknown alignment. */
2463 if (peel_for_known_alignment.peel_info.dr_info != NULL
2464 && peel_for_unknown_alignment.inside_cost
2465 >= peel_for_known_alignment.inside_cost)
2467 best_peel = peel_for_known_alignment;
2469 /* If the best peeling for known alignment has NPEEL == 0, perform no
2470 peeling at all except if there is an unsupportable dr that we can
2471 align. */
2472 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2473 do_peeling = false;
2476 /* If there is an unsupportable data ref, prefer this over all choices so far
2477 since any chosen peeling would have to be discarded unless it happened
2478 to align the unsupportable data ref. */
2479 if (one_dr_unsupportable)
2480 dr0_info = unsupportable_dr_info;
2481 else if (do_peeling)
2483 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2484 TODO: Use nopeel_outside_cost or get rid of it? */
2485 unsigned nopeel_inside_cost = 0;
2486 unsigned nopeel_outside_cost = 0;
2488 stmt_vector_for_cost dummy;
2489 dummy.create (2);
2490 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2491 &nopeel_outside_cost, &dummy, &dummy, 0);
2492 dummy.release ();
2494 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2495 costs will be recorded. */
2496 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2497 prologue_cost_vec.create (2);
2498 epilogue_cost_vec.create (2);
2500 int dummy2;
2501 nopeel_outside_cost += vect_get_known_peeling_cost
2502 (loop_vinfo, 0, &dummy2,
2503 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2504 &prologue_cost_vec, &epilogue_cost_vec);
2506 prologue_cost_vec.release ();
2507 epilogue_cost_vec.release ();
2509 npeel = best_peel.peel_info.npeel;
2510 dr0_info = best_peel.peel_info.dr_info;
2512 /* If not peeling is no more expensive than the best peeling we
2513 have found so far, don't perform any peeling. */
2514 if (nopeel_inside_cost <= best_peel.inside_cost)
2515 do_peeling = false;
2518 if (do_peeling)
2520 stmt_vec_info stmt_info = dr0_info->stmt;
2521 if (known_alignment_for_access_p (dr0_info,
2522 STMT_VINFO_VECTYPE (stmt_info)))
2524 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2525 size_zero_node) < 0;
2526 if (!npeel)
2528 /* Since it's known at compile time, compute the number of
2529 iterations in the peeled loop (the peeling factor) for use in
2530 updating DR_MISALIGNMENT values. The peeling factor is the
2531 vectorization factor minus the misalignment as an element
2532 count. */
2533 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2534 poly_int64 off = 0;
2535 if (negative)
2536 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2537 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2538 unsigned int mis
2539 = dr_misalignment (dr0_info, vectype, off);
2540 mis = negative ? mis : -mis;
2541 /* If known_alignment_for_access_p then we have set
2542 DR_MISALIGNMENT which is only done if we know it at compile
2543 time, so it is safe to assume target alignment is constant.
2545 unsigned int target_align =
2546 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2547 npeel = ((mis & (target_align - 1))
2548 / vect_get_scalar_dr_size (dr0_info));
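 /* For example (hypothetical numbers): with a target alignment of 16 bytes,
    4-byte elements and a misalignment of 8 bytes, the negated misalignment
    is -8, so (-8 & 15) / 4 == 2 scalar iterations are peeled to reach the
    next alignment boundary.  */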
2551 /* For interleaved data access every iteration accesses all the
2552 members of the group, therefore we divide the number of iterations
2553 by the group size. */
2554 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2555 npeel /= DR_GROUP_SIZE (stmt_info);
2557 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE, vect_location,
2559 "Try peeling by %d\n", npeel);
2562 /* Ensure that all datarefs can be vectorized after the peel. */
2563 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2564 do_peeling = false;
2566 /* Check if all datarefs are supportable and log. */
2567 if (do_peeling
2568 && npeel == 0
2569 && known_alignment_for_access_p (dr0_info,
2570 STMT_VINFO_VECTYPE (stmt_info)))
2571 return opt_result::success ();
2573 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2574 if (do_peeling)
2576 unsigned max_allowed_peel
2577 = param_vect_max_peeling_for_alignment;
2578 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2579 max_allowed_peel = 0;
2580 if (max_allowed_peel != (unsigned)-1)
2582 unsigned max_peel = npeel;
2583 if (max_peel == 0)
2585 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2586 unsigned HOST_WIDE_INT target_align_c;
2587 if (target_align.is_constant (&target_align_c))
2588 max_peel =
2589 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2590 else
2592 do_peeling = false;
2593 if (dump_enabled_p ())
2594 dump_printf_loc (MSG_NOTE, vect_location,
2595 "Disable peeling, max peels set and vector"
2596 " alignment unknown\n");
2599 if (max_peel > max_allowed_peel)
2601 do_peeling = false;
2602 if (dump_enabled_p ())
2603 dump_printf_loc (MSG_NOTE, vect_location,
2604 "Disable peeling, max peels reached: %d\n", max_peel);
2609 /* Cost model #2 - if peeling may result in a remaining loop not
2610 iterating enough to be vectorized then do not peel. Since this
2611 is a cost heuristic rather than a correctness decision, use the
2612 most likely runtime value for variable vectorization factors. */
2613 if (do_peeling
2614 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2616 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2617 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2618 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2619 < assumed_vf + max_peel)
2620 do_peeling = false;
2623 if (do_peeling)
2625 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2626 If the misalignment of DR_i is identical to that of dr0 then set
2627 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2628 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2629 by the peeling factor times the element size of DR_i (MOD the
2630 vectorization factor times the size). Otherwise, the
2631 misalignment of DR_i must be set to unknown. */
2632 FOR_EACH_VEC_ELT (datarefs, i, dr)
2633 if (dr != dr0_info->dr)
2635 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2636 if (!vect_relevant_for_alignment_p (dr_info))
2637 continue;
2639 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2642 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2643 if (npeel)
2644 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2645 else
2646 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2647 SET_DR_MISALIGNMENT (dr0_info,
2648 vect_dr_misalign_for_aligned_access (dr0_info));
2649 if (dump_enabled_p ())
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "Alignment of access forced using peeling.\n");
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Peeling for alignment will be applied.\n");
2657 /* The inside-loop cost will be accounted for in vectorizable_load
2658 and vectorizable_store correctly with adjusted alignments.
2659 Drop the body_cost_vec on the floor here. */
2660 return opt_result::success ();
2664 /* (2) Versioning to force alignment. */
2666 /* Try versioning if:
2667 1) optimize loop for speed and the cost-model is not cheap
2668 2) there is at least one unsupported misaligned data ref with an unknown
2669 misalignment, and
2670 3) all misaligned data refs with a known misalignment are supported, and
2671 4) the number of runtime alignment checks is within reason. */
2673 do_versioning
2674 = (optimize_loop_nest_for_speed_p (loop)
2675 && !loop->inner /* FORNOW */
2676 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2678 if (do_versioning)
2680 FOR_EACH_VEC_ELT (datarefs, i, dr)
2682 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2683 if (!vect_relevant_for_alignment_p (dr_info))
2684 continue;
2686 stmt_vec_info stmt_info = dr_info->stmt;
2687 if (STMT_VINFO_STRIDED_P (stmt_info))
2689 do_versioning = false;
2690 break;
2693 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2694 bool negative = tree_int_cst_compare (DR_STEP (dr),
2695 size_zero_node) < 0;
2696 poly_int64 off = 0;
2697 if (negative)
2698 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2699 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2700 int misalignment;
2701 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2702 continue;
2704 enum dr_alignment_support supportable_dr_alignment
2705 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2706 misalignment);
2707 if (supportable_dr_alignment == dr_unaligned_unsupported)
2709 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2710 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2711 >= (unsigned) param_vect_max_version_for_alignment_checks))
2713 do_versioning = false;
2714 break;
2717 /* At present we don't support versioning for alignment
2718 with variable VF, since there's no guarantee that the
2719 VF is a power of two. We could relax this if we added
2720 a way of enforcing a power-of-two size. */
2721 unsigned HOST_WIDE_INT size;
2722 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2724 do_versioning = false;
2725 break;
2728 /* Forcing alignment in the first iteration is no good if
2729 we don't keep it across iterations. For now, just disable
2730 versioning in this case.
2731 ??? We could actually unroll the loop to achieve the required
2732 overall step alignment, and forcing the alignment could be
2733 done by doing some iterations of the non-vectorized loop. */
2734 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2735 * DR_STEP_ALIGNMENT (dr),
2736 DR_TARGET_ALIGNMENT (dr_info)))
2738 do_versioning = false;
2739 break;
2742 /* The rightmost bits of an aligned address must be zeros.
2743 Construct the mask needed for this test. For example,
2744 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2745 mask must be 15 = 0xf. */
2746 int mask = size - 1;
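 /* The runtime test emitted for versioning is conceptually the following
    (a simplified sketch; 'p' stands for any potentially misaligned
    pointer):
 
      if (((uintptr_t) p & mask) == 0)
        ... vectorized loop ...
      else
        ... scalar loop ...                                            */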
2748 /* FORNOW: use the same mask to test all potentially unaligned
2749 references in the loop. */
2750 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2751 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2753 do_versioning = false;
2754 break;
2757 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2758 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2762 /* Versioning requires at least one misaligned data reference. */
2763 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2764 do_versioning = false;
2765 else if (!do_versioning)
2766 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2769 if (do_versioning)
2771 const vec<stmt_vec_info> &may_misalign_stmts
2772 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2773 stmt_vec_info stmt_info;
2775 /* It can now be assumed that the data references in the statements
2776 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2777 of the loop being vectorized. */
2778 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2780 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2781 SET_DR_MISALIGNMENT (dr_info,
2782 vect_dr_misalign_for_aligned_access (dr_info));
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "Alignment of access forced using versioning.\n");
2788 if (dump_enabled_p ())
2789 dump_printf_loc (MSG_NOTE, vect_location,
2790 "Versioning for alignment will be applied.\n");
2792 /* Peeling and versioning can't be done together at this time. */
2793 gcc_assert (! (do_peeling && do_versioning));
2795 return opt_result::success ();
2798 /* This point is reached if neither peeling nor versioning is being done. */
2799 gcc_assert (! (do_peeling || do_versioning));
2801 return opt_result::success ();
2805 /* Function vect_analyze_data_refs_alignment
2807 Analyze the alignment of the data-references in the loop.
2808 Return FALSE if a data reference is found that cannot be vectorized. */
2810 opt_result
2811 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2813 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2815 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2816 struct data_reference *dr;
2817 unsigned int i;
2819 vect_record_base_alignments (loop_vinfo);
2820 FOR_EACH_VEC_ELT (datarefs, i, dr)
2822 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2823 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2825 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2826 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2827 continue;
2828 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2829 STMT_VINFO_VECTYPE (dr_info->stmt));
2833 return opt_result::success ();
2837 /* Analyze alignment of DRs of stmts in NODE. */
2839 static bool
2840 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2842 /* Alignment is maintained in the first element of the group. */
2843 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2844 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2845 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2846 tree vectype = SLP_TREE_VECTYPE (node);
2847 poly_uint64 vector_alignment
2848 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2849 BITS_PER_UNIT);
2850 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2851 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2852 /* Re-analyze alignment when we're facing a vectorization with a bigger
2853 alignment requirement. */
2854 else if (known_lt (dr_info->target_alignment, vector_alignment))
2856 poly_uint64 old_target_alignment = dr_info->target_alignment;
2857 int old_misalignment = dr_info->misalignment;
2858 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2859 /* But keep knowledge about a smaller alignment. */
2860 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2861 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2863 dr_info->target_alignment = old_target_alignment;
2864 dr_info->misalignment = old_misalignment;
2867 /* If we ever face unordered target alignments, the first one wins in terms
2868 of analysis and the other will become unknown in dr_misalignment. */
2869 return true;
2872 /* Function vect_slp_analyze_instance_alignment
2874 Analyze the alignment of the data-references in the SLP instance.
2875 Return FALSE if a data reference is found that cannot be vectorized. */
2877 bool
2878 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2879 slp_instance instance)
2881 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2883 slp_tree node;
2884 unsigned i;
2885 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2886 if (! vect_slp_analyze_node_alignment (vinfo, node))
2887 return false;
2889 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2890 && ! vect_slp_analyze_node_alignment
2891 (vinfo, SLP_INSTANCE_TREE (instance)))
2892 return false;
2894 return true;
2898 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2899 accesses of legal size, step, etc. Detect gaps, single element
2900 interleaving, and other special cases. Set grouped access info.
2901 Collect groups of strided stores for further use in SLP analysis.
2902 Worker for vect_analyze_group_access. */
2904 static bool
2905 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2907 data_reference *dr = dr_info->dr;
2908 tree step = DR_STEP (dr);
2909 tree scalar_type = TREE_TYPE (DR_REF (dr));
2910 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2911 stmt_vec_info stmt_info = dr_info->stmt;
2912 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2913 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2914 HOST_WIDE_INT dr_step = -1;
2915 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2916 bool slp_impossible = false;
2918 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2919 size of the interleaving group (including gaps). */
2920 if (tree_fits_shwi_p (step))
2922 dr_step = tree_to_shwi (step);
2923 /* Check that STEP is a multiple of type size. Otherwise there is
2924 a non-element-sized gap at the end of the group which we
2925 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2926 ??? As we can handle non-constant step fine here we should
2927 simply remove uses of DR_GROUP_GAP between the last and first
2928 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2929 simply not include that gap. */
2930 if ((dr_step % type_size) != 0)
2932 if (dump_enabled_p ())
2933 dump_printf_loc (MSG_NOTE, vect_location,
2934 "Step %T is not a multiple of the element size"
2935 " for %T\n",
2936 step, DR_REF (dr));
2937 return false;
2939 groupsize = absu_hwi (dr_step) / type_size;
2941 else
2942 groupsize = 0;
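 
 /* For example (hypothetical): a load from a[4*i] with 4-byte elements has
    DR_STEP 16, so GROUPSIZE is 16 / 4 = 4 even though only one element of
    each group is accessed; the single-element-interleaving case below then
    records a group of size 4 with a gap of 3.  */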
2944 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2945 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2947 /* Check if this DR is a part of interleaving, and is a single
2948 element of the group that is accessed in the loop. */
2950 /* Gaps are supported only for loads. STEP must be a multiple of the type
2951 size. */
2952 if (DR_IS_READ (dr)
2953 && (dr_step % type_size) == 0
2954 && groupsize > 0
2955 /* This could be UINT_MAX but as we are generating code in a very
2956 inefficient way we have to cap earlier.
2957 See PR91403 for example. */
2958 && groupsize <= 4096)
2960 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2961 DR_GROUP_SIZE (stmt_info) = groupsize;
2962 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2963 if (dump_enabled_p ())
2964 dump_printf_loc (MSG_NOTE, vect_location,
2965 "Detected single element interleaving %T"
2966 " step %T\n",
2967 DR_REF (dr), step);
2969 return true;
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 "not consecutive access %G", stmt_info->stmt);
2976 if (bb_vinfo)
2978 /* Mark the statement as unvectorizable. */
2979 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2980 return true;
2983 if (dump_enabled_p ())
2984 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2985 STMT_VINFO_STRIDED_P (stmt_info) = true;
2986 return true;
2989 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2991 /* First stmt in the interleaving chain. Check the chain. */
2992 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2993 struct data_reference *data_ref = dr;
2994 unsigned int count = 1;
2995 tree prev_init = DR_INIT (data_ref);
2996 HOST_WIDE_INT diff, gaps = 0;
2998 /* By construction, all group members have INTEGER_CST DR_INITs. */
2999 while (next)
3001 /* We never have the same DR multiple times. */
3002 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3003 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3005 data_ref = STMT_VINFO_DATA_REF (next);
3007 /* All group members have the same STEP by construction. */
3008 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3010 /* Check that the distance between two accesses is equal to the type
3011 size. Otherwise, we have gaps. */
3012 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3013 - TREE_INT_CST_LOW (prev_init)) / type_size;
3014 if (diff < 1 || diff > UINT_MAX)
3016 /* For artificial testcases with array accesses with large
3017 constant indices we can run into overflow issues which
3018 can end up fooling the groupsize constraint below so
3019 check the individual gaps (which are represented as
3020 unsigned int) as well. */
3021 if (dump_enabled_p ())
3022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3023 "interleaved access with gap larger "
3024 "than representable\n");
3025 return false;
3027 if (diff != 1)
3029 /* FORNOW: SLP of accesses with gaps is not supported. */
3030 slp_impossible = true;
3031 if (DR_IS_WRITE (data_ref))
3033 if (dump_enabled_p ())
3034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3035 "interleaved store with gaps\n");
3036 return false;
3039 gaps += diff - 1;
3042 last_accessed_element += diff;
3044 /* Store the gap from the previous member of the group. If there is no
3045 gap in the access, DR_GROUP_GAP is always 1. */
3046 DR_GROUP_GAP (next) = diff;
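 /* For instance (hypothetical): loads from a[4*i], a[4*i + 2] and
    a[4*i + 3] give DIFF values of 2 and 1, so the second member gets
    DR_GROUP_GAP == 2 (one skipped element) and the third
    DR_GROUP_GAP == 1 (no gap).  */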
3048 prev_init = DR_INIT (data_ref);
3049 next = DR_GROUP_NEXT_ELEMENT (next);
3050 /* Count the number of data-refs in the chain. */
3051 count++;
3054 if (groupsize == 0)
3055 groupsize = count + gaps;
3057 /* This could be UINT_MAX but as we are generating code in a very
3058 inefficient way we have to cap earlier. See PR78699 for example. */
3059 if (groupsize > 4096)
3061 if (dump_enabled_p ())
3062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3063 "group is too large\n");
3064 return false;
3067 /* Check that the size of the interleaving is equal to count for stores,
3068 i.e., that there are no gaps. */
3069 if (groupsize != count
3070 && !DR_IS_READ (dr))
3072 groupsize = count;
3073 STMT_VINFO_STRIDED_P (stmt_info) = true;
3076 /* If there is a gap after the last load in the group it is the
3077 difference between the groupsize and the last accessed
3078 element.
3079 When there is no gap, this difference should be 0. */
3080 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3082 DR_GROUP_SIZE (stmt_info) = groupsize;
3083 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "Detected interleaving ");
3087 if (DR_IS_READ (dr))
3088 dump_printf (MSG_NOTE, "load ");
3089 else if (STMT_VINFO_STRIDED_P (stmt_info))
3090 dump_printf (MSG_NOTE, "strided store ");
3091 else
3092 dump_printf (MSG_NOTE, "store ");
3093 dump_printf (MSG_NOTE, "of size %u\n",
3094 (unsigned)groupsize);
3095 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3096 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3097 while (next)
3099 if (DR_GROUP_GAP (next) != 1)
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "\t<gap of %d elements>\n",
3102 DR_GROUP_GAP (next) - 1);
3103 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3104 next = DR_GROUP_NEXT_ELEMENT (next);
3106 if (DR_GROUP_GAP (stmt_info) != 0)
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "\t<gap of %d elements>\n",
3109 DR_GROUP_GAP (stmt_info));
3112 /* SLP: create an SLP data structure for every interleaving group of
3113 stores for further analysis in vect_analyze_slp. */
3114 if (DR_IS_WRITE (dr) && !slp_impossible)
3116 if (loop_vinfo)
3117 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3118 if (bb_vinfo)
3119 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3123 return true;
3126 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3127 accesses of legal size, step, etc. Detect gaps, single element
3128 interleaving, and other special cases. Set grouped access info.
3129 Collect groups of strided stores for further use in SLP analysis. */
3131 static bool
3132 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3134 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3136 /* Dissolve the group if present. */
3137 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3138 while (stmt_info)
3140 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3141 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3142 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3143 stmt_info = next;
3145 return false;
3147 return true;
3150 /* Analyze the access pattern of the data-reference DR_INFO.
3151 In case of non-consecutive accesses call vect_analyze_group_access() to
3152 analyze groups of accesses. */
3154 static bool
3155 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3157 data_reference *dr = dr_info->dr;
3158 tree step = DR_STEP (dr);
3159 tree scalar_type = TREE_TYPE (DR_REF (dr));
3160 stmt_vec_info stmt_info = dr_info->stmt;
3161 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3162 class loop *loop = NULL;
3164 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3165 return true;
3167 if (loop_vinfo)
3168 loop = LOOP_VINFO_LOOP (loop_vinfo);
3170 if (loop_vinfo && !step)
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3174 "bad data-ref access in loop\n");
3175 return false;
3178 /* Allow loads with zero step in inner-loop vectorization. */
3179 if (loop_vinfo && integer_zerop (step))
3181 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3182 if (!nested_in_vect_loop_p (loop, stmt_info))
3183 return DR_IS_READ (dr);
3184 /* Allow references with zero step only for outer loops marked
3185 with pragma omp simd, since that guarantees the absence of
3186 loop-carried dependencies between inner loop iterations. */
3187 if (loop->safelen < 2)
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 "zero step in inner loop of nest\n");
3192 return false;
3196 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3198 /* Interleaved accesses are not yet supported within outer-loop
3199 vectorization for references in the inner-loop. */
3200 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3202 /* For the rest of the analysis we use the outer-loop step. */
3203 step = STMT_VINFO_DR_STEP (stmt_info);
3204 if (integer_zerop (step))
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_NOTE, vect_location,
3208 "zero step in outer loop.\n");
3209 return DR_IS_READ (dr);
3213 /* Consecutive? */
3214 if (TREE_CODE (step) == INTEGER_CST)
3216 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3217 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3218 || (dr_step < 0
3219 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3221 /* Mark that it is not interleaving. */
3222 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3223 return true;
3227 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3229 if (dump_enabled_p ())
3230 dump_printf_loc (MSG_NOTE, vect_location,
3231 "grouped access in outer loop.\n");
3232 return false;
3236 /* Assume this is a DR handled by non-constant strided load case. */
3237 if (TREE_CODE (step) != INTEGER_CST)
3238 return (STMT_VINFO_STRIDED_P (stmt_info)
3239 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3240 || vect_analyze_group_access (vinfo, dr_info)));
3242 /* Not consecutive access - check if it's a part of interleaving group. */
3243 return vect_analyze_group_access (vinfo, dr_info);
3246 /* Compare two data-references DRA and DRB to group them into chunks
3247 suitable for building interleaving groups. */
3249 static int
3250 dr_group_sort_cmp (const void *dra_, const void *drb_)
3252 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3253 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3254 data_reference_p dra = dra_info->dr;
3255 data_reference_p drb = drb_info->dr;
3256 int cmp;
3258 /* Stabilize sort. */
3259 if (dra == drb)
3260 return 0;
3262 /* DRs with different group IDs never belong to the same group. */
3263 if (dra_info->group != drb_info->group)
3264 return dra_info->group < drb_info->group ? -1 : 1;
3266 /* Ordering of DRs according to base. */
3267 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3268 DR_BASE_ADDRESS (drb));
3269 if (cmp != 0)
3270 return cmp;
3272 /* And according to DR_OFFSET. */
3273 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3274 if (cmp != 0)
3275 return cmp;
3277 /* Put reads before writes. */
3278 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3279 return DR_IS_READ (dra) ? -1 : 1;
3281 /* Then sort after access size. */
3282 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3283 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3284 if (cmp != 0)
3285 return cmp;
3287 /* And after step. */
3288 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3289 if (cmp != 0)
3290 return cmp;
3292 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3293 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3294 if (cmp == 0)
3295 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3296 return cmp;
3299 /* If OP is the result of a conversion, return the unconverted value,
3300 otherwise return null. */
3302 static tree
3303 strip_conversion (tree op)
3305 if (TREE_CODE (op) != SSA_NAME)
3306 return NULL_TREE;
3307 gimple *stmt = SSA_NAME_DEF_STMT (op);
3308 if (!is_gimple_assign (stmt)
3309 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3310 return NULL_TREE;
3311 return gimple_assign_rhs1 (stmt);
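 
 /* E.g. (a sketch in GIMPLE-like pseudocode, names hypothetical): for
      mask_2 = (vector_bool_type) mask_1;
    strip_conversion (mask_2) returns mask_1, while for any non-conversion
    definition it returns NULL_TREE.  */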
3314 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3315 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3316 be grouped in SLP mode. */
3318 static bool
3319 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3320 bool allow_slp_p)
3322 if (gimple_assign_single_p (stmt1_info->stmt))
3323 return gimple_assign_single_p (stmt2_info->stmt);
3325 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3326 if (call1 && gimple_call_internal_p (call1))
3328 /* Check for two masked loads or two masked stores. */
3329 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3330 if (!call2 || !gimple_call_internal_p (call2))
3331 return false;
3332 internal_fn ifn = gimple_call_internal_fn (call1);
3333 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3334 return false;
3335 if (ifn != gimple_call_internal_fn (call2))
3336 return false;
3338 /* Check that the masks are the same. Cope with casts of masks,
3339 like those created by build_mask_conversion. */
3340 tree mask1 = gimple_call_arg (call1, 2);
3341 tree mask2 = gimple_call_arg (call2, 2);
3342 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3344 mask1 = strip_conversion (mask1);
3345 if (!mask1)
3346 return false;
3347 mask2 = strip_conversion (mask2);
3348 if (!mask2)
3349 return false;
3350 if (!operand_equal_p (mask1, mask2, 0))
3351 return false;
3353 return true;
3356 return false;
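 
 /* For example (a sketch, statement names hypothetical): the two masked
    loads
      x_1 = .MASK_LOAD (p_1, align, mask_1);
      x_2 = .MASK_LOAD (p_2, align, mask_2);
    can be grouped when mask_1 and mask_2 are the same SSA name or differ
    only by a conversion, or unconditionally in SLP mode when ALLOW_SLP_P
    is true.  */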
3359 /* Function vect_analyze_data_ref_accesses.
3361 Analyze the access pattern of all the data references in the loop.
3363 FORNOW: the only access pattern that is considered vectorizable is a
3364 simple step 1 (consecutive) access.
3366 FORNOW: handle only arrays and pointer accesses. */
3368 opt_result
3369 vect_analyze_data_ref_accesses (vec_info *vinfo,
3370 vec<int> *dataref_groups)
3372 unsigned int i;
3373 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3375 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3377 if (datarefs.is_empty ())
3378 return opt_result::success ();
3380 /* Sort the array of datarefs to make building the interleaving chains
3381 linear. Don't modify the original vector's order; it is needed for
3382 determining what dependencies are reversed. */
3383 vec<dr_vec_info *> datarefs_copy;
3384 datarefs_copy.create (datarefs.length ());
3385 for (unsigned i = 0; i < datarefs.length (); i++)
3387 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3388 /* If the caller computed DR grouping use that, otherwise group by
3389 basic blocks. */
3390 if (dataref_groups)
3391 dr_info->group = (*dataref_groups)[i];
3392 else
3393 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3394 datarefs_copy.quick_push (dr_info);
3396 datarefs_copy.qsort (dr_group_sort_cmp);
3397 hash_set<stmt_vec_info> to_fixup;
3399 /* Build the interleaving chains. */
3400 for (i = 0; i < datarefs_copy.length () - 1;)
3402 dr_vec_info *dr_info_a = datarefs_copy[i];
3403 data_reference_p dra = dr_info_a->dr;
3404 int dra_group_id = dr_info_a->group;
3405 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3406 stmt_vec_info lastinfo = NULL;
3407 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3408 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3410 ++i;
3411 continue;
3413 for (i = i + 1; i < datarefs_copy.length (); ++i)
3415 dr_vec_info *dr_info_b = datarefs_copy[i];
3416 data_reference_p drb = dr_info_b->dr;
3417 int drb_group_id = dr_info_b->group;
3418 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3419 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3420 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3421 break;
3423 /* ??? Imperfect sorting (non-compatible types, non-modulo
3424 accesses, same accesses) can lead to a group being artificially
3425 split here as we don't just skip over those. If it really
3426 matters we can push those to a worklist and re-iterate
3427 over them. Then we can just skip ahead to the next DR here. */
3429 /* DRs in a different DR group should not be put into the same
3430 interleaving group. */
3431 if (dra_group_id != drb_group_id)
3432 break;
3434 /* Check that the data-refs have the same first location (except init)
3435 and they are both either store or load (not load and store,
3436 not masked loads or stores). */
3437 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3438 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3439 DR_BASE_ADDRESS (drb)) != 0
3440 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3441 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3442 break;
3444 /* Check that the data-refs have the same constant size. */
3445 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3446 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3447 if (!tree_fits_uhwi_p (sza)
3448 || !tree_fits_uhwi_p (szb)
3449 || !tree_int_cst_equal (sza, szb))
3450 break;
3452 /* Check that the data-refs have the same step. */
3453 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3454 break;
3456 /* Check the types are compatible.
3457 ??? We don't distinguish this during sorting. */
3458 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3459 TREE_TYPE (DR_REF (drb))))
3460 break;
3462 /* Check that the DR_INITs are compile-time constants. */
3463 if (!tree_fits_shwi_p (DR_INIT (dra))
3464 || !tree_fits_shwi_p (DR_INIT (drb)))
3465 break;
3467 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3468 just hold extra information. */
3469 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3470 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3471 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3472 break;
3474 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3475 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3476 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3477 HOST_WIDE_INT init_prev
3478 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3479 gcc_assert (init_a <= init_b
3480 && init_a <= init_prev
3481 && init_prev <= init_b);
3483 /* Do not place the same access in the interleaving chain twice. */
3484 if (init_b == init_prev)
3486 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3487 < gimple_uid (DR_STMT (drb)));
3488 /* Simply link in duplicates and fix up the chain below. */
3490 else
3492 /* If init_b == init_a + the size of the type * k, we have an
3493 interleaving, and DRA is accessed before DRB. */
3494 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3495 if (type_size_a == 0
3496 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3497 % type_size_a != 0))
3498 break;
3500 /* If we have a store, the accesses are adjacent. This splits
3501 groups into chunks we support (we don't support vectorization
3502 of stores with gaps). */
3503 if (!DR_IS_READ (dra)
3504 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3505 != type_size_a))
3506 break;
3508 /* If the step (if nonzero and constant) is smaller than the
3509 difference between the data-refs' inits, this splits groups into
3510 suitable sizes. */
3511 if (tree_fits_shwi_p (DR_STEP (dra)))
3513 unsigned HOST_WIDE_INT step
3514 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3515 if (step != 0
3516 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3517 break;
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 DR_IS_READ (dra)
3524 ? "Detected interleaving load %T and %T\n"
3525 : "Detected interleaving store %T and %T\n",
3526 DR_REF (dra), DR_REF (drb));
3528 /* Link the found element into the group list. */
3529 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3531 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3532 lastinfo = stmtinfo_a;
3534 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3535 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3536 lastinfo = stmtinfo_b;
3538 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3539 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3541 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "Load suitable for SLP vectorization only.\n");
3545 if (init_b == init_prev
3546 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3547 && dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "Queuing group with duplicate access for fixup\n");
3553 /* Fix up groups with duplicate entries by splitting them. */
3554 while (1)
3556 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3557 if (!(it != to_fixup.end ()))
3558 break;
3559 stmt_vec_info grp = *it;
3560 to_fixup.remove (grp);
3562 /* Find the earliest duplicate group member. */
3563 unsigned first_duplicate = -1u;
3564 stmt_vec_info next, g = grp;
3565 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3567 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3568 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3569 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3570 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3571 g = next;
3573 if (first_duplicate == -1U)
3574 continue;
3576 /* Then move all stmts after the first duplicate to a new group.
3577 Note this is a heuristic but one with the property that *it
3578 is fixed up completely. */
3579 g = grp;
3580 stmt_vec_info newgroup = NULL, ng = grp;
3581 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3583 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3585 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3586 if (!newgroup)
3587 newgroup = next;
3588 else
3589 DR_GROUP_NEXT_ELEMENT (ng) = next;
3590 ng = next;
3591 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3593 else
3594 g = DR_GROUP_NEXT_ELEMENT (g);
3596 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3598 /* Fixup the new group which still may contain duplicates. */
3599 to_fixup.add (newgroup);
3602 dr_vec_info *dr_info;
3603 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3605 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3606 && !vect_analyze_data_ref_access (vinfo, dr_info))
3608 if (dump_enabled_p ())
3609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3610 "not vectorized: complicated access pattern.\n");
3612 if (is_a <bb_vec_info> (vinfo))
3614 /* Mark the statement as not vectorizable. */
3615 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3616 continue;
3618 else
3620 datarefs_copy.release ();
3621 return opt_result::failure_at (dr_info->stmt->stmt,
3622 "not vectorized:"
3623 " complicated access pattern.\n");
3628 datarefs_copy.release ();
3629 return opt_result::success ();
3632 /* Function vect_vfa_segment_size.
3634 Input:
3635 DR_INFO: The data reference.
3636 LENGTH_FACTOR: segment length to consider.
3638 Return a value suitable for the dr_with_seg_len::seg_len field.
3639 This is the "distance travelled" by the pointer from the first
3640 iteration in the segment to the last. Note that it does not include
3641 the size of the access; in effect it only describes the first byte. */
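/* For example (illustrative numbers only): for a contiguous access with a
   DR_STEP of 4 bytes and a LENGTH_FACTOR of 4 iterations, the result is
   4 * (4 - 1) = 12 bytes, the distance from the first byte accessed in the
   first iteration of the segment to the first byte accessed in the last.  */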
3643 static tree
3644 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3646 length_factor = size_binop (MINUS_EXPR,
3647 fold_convert (sizetype, length_factor),
3648 size_one_node);
3649 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3650 length_factor);
3653 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3654 gives the worst-case number of bytes covered by the segment. */
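/* A rough worked example (hypothetical numbers): for the first statement of
   a group of four 4-byte accesses with a group gap of 1, the code below
   yields 4 * (4 - 1) = 12 bytes; if the access would use
   dr_explicit_realign_optimized, a further full vector minus one scalar
   element is added, since a whole vector may be read.  */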
3656 static unsigned HOST_WIDE_INT
3657 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3659 stmt_vec_info stmt_vinfo = dr_info->stmt;
3660 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3661 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3662 unsigned HOST_WIDE_INT access_size = ref_size;
3663 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3665 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3666 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3668 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3669 int misalignment;
3670 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3671 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3672 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3673 == dr_explicit_realign_optimized))
3675 /* We might access a full vector's worth. */
3676 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3678 return access_size;
3681 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3682 describes. */
3684 static unsigned int
3685 vect_vfa_align (dr_vec_info *dr_info)
3687 return dr_alignment (dr_info->dr);
3690 /* Function vect_no_alias_p.
3692 Given data references A and B with equal base and offset, see whether
3693 the alias relation can be decided at compilation time. Return 1 if
3694 it can and the references alias, 0 if it can and the references do
3695 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3696 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3697 of dr_with_seg_len::{seg_len,access_size} for A and B. */
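/* A small worked example (hypothetical values): with positive steps,
   DR_INITs of 0 and 16, segment lengths of 12 and access sizes of 4, the
   ranges tested below are [0, 16) and [16, 32), which cannot overlap, so 0
   is returned; with a second DR_INIT of 8 instead, the ranges [0, 16) and
   [8, 24) are known to overlap and 1 is returned.  */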
3699 static int
3700 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3701 tree segment_length_a, tree segment_length_b,
3702 unsigned HOST_WIDE_INT access_size_a,
3703 unsigned HOST_WIDE_INT access_size_b)
3705 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3706 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3707 poly_uint64 const_length_a;
3708 poly_uint64 const_length_b;
3710 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3711 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3712 [a, a+12) */
3713 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3715 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3716 offset_a -= const_length_a;
3718 else
3719 const_length_a = tree_to_poly_uint64 (segment_length_a);
3720 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3722 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3723 offset_b -= const_length_b;
3725 else
3726 const_length_b = tree_to_poly_uint64 (segment_length_b);
3728 const_length_a += access_size_a;
3729 const_length_b += access_size_b;
3731 if (ranges_known_overlap_p (offset_a, const_length_a,
3732 offset_b, const_length_b))
3733 return 1;
3735 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3736 offset_b, const_length_b))
3737 return 0;
3739 return -1;
3742 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3743 in DDR is >= VF. */
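/* For example (hypothetical distances): if every nonzero distance vector
   entry at LOOP_DEPTH is 8 or more and the chosen VF is 4, the dependence
   cannot be violated by vectorization, so the caller can drop the
   corresponding runtime alias check.  */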
3745 static bool
3746 dependence_distance_ge_vf (data_dependence_relation *ddr,
3747 unsigned int loop_depth, poly_uint64 vf)
3749 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3750 || DDR_NUM_DIST_VECTS (ddr) == 0)
3751 return false;
3753 /* If the dependence is exact, we should have limited the VF instead. */
3754 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3756 unsigned int i;
3757 lambda_vector dist_v;
3758 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3760 HOST_WIDE_INT dist = dist_v[loop_depth];
3761 if (dist != 0
3762 && !(dist > 0 && DDR_REVERSED_P (ddr))
3763 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3764 return false;
3767 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_NOTE, vect_location,
3769 "dependence distance between %T and %T is >= VF\n",
3770 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3772 return true;
3775 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3777 static void
3778 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3780 dump_printf (dump_kind, "%s (%T) >= ",
3781 lower_bound.unsigned_p ? "unsigned" : "abs",
3782 lower_bound.expr);
3783 dump_dec (dump_kind, lower_bound.min_value);
3786 /* Record that the vectorized loop requires the vec_lower_bound described
3787 by EXPR, UNSIGNED_P and MIN_VALUE. */
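/* For instance (illustrative only): if a lower bound of 4 has already been
   recorded for EXPR and a new request arrives asking for an unsigned bound
   of 8, the code below keeps a single entry whose min_value is the larger
   of the two, and the entry stays "unsigned" only if both requests were
   unsigned.  */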
3789 static void
3790 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3791 poly_uint64 min_value)
3793 vec<vec_lower_bound> &lower_bounds
3794 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3795 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3796 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3798 unsigned_p &= lower_bounds[i].unsigned_p;
3799 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3800 if (lower_bounds[i].unsigned_p != unsigned_p
3801 || maybe_lt (lower_bounds[i].min_value, min_value))
3803 lower_bounds[i].unsigned_p = unsigned_p;
3804 lower_bounds[i].min_value = min_value;
3805 if (dump_enabled_p ())
3807 dump_printf_loc (MSG_NOTE, vect_location,
3808 "updating run-time check to ");
3809 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3810 dump_printf (MSG_NOTE, "\n");
3813 return;
3816 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3817 if (dump_enabled_p ())
3819 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3820 dump_lower_bound (MSG_NOTE, lower_bound);
3821 dump_printf (MSG_NOTE, "\n");
3823 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3826 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3827 will span fewer than GAP bytes. */
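/* In other words (a sketch with hypothetical numbers): with an estimated
   vectorization factor of 4, a group of 2 accesses and a 4-byte scalar
   access size, any GAP of at most 4 * 2 * 4 = 32 bytes is considered
   small by the check below.  */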
3829 static bool
3830 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3831 poly_int64 gap)
3833 stmt_vec_info stmt_info = dr_info->stmt;
3834 HOST_WIDE_INT count
3835 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3836 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3837 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3838 return (estimated_poly_value (gap)
3839 <= count * vect_get_scalar_dr_size (dr_info));
3842 /* Return true if we know that there is no alias between DR_INFO_A and
3843 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3844 When returning true, set *LOWER_BOUND_OUT to this N. */
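/* For example (hypothetical layout): two 4-byte accesses off the same base,
   offset and step with DR_INITs 0 and 4 produce a lower bound of
   4 + 4 - 0 = 8, so the accesses cannot alias whenever
   abs (DR_STEP) >= 8.  */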
3846 static bool
3847 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3848 poly_uint64 *lower_bound_out)
3850 /* Check that there is a constant gap of known sign between DR_A
3851 and DR_B. */
3852 data_reference *dr_a = dr_info_a->dr;
3853 data_reference *dr_b = dr_info_b->dr;
3854 poly_int64 init_a, init_b;
3855 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3856 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3857 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3858 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3859 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3860 || !ordered_p (init_a, init_b))
3861 return false;
3863 /* Sort DR_A and DR_B by the address they access. */
3864 if (maybe_lt (init_b, init_a))
3866 std::swap (init_a, init_b);
3867 std::swap (dr_info_a, dr_info_b);
3868 std::swap (dr_a, dr_b);
3871 /* If the two accesses could be dependent within a scalar iteration,
3872 make sure that we'd retain their order. */
3873 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3874 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3875 return false;
3877 /* There is no alias if abs (DR_STEP) is greater than or equal to
3878 the bytes spanned by the combination of the two accesses. */
3879 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3880 return true;
3883 /* Function vect_prune_runtime_alias_test_list.
3885 Prune a list of ddrs to be tested at run-time by versioning for alias.
3886 Merge several alias checks into one if possible.
3887 Return FALSE if the resulting list of ddrs is longer than allowed by
3888 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
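/* A typical situation that reaches this point (illustrative, not taken
   from the source) is a loop over two pointers the compiler cannot prove
   distinct:

     void
     scale (int *dst, int *src, int n)
     {
       for (int i = 0; i < n; i++)
         dst[i] = src[i] * 2;
     }

   Without restrict qualifiers, vectorizing this needs a run-time check that
   the accessed segments of dst and src do not overlap; this function
   decides whether the set of such checks is cheap enough to emit.  */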
3890 opt_result
3891 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3893 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3894 hash_set <tree_pair_hash> compared_objects;
3896 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3897 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3898 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3899 const vec<vec_object_pair> &check_unequal_addrs
3900 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3901 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3902 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3904 ddr_p ddr;
3905 unsigned int i;
3906 tree length_factor;
3908 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3910 /* Step values are irrelevant for aliasing if the number of vector
3911 iterations is equal to the number of scalar iterations (which can
3912 happen for fully-SLP loops). */
3913 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3915 if (!vf_one_p)
3917 /* Convert the checks for nonzero steps into bound tests. */
3918 tree value;
3919 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3920 vect_check_lower_bound (loop_vinfo, value, true, 1);
3923 if (may_alias_ddrs.is_empty ())
3924 return opt_result::success ();
3926 comp_alias_ddrs.create (may_alias_ddrs.length ());
3928 unsigned int loop_depth
3929 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3930 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3932 /* First, we collect all data ref pairs for aliasing checks. */
3933 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3935 poly_uint64 lower_bound;
3936 tree segment_length_a, segment_length_b;
3937 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3938 unsigned int align_a, align_b;
3940 /* Ignore the alias if the VF we chose ended up being no greater
3941 than the dependence distance. */
3942 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3943 continue;
3945 if (DDR_OBJECT_A (ddr))
3947 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3948 if (!compared_objects.add (new_pair))
3950 if (dump_enabled_p ())
3951 dump_printf_loc (MSG_NOTE, vect_location,
3952 "checking that %T and %T"
3953 " have different addresses\n",
3954 new_pair.first, new_pair.second);
3955 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3957 continue;
3960 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3961 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3963 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3964 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3966 bool preserves_scalar_order_p
3967 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3968 bool ignore_step_p
3969 = (vf_one_p
3970 && (preserves_scalar_order_p
3971 || operand_equal_p (DR_STEP (dr_info_a->dr),
3972 DR_STEP (dr_info_b->dr))));
3974 /* Skip the pair if inter-iteration dependencies are irrelevant
3975 and intra-iteration dependencies are guaranteed to be honored. */
3976 if (ignore_step_p
3977 && (preserves_scalar_order_p
3978 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3979 &lower_bound)))
3981 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_NOTE, vect_location,
3983 "no need for alias check between "
3984 "%T and %T when VF is 1\n",
3985 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3986 continue;
3989 /* See whether we can handle the alias using a bounds check on
3990 the step, and whether that's likely to be the best approach.
3991 (It might not be, for example, if the minimum step is much larger
3992 than the number of bytes handled by one vector iteration.) */
3993 if (!ignore_step_p
3994 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3995 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3996 &lower_bound)
3997 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3998 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4000 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4001 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4004 "%T and %T when the step %T is outside ",
4005 DR_REF (dr_info_a->dr),
4006 DR_REF (dr_info_b->dr),
4007 DR_STEP (dr_info_a->dr));
4008 if (unsigned_p)
4009 dump_printf (MSG_NOTE, "[0");
4010 else
4012 dump_printf (MSG_NOTE, "(");
4013 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4015 dump_printf (MSG_NOTE, ", ");
4016 dump_dec (MSG_NOTE, lower_bound);
4017 dump_printf (MSG_NOTE, ")\n");
4019 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4020 unsigned_p, lower_bound);
4021 continue;
4024 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4025 if (dr_group_first_a)
4027 stmt_info_a = dr_group_first_a;
4028 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4031 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4032 if (dr_group_first_b)
4034 stmt_info_b = dr_group_first_b;
4035 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4038 if (ignore_step_p)
4040 segment_length_a = size_zero_node;
4041 segment_length_b = size_zero_node;
4043 else
4045 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4046 DR_STEP (dr_info_b->dr), 0))
4047 length_factor = scalar_loop_iters;
4048 else
4049 length_factor = size_int (vect_factor);
4050 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4051 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4053 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4054 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4055 align_a = vect_vfa_align (dr_info_a);
4056 align_b = vect_vfa_align (dr_info_b);
4058 /* See whether the alias is known at compilation time. */
4059 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4060 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4061 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4062 DR_OFFSET (dr_info_b->dr), 0)
4063 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4064 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4065 && poly_int_tree_p (segment_length_a)
4066 && poly_int_tree_p (segment_length_b))
4068 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4069 segment_length_a,
4070 segment_length_b,
4071 access_size_a,
4072 access_size_b);
4073 if (res >= 0 && dump_enabled_p ())
4075 dump_printf_loc (MSG_NOTE, vect_location,
4076 "can tell at compile time that %T and %T",
4077 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4078 if (res == 0)
4079 dump_printf (MSG_NOTE, " do not alias\n");
4080 else
4081 dump_printf (MSG_NOTE, " alias\n");
4084 if (res == 0)
4085 continue;
4087 if (res == 1)
4088 return opt_result::failure_at (stmt_info_b->stmt,
4089 "not vectorized:"
4090 " compilation time alias: %G%G",
4091 stmt_info_a->stmt,
4092 stmt_info_b->stmt);
4095 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4096 access_size_a, align_a);
4097 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4098 access_size_b, align_b);
4099 /* Canonicalize the order to be the one that's needed for accurate
4100 RAW, WAR and WAW flags, in cases where the data references are
4101 well-ordered. The order doesn't really matter otherwise,
4102 but we might as well be consistent. */
4103 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4104 std::swap (dr_a, dr_b);
4106 dr_with_seg_len_pair_t dr_with_seg_len_pair
4107 (dr_a, dr_b, (preserves_scalar_order_p
4108 ? dr_with_seg_len_pair_t::WELL_ORDERED
4109 : dr_with_seg_len_pair_t::REORDERED));
4111 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4114 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4116 unsigned int count = (comp_alias_ddrs.length ()
4117 + check_unequal_addrs.length ());
4119 if (count
4120 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4121 == VECT_COST_MODEL_VERY_CHEAP))
4122 return opt_result::failure_at
4123 (vect_location, "would need a runtime alias check\n");
4125 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_NOTE, vect_location,
4127 "improved number of alias checks from %d to %d\n",
4128 may_alias_ddrs.length (), count);
4129 unsigned limit = param_vect_max_version_for_alias_checks;
4130 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4131 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4132 if (count > limit)
4133 return opt_result::failure_at
4134 (vect_location,
4135 "number of versioning for alias run-time tests exceeds %d "
4136 "(--param vect-max-version-for-alias-checks)\n", limit);
4138 return opt_result::success ();
4141 /* Check whether we can use an internal function for a gather load
4142 or scatter store. READ_P is true for loads and false for stores.
4143 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4144 the type of the memory elements being loaded or stored. OFFSET_TYPE
4145 is the type of the offset that is being applied to the invariant
4146 base address. SCALE is the amount by which the offset should
4147 be multiplied *after* it has been converted to address width.
4149 Return true if the function is supported, storing the function id in
4150 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
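/* Note the retry loop below: if the target does not handle the given
   offset type, the offset is widened to double its precision (keeping its
   signedness) and tried again; we only give up once the precision is at
   least POINTER_SIZE and at least as wide as the vector elements. On a
   hypothetical target, an unsupported 16-bit offset would thus be retried
   as a 32-bit and then a 64-bit offset.  */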
4152 bool
4153 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4154 tree vectype, tree memory_type, tree offset_type,
4155 int scale, internal_fn *ifn_out,
4156 tree *offset_vectype_out)
4158 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4159 unsigned int element_bits = vector_element_bits (vectype);
4160 if (element_bits != memory_bits)
4161 /* For now the vector elements must be the same width as the
4162 memory elements. */
4163 return false;
4165 /* Work out which function we need. */
4166 internal_fn ifn, alt_ifn, alt_ifn2;
4167 if (read_p)
4169 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4170 alt_ifn = IFN_MASK_GATHER_LOAD;
4171 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4172 use MASK_LEN_GATHER_LOAD regardless of whether len and
4173 mask are valid or not. */
4174 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4176 else
4178 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4179 alt_ifn = IFN_MASK_SCATTER_STORE;
4180 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4181 use MASK_LEN_SCATTER_STORE regardless of whether len and
4182 mask are valid or not. */
4183 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4186 for (;;)
4188 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4189 if (!offset_vectype)
4190 return false;
4192 /* Test whether the target supports this combination. */
4193 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4194 offset_vectype, scale))
4196 *ifn_out = ifn;
4197 *offset_vectype_out = offset_vectype;
4198 return true;
4200 else if (!masked_p
4201 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4202 memory_type,
4203 offset_vectype,
4204 scale))
4206 *ifn_out = alt_ifn;
4207 *offset_vectype_out = offset_vectype;
4208 return true;
4210 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4211 memory_type,
4212 offset_vectype, scale))
4214 *ifn_out = alt_ifn2;
4215 *offset_vectype_out = offset_vectype;
4216 return true;
4219 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4220 && TYPE_PRECISION (offset_type) >= element_bits)
4221 return false;
4223 offset_type = build_nonstandard_integer_type
4224 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4228 /* STMT_INFO is a call to an internal gather load or scatter store function.
4229 Describe the operation in INFO. */
4231 static void
4232 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4233 gather_scatter_info *info)
4235 gcall *call = as_a <gcall *> (stmt_info->stmt);
4236 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4237 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4239 info->ifn = gimple_call_internal_fn (call);
4240 info->decl = NULL_TREE;
4241 info->base = gimple_call_arg (call, 0);
4242 info->offset = gimple_call_arg (call, 1);
4243 info->offset_dt = vect_unknown_def_type;
4244 info->offset_vectype = NULL_TREE;
4245 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4246 info->element_type = TREE_TYPE (vectype);
4247 info->memory_type = TREE_TYPE (DR_REF (dr));
4250 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4251 gather load or scatter store. Describe the operation in *INFO if so. */
4253 bool
4254 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4255 gather_scatter_info *info)
4257 HOST_WIDE_INT scale = 1;
4258 poly_int64 pbitpos, pbitsize;
4259 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4260 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4261 tree offtype = NULL_TREE;
4262 tree decl = NULL_TREE, base, off;
4263 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4264 tree memory_type = TREE_TYPE (DR_REF (dr));
4265 machine_mode pmode;
4266 int punsignedp, reversep, pvolatilep = 0;
4267 internal_fn ifn;
4268 tree offset_vectype;
4269 bool masked_p = false;
4271 /* See whether this is already a call to a gather/scatter internal function.
4272 If not, see whether it's a masked load or store. */
4273 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4274 if (call && gimple_call_internal_p (call))
4276 ifn = gimple_call_internal_fn (call);
4277 if (internal_gather_scatter_fn_p (ifn))
4279 vect_describe_gather_scatter_call (stmt_info, info);
4280 return true;
4282 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4285 /* True if we should aim to use internal functions rather than
4286 built-in functions. */
4287 bool use_ifn_p = (DR_IS_READ (dr)
4288 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4289 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4291 base = DR_REF (dr);
4292 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4293 see if we can use the def stmt of the address. */
4294 if (masked_p
4295 && TREE_CODE (base) == MEM_REF
4296 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4297 && integer_zerop (TREE_OPERAND (base, 1))
4298 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4300 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4301 if (is_gimple_assign (def_stmt)
4302 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4303 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4306 /* The gather and scatter builtins need address of the form
4307 loop_invariant + vector * {1, 2, 4, 8}
4308 or
4309 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4310 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4311 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4312 multiplications and additions in it. To get a vector, we need
4313 a single SSA_NAME that will be defined in the loop and will
4314 contain everything that is not loop invariant and that can be
4315 vectorized. The following code attempts to find such a preexisting
4316 SSA_NAME OFF and put the loop invariants into a tree BASE
4317 that can be gimplified before the loop. */
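/* As a concrete illustration (not part of the original comment), in

     for (int i = 0; i < n; i++)
       sum += a[idx[i]];

   the loop-invariant piece is the address of "a", while the SSA_NAME we
   are looking for is the value loaded from idx[i] (with sizeof (*a)
   becoming the scale); the code below peels invariant additions and
   scalings off OFF and into BASE.  */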
4318 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4319 &punsignedp, &reversep, &pvolatilep);
4320 if (reversep)
4321 return false;
4323 /* PR 107346. Packed structs can have fields at offsets that are not
4324 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4325 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4326 return false;
4328 /* We need to be able to form an address to the base which for example
4329 isn't possible for hard registers. */
4330 if (may_be_nonaddressable_p (base))
4331 return false;
4333 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4335 if (TREE_CODE (base) == MEM_REF)
4337 if (!integer_zerop (TREE_OPERAND (base, 1)))
4339 if (off == NULL_TREE)
4340 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4341 else
4342 off = size_binop (PLUS_EXPR, off,
4343 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4345 base = TREE_OPERAND (base, 0);
4347 else
4348 base = build_fold_addr_expr (base);
4350 if (off == NULL_TREE)
4351 off = size_zero_node;
4353 /* If base is not loop invariant, then either off is 0, in which case we
4354 start with just the constant offset in the loop invariant BASE and
4355 continue with base as OFF, or we give up.
4356 We could handle that case by gimplifying the addition of base + off
4357 into some SSA_NAME and using that as off, but for now punt. */
4358 if (!expr_invariant_in_loop_p (loop, base))
4360 if (!integer_zerop (off))
4361 return false;
4362 off = base;
4363 base = size_int (pbytepos);
4365 /* Otherwise put base + constant offset into the loop invariant BASE
4366 and continue with OFF. */
4367 else
4369 base = fold_convert (sizetype, base);
4370 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4373 /* OFF at this point may be either a SSA_NAME or some tree expression
4374 from get_inner_reference. Try to peel off loop invariants from it
4375 into BASE as long as possible. */
4376 STRIP_NOPS (off);
4377 while (offtype == NULL_TREE)
4379 enum tree_code code;
4380 tree op0, op1, add = NULL_TREE;
4382 if (TREE_CODE (off) == SSA_NAME)
4384 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4386 if (expr_invariant_in_loop_p (loop, off))
4387 return false;
4389 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4390 break;
4392 op0 = gimple_assign_rhs1 (def_stmt);
4393 code = gimple_assign_rhs_code (def_stmt);
4394 op1 = gimple_assign_rhs2 (def_stmt);
4396 else
4398 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4399 return false;
4400 code = TREE_CODE (off);
4401 extract_ops_from_tree (off, &code, &op0, &op1);
4403 switch (code)
4405 case POINTER_PLUS_EXPR:
4406 case PLUS_EXPR:
4407 if (expr_invariant_in_loop_p (loop, op0))
4409 add = op0;
4410 off = op1;
4411 do_add:
4412 add = fold_convert (sizetype, add);
4413 if (scale != 1)
4414 add = size_binop (MULT_EXPR, add, size_int (scale));
4415 base = size_binop (PLUS_EXPR, base, add);
4416 continue;
4418 if (expr_invariant_in_loop_p (loop, op1))
4420 add = op1;
4421 off = op0;
4422 goto do_add;
4424 break;
4425 case MINUS_EXPR:
4426 if (expr_invariant_in_loop_p (loop, op1))
4428 add = fold_convert (sizetype, op1);
4429 add = size_binop (MINUS_EXPR, size_zero_node, add);
4430 off = op0;
4431 goto do_add;
4433 break;
4434 case MULT_EXPR:
4435 if (scale == 1 && tree_fits_shwi_p (op1))
4437 int new_scale = tree_to_shwi (op1);
4438 /* Only treat this as a scaling operation if the target
4439 supports it for at least some offset type. */
4440 if (use_ifn_p
4441 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4442 masked_p, vectype, memory_type,
4443 signed_char_type_node,
4444 new_scale, &ifn,
4445 &offset_vectype)
4446 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4447 masked_p, vectype, memory_type,
4448 unsigned_char_type_node,
4449 new_scale, &ifn,
4450 &offset_vectype))
4451 break;
4452 scale = new_scale;
4453 off = op0;
4454 continue;
4456 break;
4457 case SSA_NAME:
4458 off = op0;
4459 continue;
4460 CASE_CONVERT:
4461 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4462 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4463 break;
4465 /* Don't include the conversion if the target is happy with
4466 the current offset type. */
4467 if (use_ifn_p
4468 && TREE_CODE (off) == SSA_NAME
4469 && !POINTER_TYPE_P (TREE_TYPE (off))
4470 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4471 masked_p, vectype, memory_type,
4472 TREE_TYPE (off), scale, &ifn,
4473 &offset_vectype))
4474 break;
4476 if (TYPE_PRECISION (TREE_TYPE (op0))
4477 == TYPE_PRECISION (TREE_TYPE (off)))
4479 off = op0;
4480 continue;
4483 /* Include the conversion if it is widening and we're using
4484 the IFN path, or the target can handle the converted-from
4485 offset, or the current size is not already the same as the
4486 data vector element size. */
4487 if ((TYPE_PRECISION (TREE_TYPE (op0))
4488 < TYPE_PRECISION (TREE_TYPE (off)))
4489 && (use_ifn_p
4490 || (DR_IS_READ (dr)
4491 ? (targetm.vectorize.builtin_gather
4492 && targetm.vectorize.builtin_gather (vectype,
4493 TREE_TYPE (op0),
4494 scale))
4495 : (targetm.vectorize.builtin_scatter
4496 && targetm.vectorize.builtin_scatter (vectype,
4497 TREE_TYPE (op0),
4498 scale)))
4499 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4500 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4502 off = op0;
4503 offtype = TREE_TYPE (off);
4504 STRIP_NOPS (off);
4505 continue;
4507 break;
4508 default:
4509 break;
4511 break;
4514 /* If at the end OFF still isn't a SSA_NAME or isn't
4515 defined in the loop, punt. */
4516 if (TREE_CODE (off) != SSA_NAME
4517 || expr_invariant_in_loop_p (loop, off))
4518 return false;
4520 if (offtype == NULL_TREE)
4521 offtype = TREE_TYPE (off);
4523 if (use_ifn_p)
4525 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4526 vectype, memory_type, offtype, scale,
4527 &ifn, &offset_vectype))
4528 ifn = IFN_LAST;
4529 decl = NULL_TREE;
4531 else
4533 if (DR_IS_READ (dr))
4535 if (targetm.vectorize.builtin_gather)
4536 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4538 else
4540 if (targetm.vectorize.builtin_scatter)
4541 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4543 ifn = IFN_LAST;
4544 /* The offset vector type will be read from DECL when needed. */
4545 offset_vectype = NULL_TREE;
4548 info->ifn = ifn;
4549 info->decl = decl;
4550 info->base = base;
4551 info->offset = off;
4552 info->offset_dt = vect_unknown_def_type;
4553 info->offset_vectype = offset_vectype;
4554 info->scale = scale;
4555 info->element_type = TREE_TYPE (vectype);
4556 info->memory_type = memory_type;
4557 return true;
4560 /* Find the data references in STMT, analyze them with respect to LOOP and
4561 append them to DATAREFS. Return false if datarefs in this stmt cannot
4562 be handled. */
4564 opt_result
4565 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4566 vec<data_reference_p> *datarefs,
4567 vec<int> *dataref_groups, int group_id)
4569 /* We can ignore clobbers for dataref analysis - they are removed during
4570 loop vectorization and BB vectorization checks dependences with a
4571 stmt walk. */
4572 if (gimple_clobber_p (stmt))
4573 return opt_result::success ();
4575 if (gimple_has_volatile_ops (stmt))
4576 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4577 stmt);
4579 if (stmt_can_throw_internal (cfun, stmt))
4580 return opt_result::failure_at (stmt,
4581 "not vectorized:"
4582 " statement can throw an exception: %G",
4583 stmt);
4585 auto_vec<data_reference_p, 2> refs;
4586 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4587 if (!res)
4588 return res;
4590 if (refs.is_empty ())
4591 return opt_result::success ();
4593 if (refs.length () > 1)
4595 while (!refs.is_empty ())
4596 free_data_ref (refs.pop ());
4597 return opt_result::failure_at (stmt,
4598 "not vectorized: more than one "
4599 "data ref in stmt: %G", stmt);
4602 data_reference_p dr = refs.pop ();
4603 if (gcall *call = dyn_cast <gcall *> (stmt))
4604 if (!gimple_call_internal_p (call)
4605 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4606 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4608 free_data_ref (dr);
4609 return opt_result::failure_at (stmt,
4610 "not vectorized: dr in a call %G", stmt);
4613 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4614 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4616 free_data_ref (dr);
4617 return opt_result::failure_at (stmt,
4618 "not vectorized:"
4619 " statement is an unsupported"
4620 " bitfield access %G", stmt);
4623 if (DR_BASE_ADDRESS (dr)
4624 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4626 free_data_ref (dr);
4627 return opt_result::failure_at (stmt,
4628 "not vectorized:"
4629 " base addr of dr is a constant\n");
4632 /* Check whether this may be a SIMD lane access and adjust the
4633 DR to make it easier for us to handle it. */
4634 if (loop
4635 && loop->simduid
4636 && (!DR_BASE_ADDRESS (dr)
4637 || !DR_OFFSET (dr)
4638 || !DR_INIT (dr)
4639 || !DR_STEP (dr)))
4641 struct data_reference *newdr
4642 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4643 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4644 if (DR_BASE_ADDRESS (newdr)
4645 && DR_OFFSET (newdr)
4646 && DR_INIT (newdr)
4647 && DR_STEP (newdr)
4648 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4649 && integer_zerop (DR_STEP (newdr)))
4651 tree base_address = DR_BASE_ADDRESS (newdr);
4652 tree off = DR_OFFSET (newdr);
4653 tree step = ssize_int (1);
4654 if (integer_zerop (off)
4655 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4657 off = TREE_OPERAND (base_address, 1);
4658 base_address = TREE_OPERAND (base_address, 0);
4660 STRIP_NOPS (off);
4661 if (TREE_CODE (off) == MULT_EXPR
4662 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4664 step = TREE_OPERAND (off, 1);
4665 off = TREE_OPERAND (off, 0);
4666 STRIP_NOPS (off);
4668 if (CONVERT_EXPR_P (off)
4669 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4670 < TYPE_PRECISION (TREE_TYPE (off))))
4671 off = TREE_OPERAND (off, 0);
4672 if (TREE_CODE (off) == SSA_NAME)
4674 gimple *def = SSA_NAME_DEF_STMT (off);
4675 /* Look through widening conversion. */
4676 if (is_gimple_assign (def)
4677 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4679 tree rhs1 = gimple_assign_rhs1 (def);
4680 if (TREE_CODE (rhs1) == SSA_NAME
4681 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4682 && (TYPE_PRECISION (TREE_TYPE (off))
4683 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4684 def = SSA_NAME_DEF_STMT (rhs1);
4686 if (is_gimple_call (def)
4687 && gimple_call_internal_p (def)
4688 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4690 tree arg = gimple_call_arg (def, 0);
4691 tree reft = TREE_TYPE (DR_REF (newdr));
4692 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4693 arg = SSA_NAME_VAR (arg);
4694 if (arg == loop->simduid
4695 /* For now. */
4696 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4698 DR_BASE_ADDRESS (newdr) = base_address;
4699 DR_OFFSET (newdr) = ssize_int (0);
4700 DR_STEP (newdr) = step;
4701 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4702 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4703 /* Mark as simd-lane access. */
4704 tree arg2 = gimple_call_arg (def, 1);
4705 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4706 free_data_ref (dr);
4707 datarefs->safe_push (newdr);
4708 if (dataref_groups)
4709 dataref_groups->safe_push (group_id);
4710 return opt_result::success ();
4715 free_data_ref (newdr);
4718 datarefs->safe_push (dr);
4719 if (dataref_groups)
4720 dataref_groups->safe_push (group_id);
4721 return opt_result::success ();
4724 /* Function vect_analyze_data_refs.
4726 Find all the data references in the loop or basic block.
4728 The general structure of the analysis of data refs in the vectorizer is as
4729 follows:
4730 1- vect_analyze_data_refs(loop/bb): call
4731 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4732 in the loop/bb and their dependences.
4733 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4734 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4735 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4739 opt_result
4740 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4742 class loop *loop = NULL;
4743 unsigned int i;
4744 struct data_reference *dr;
4745 tree scalar_type;
4747 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4749 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4750 loop = LOOP_VINFO_LOOP (loop_vinfo);
4752 /* Go through the data-refs, check that the analysis succeeded. Update
4753 pointer from stmt_vec_info struct to DR and vectype. */
4755 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4756 FOR_EACH_VEC_ELT (datarefs, i, dr)
4758 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4759 poly_uint64 vf;
4761 gcc_assert (DR_REF (dr));
4762 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4763 gcc_assert (!stmt_info->dr_aux.dr);
4764 stmt_info->dr_aux.dr = dr;
4765 stmt_info->dr_aux.stmt = stmt_info;
4767 /* Check that analysis of the data-ref succeeded. */
4768 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4769 || !DR_STEP (dr))
4771 bool maybe_gather
4772 = DR_IS_READ (dr)
4773 && !TREE_THIS_VOLATILE (DR_REF (dr));
4774 bool maybe_scatter
4775 = DR_IS_WRITE (dr)
4776 && !TREE_THIS_VOLATILE (DR_REF (dr));
4778 /* If the target supports vector gather loads or scatter stores,
4779 see whether they can be used. */
4780 if (is_a <loop_vec_info> (vinfo)
4781 && !nested_in_vect_loop_p (loop, stmt_info))
4783 if (maybe_gather || maybe_scatter)
4785 if (maybe_gather)
4786 gatherscatter = GATHER;
4787 else
4788 gatherscatter = SCATTER;
4792 if (gatherscatter == SG_NONE)
4794 if (dump_enabled_p ())
4795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4796 "not vectorized: data ref analysis "
4797 "failed %G", stmt_info->stmt);
4798 if (is_a <bb_vec_info> (vinfo))
4800 /* In BB vectorization the ref can still participate
4801 in dependence analysis, we just can't vectorize it. */
4802 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4803 continue;
4805 return opt_result::failure_at (stmt_info->stmt,
4806 "not vectorized:"
4807 " data ref analysis failed: %G",
4808 stmt_info->stmt);
4812 /* See if this was detected as SIMD lane access. */
4813 if (dr->aux == (void *)-1
4814 || dr->aux == (void *)-2
4815 || dr->aux == (void *)-3
4816 || dr->aux == (void *)-4)
4818 if (nested_in_vect_loop_p (loop, stmt_info))
4819 return opt_result::failure_at (stmt_info->stmt,
4820 "not vectorized:"
4821 " data ref analysis failed: %G",
4822 stmt_info->stmt);
4823 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4824 = -(uintptr_t) dr->aux;
4827 tree base = get_base_address (DR_REF (dr));
4828 if (base && VAR_P (base) && DECL_NONALIASED (base))
4830 if (dump_enabled_p ())
4831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4832 "not vectorized: base object not addressable "
4833 "for stmt: %G", stmt_info->stmt);
4834 if (is_a <bb_vec_info> (vinfo))
4836 /* In BB vectorization the ref can still participate
4837 in dependence analysis, we just can't vectorize it. */
4838 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4839 continue;
4841 return opt_result::failure_at (stmt_info->stmt,
4842 "not vectorized: base object not"
4843 " addressable for stmt: %G",
4844 stmt_info->stmt);
4847 if (is_a <loop_vec_info> (vinfo)
4848 && DR_STEP (dr)
4849 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4851 if (nested_in_vect_loop_p (loop, stmt_info))
4852 return opt_result::failure_at (stmt_info->stmt,
4853 "not vectorized: "
4854 "not suitable for strided load %G",
4855 stmt_info->stmt);
4856 STMT_VINFO_STRIDED_P (stmt_info) = true;
4859 /* Update DR field in stmt_vec_info struct. */
4861 /* If the dataref is in an inner-loop of the loop that is considered
4862 for vectorization, we also want to analyze the access relative to
4863 the outer-loop (DR contains information only relative to the
4864 inner-most enclosing loop). We do that by building a reference to the
4865 first location accessed by the inner-loop, and analyze it relative to
4866 the outer-loop. */
4867 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4869 /* Build a reference to the first location accessed by the
4870 inner loop: *(BASE + INIT + OFFSET). By construction,
4871 this address must be invariant in the inner loop, so we
4872 can consider it as being used in the outer loop. */
4873 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4874 tree offset = unshare_expr (DR_OFFSET (dr));
4875 tree init = unshare_expr (DR_INIT (dr));
4876 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4877 init, offset);
4878 tree init_addr = fold_build_pointer_plus (base, init_offset);
4879 tree init_ref = build_fold_indirect_ref (init_addr);
4881 if (dump_enabled_p ())
4882 dump_printf_loc (MSG_NOTE, vect_location,
4883 "analyze in outer loop: %T\n", init_ref);
4885 opt_result res
4886 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4887 init_ref, loop, stmt_info->stmt);
4888 if (!res)
4889 /* dr_analyze_innermost already explained the failure. */
4890 return res;
4892 if (dump_enabled_p ())
4893 dump_printf_loc (MSG_NOTE, vect_location,
4894 "\touter base_address: %T\n"
4895 "\touter offset from base address: %T\n"
4896 "\touter constant offset from base address: %T\n"
4897 "\touter step: %T\n"
4898 "\touter base alignment: %d\n\n"
4899 "\touter base misalignment: %d\n"
4900 "\touter offset alignment: %d\n"
4901 "\touter step alignment: %d\n",
4902 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4903 STMT_VINFO_DR_OFFSET (stmt_info),
4904 STMT_VINFO_DR_INIT (stmt_info),
4905 STMT_VINFO_DR_STEP (stmt_info),
4906 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4907 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4908 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4909 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4912 /* Set vectype for STMT. */
4913 scalar_type = TREE_TYPE (DR_REF (dr));
4914 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4915 if (!vectype)
4917 if (dump_enabled_p ())
4919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4920 "not vectorized: no vectype for stmt: %G",
4921 stmt_info->stmt);
4922 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4923 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4924 scalar_type);
4925 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4928 if (is_a <bb_vec_info> (vinfo))
4930 /* No vector type is fine, the ref can still participate
4931 in dependence analysis, we just can't vectorize it. */
4932 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4933 continue;
4935 if (fatal)
4936 *fatal = false;
4937 return opt_result::failure_at (stmt_info->stmt,
4938 "not vectorized:"
4939 " no vectype for stmt: %G"
4940 " scalar_type: %T\n",
4941 stmt_info->stmt, scalar_type);
4943 else
4945 if (dump_enabled_p ())
4946 dump_printf_loc (MSG_NOTE, vect_location,
4947 "got vectype for stmt: %G%T\n",
4948 stmt_info->stmt, vectype);
4951 /* Adjust the minimal vectorization factor according to the
4952 vector type. */
4953 vf = TYPE_VECTOR_SUBPARTS (vectype);
4954 *min_vf = upper_bound (*min_vf, vf);
4956 /* Leave the BB vectorizer to pick the vector type later, based on
4957 the final dataref group size and SLP node size. */
4958 if (is_a <loop_vec_info> (vinfo))
4959 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4961 if (gatherscatter != SG_NONE)
4963 gather_scatter_info gs_info;
4964 if (!vect_check_gather_scatter (stmt_info,
4965 as_a <loop_vec_info> (vinfo),
4966 &gs_info)
4967 || !get_vectype_for_scalar_type (vinfo,
4968 TREE_TYPE (gs_info.offset)))
4970 if (fatal)
4971 *fatal = false;
4972 return opt_result::failure_at
4973 (stmt_info->stmt,
4974 (gatherscatter == GATHER)
4975 ? "not vectorized: not suitable for gather load %G"
4976 : "not vectorized: not suitable for scatter store %G",
4977 stmt_info->stmt);
4979 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4983 /* We used to stop processing and prune the list here. Verify we no
4984 longer need to. */
4985 gcc_assert (i == datarefs.length ());
4987 return opt_result::success ();
4991 /* Function vect_get_new_vect_var.
4993 Returns a name for a new variable. The current naming scheme prepends
4994 the prefix "vect_", "vectp_", "stmp_" or "mask_" (depending on the
4995 value of VAR_KIND) to the name of vectorizer generated variables,
4996 using NAME as the suffix when it is provided. */
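/* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   creates a temporary named along the lines of "vectp_a", while a NULL
   NAME falls back to the bare prefix; "ptr_type" here just stands for
   whatever type the caller passes.  */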
4998 tree
4999 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5001 const char *prefix;
5002 tree new_vect_var;
5004 switch (var_kind)
5006 case vect_simple_var:
5007 prefix = "vect";
5008 break;
5009 case vect_scalar_var:
5010 prefix = "stmp";
5011 break;
5012 case vect_mask_var:
5013 prefix = "mask";
5014 break;
5015 case vect_pointer_var:
5016 prefix = "vectp";
5017 break;
5018 default:
5019 gcc_unreachable ();
5022 if (name)
5024 char* tmp = concat (prefix, "_", name, NULL);
5025 new_vect_var = create_tmp_reg (type, tmp);
5026 free (tmp);
5028 else
5029 new_vect_var = create_tmp_reg (type, prefix);
5031 return new_vect_var;
5034 /* Like vect_get_new_vect_var but return an SSA name. */
5036 tree
5037 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5039 const char *prefix;
5040 tree new_vect_var;
5042 switch (var_kind)
5044 case vect_simple_var:
5045 prefix = "vect";
5046 break;
5047 case vect_scalar_var:
5048 prefix = "stmp";
5049 break;
5050 case vect_pointer_var:
5051 prefix = "vectp";
5052 break;
5053 default:
5054 gcc_unreachable ();
5057 if (name)
5059 char* tmp = concat (prefix, "_", name, NULL);
5060 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5061 free (tmp);
5063 else
5064 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5066 return new_vect_var;
5069 /* Duplicate points-to info on NAME from DR_INFO. */
5071 static void
5072 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5074 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5075 /* DR_PTR_INFO is for a base SSA name, not including constant or
5076 variable offsets in the ref so its alignment info does not apply. */
5077 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5080 /* Function vect_create_addr_base_for_vector_ref.
5082 Create an expression that computes the address of the first memory location
5083 that will be accessed for a data reference.
5085 Input:
5086 STMT_INFO: The statement containing the data reference.
5087 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5088 OFFSET: Optional. If supplied, it is added to the initial address.
5089 LOOP: Specify relative to which loop-nest should the address be computed.
5090 For example, when the dataref is in an inner-loop nested in an
5091 outer-loop that is now being vectorized, LOOP can be either the
5092 outer-loop, or the inner-loop. The first memory location accessed
5093 by the following dataref ('in' points to short):
5095 for (i=0; i<N; i++)
5096 for (j=0; j<M; j++)
5097 s += in[i+j]
5099 is as follows:
5100 if LOOP=i_loop: &in (relative to i_loop)
5101 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5103 Output:
5104 1. Return an SSA_NAME whose value is the address of the memory location of
5105 the first vector of the data reference.
5106 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5107 these statement(s) which define the returned SSA_NAME.
5109 FORNOW: We are only handling array accesses with step 1. */
5111 tree
5112 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5113 gimple_seq *new_stmt_list,
5114 tree offset)
5116 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5117 struct data_reference *dr = dr_info->dr;
5118 const char *base_name;
5119 tree addr_base;
5120 tree dest;
5121 gimple_seq seq = NULL;
5122 tree vect_ptr_type;
5123 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5124 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5126 tree data_ref_base = unshare_expr (drb->base_address);
5127 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5128 tree init = unshare_expr (drb->init);
5130 if (loop_vinfo)
5131 base_name = get_name (data_ref_base);
5132 else
5134 base_offset = ssize_int (0);
5135 init = ssize_int (0);
5136 base_name = get_name (DR_REF (dr));
5139 /* Create base_offset */
5140 base_offset = size_binop (PLUS_EXPR,
5141 fold_convert (sizetype, base_offset),
5142 fold_convert (sizetype, init));
5144 if (offset)
5146 offset = fold_convert (sizetype, offset);
5147 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5148 base_offset, offset);
5151 /* base + base_offset */
5152 if (loop_vinfo)
5153 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5154 else
5155 addr_base = build1 (ADDR_EXPR,
5156 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5157 /* Strip zero offset components since we don't need
5158 them and they can confuse late diagnostics if
5159 we CSE them wrongly. See PR106904 for example. */
5160 unshare_expr (strip_zero_offset_components
5161 (DR_REF (dr))));
5163 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5164 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5165 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5166 gimple_seq_add_seq (new_stmt_list, seq);
5168 if (DR_PTR_INFO (dr)
5169 && TREE_CODE (addr_base) == SSA_NAME
5170 /* We should only duplicate pointer info to newly created SSA names. */
5171 && SSA_NAME_VAR (addr_base) == dest)
5173 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5174 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5177 if (dump_enabled_p ())
5178 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5180 return addr_base;
5184 /* Function vect_create_data_ref_ptr.
5186 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5187 location accessed in the loop by STMT_INFO, along with the def-use update
5188 chain to appropriately advance the pointer through the loop iterations.
5189 Also set aliasing information for the pointer. This pointer is used by
5190 the callers to this function to create a memory reference expression for
5191 vector load/store access.
5193 Input:
5194 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5195 GIMPLE_ASSIGN <name, data-ref> or
5196 GIMPLE_ASSIGN <data-ref, name>.
5197 2. AGGR_TYPE: the type of the reference, which should be either a vector
5198 or an array.
5199 3. AT_LOOP: the loop where the vector memref is to be created.
5200 4. OFFSET (optional): a byte offset to be added to the initial address
5201 accessed by the data-ref in STMT_INFO.
5202 5. BSI: location where the new stmts are to be placed if there is no loop
5203 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5204 pointing to the initial address.
5205 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5206 to the IV during each iteration of the loop. NULL says to move
5207 by one copy of AGGR_TYPE up or down, depending on the step of the
5208 data reference.
5210 Output:
5211 1. Declare a new ptr to vector_type, and have it point to the base of the
5212 data reference (initial address accessed by the data reference).
5213 For example, for vector of type V8HI, the following code is generated:
5215 v8hi *ap;
5216 ap = (v8hi *)initial_address;
5218 if OFFSET is not supplied:
5219 initial_address = &a[init];
5220 if OFFSET is supplied:
5221 initial_address = &a[init] + OFFSET;
5225 Return the initial_address in INITIAL_ADDRESS.
5227 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5228 update the pointer in each iteration of the loop.
5230 Return the increment stmt that updates the pointer in PTR_INCR.
5232 3. Return the pointer. */
5234 tree
5235 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5236 tree aggr_type, class loop *at_loop, tree offset,
5237 tree *initial_address, gimple_stmt_iterator *gsi,
5238 gimple **ptr_incr, bool only_init,
5239 tree iv_step)
5241 const char *base_name;
5242 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5243 class loop *loop = NULL;
5244 bool nested_in_vect_loop = false;
5245 class loop *containing_loop = NULL;
5246 tree aggr_ptr_type;
5247 tree aggr_ptr;
5248 tree new_temp;
5249 gimple_seq new_stmt_list = NULL;
5250 edge pe = NULL;
5251 basic_block new_bb;
5252 tree aggr_ptr_init;
5253 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5254 struct data_reference *dr = dr_info->dr;
5255 tree aptr;
5256 gimple_stmt_iterator incr_gsi;
5257 bool insert_after;
5258 tree indx_before_incr, indx_after_incr;
5259 gimple *incr;
5260 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5262 gcc_assert (iv_step != NULL_TREE
5263 || TREE_CODE (aggr_type) == ARRAY_TYPE
5264 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5266 if (loop_vinfo)
5268 loop = LOOP_VINFO_LOOP (loop_vinfo);
5269 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5270 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5271 pe = loop_preheader_edge (loop);
5273 else
5275 gcc_assert (bb_vinfo);
5276 only_init = true;
5277 *ptr_incr = NULL;
5280 /* Create an expression for the first address accessed by this load
5281 in LOOP. */
5282 base_name = get_name (DR_BASE_ADDRESS (dr));
5284 if (dump_enabled_p ())
5286 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5287 dump_printf_loc (MSG_NOTE, vect_location,
5288 "create %s-pointer variable to type: %T",
5289 get_tree_code_name (TREE_CODE (aggr_type)),
5290 aggr_type);
5291 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5292 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5293 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5294 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5295 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5296 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5297 else
5298 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5299 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5302 /* (1) Create the new aggregate-pointer variable.
5303 Vector and array types inherit the alias set of their component
5304 type by default so we need to use a ref-all pointer if the data
5305 reference does not conflict with the created aggregated data
5306 reference because it is not addressable. */
5307 bool need_ref_all = false;
5308 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5309 get_alias_set (DR_REF (dr))))
5310 need_ref_all = true;
5311 /* Likewise for any of the data references in the stmt group. */
5312 else if (DR_GROUP_SIZE (stmt_info) > 1)
5314 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5317 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5318 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5319 get_alias_set (DR_REF (sdr))))
5321 need_ref_all = true;
5322 break;
5324 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5326 while (sinfo);
5328 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5329 need_ref_all);
5330 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5333 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5334 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5335 def-use update cycles for the pointer: one relative to the outer-loop
5336 (LOOP), which is what steps (3) and (4) below do. The other is relative
5337 to the inner-loop (which is the inner-most loop containing the dataref),
5338 and this is done by step (5) below.
5340 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5341 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5342 redundant. Steps (3),(4) create the following:
5344 vp0 = &base_addr;
5345 LOOP: vp1 = phi(vp0,vp2)
5348 vp2 = vp1 + step
5349 goto LOOP
5351 If there is an inner-loop nested in loop, then step (5) will also be
5352 applied, and an additional update in the inner-loop will be created:
5354 vp0 = &base_addr;
5355 LOOP: vp1 = phi(vp0,vp2)
5357 inner: vp3 = phi(vp1,vp4)
5358 vp4 = vp3 + inner_step
5359 if () goto inner
5361 vp2 = vp1 + step
5362 if () goto LOOP */
5364 /* (2) Calculate the initial address of the aggregate-pointer, and set
5365 the aggregate-pointer to point to it before the loop. */
5367 /* Create: &(base[init_val] + offset) in the loop preheader. */
5369 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5370 stmt_info, &new_stmt_list,
5371 offset);
5372 if (new_stmt_list)
5374 if (pe)
5376 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5377 gcc_assert (!new_bb);
5379 else
5380 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5383 *initial_address = new_temp;
5384 aggr_ptr_init = new_temp;
5386 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5387 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5388 inner-loop nested in LOOP (during outer-loop vectorization). */
5390 /* No update in loop is required. */
5391 if (only_init && (!loop_vinfo || at_loop == loop))
5392 aptr = aggr_ptr_init;
5393 else
5395 /* Accesses to invariant addresses should be handled specially
5396 by the caller. */
5397 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5398 gcc_assert (!integer_zerop (step));
5400 if (iv_step == NULL_TREE)
5402 /* The step of the aggregate pointer is the type size,
5403 negated for downward accesses. */
5404 iv_step = TYPE_SIZE_UNIT (aggr_type);
5405 if (tree_int_cst_sgn (step) == -1)
5406 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5409 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5411 create_iv (aggr_ptr_init, PLUS_EXPR,
5412 fold_convert (aggr_ptr_type, iv_step),
5413 aggr_ptr, loop, &incr_gsi, insert_after,
5414 &indx_before_incr, &indx_after_incr);
5415 incr = gsi_stmt (incr_gsi);
5417 /* Copy the points-to information if it exists. */
5418 if (DR_PTR_INFO (dr))
5420 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5421 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5423 if (ptr_incr)
5424 *ptr_incr = incr;
5426 aptr = indx_before_incr;
5429 if (!nested_in_vect_loop || only_init)
5430 return aptr;
5433 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5434 nested in LOOP, if it exists. */
5436 gcc_assert (nested_in_vect_loop);
5437 if (!only_init)
5439 standard_iv_increment_position (containing_loop, &incr_gsi,
5440 &insert_after);
5441 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5442 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5443 &indx_before_incr, &indx_after_incr);
5444 incr = gsi_stmt (incr_gsi);
5446 /* Copy the points-to information if it exists. */
5447 if (DR_PTR_INFO (dr))
5449 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5450 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5452 if (ptr_incr)
5453 *ptr_incr = incr;
5455 return indx_before_incr;
5457 else
5458 gcc_unreachable ();
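
/* Editorial illustration (not part of GCC): the def-use cycle that steps
   (2)-(4) above set up, written as plain C for a v8hi-sized access with a
   positive unit step; the helper name and use of __builtin_memcpy are
   editorial.  */

static void
data_ref_ptr_model (const short *initial_address, short *dst, int niters)
{
  const short *vp = initial_address;	/* vp0 = &base_addr;  (preheader)  */
  for (int i = 0; i < niters; i++)	/* LOOP: vp1 = phi (vp0, vp2)      */
    {
      __builtin_memcpy (dst + 8 * i, vp, 8 * sizeof (short));
      vp += 8;				/* vp2 = vp1 + step                */
    }
}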
5462 /* Function bump_vector_ptr
5464 Increment a pointer (to a vector type) by vector-size. If requested,
5465 i.e. if PTR-INCR is given, then also connect the new increment stmt
5466 to the existing def-use update-chain of the pointer, by modifying
5467 the PTR_INCR as illustrated below:
5469 The pointer def-use update-chain before this function:
5470 DATAREF_PTR = phi (p_0, p_2)
5471 ....
5472 PTR_INCR: p_2 = DATAREF_PTR + step
5474 The pointer def-use update-chain after this function:
5475 DATAREF_PTR = phi (p_0, p_2)
5476 ....
5477 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5478 ....
5479 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5481 Input:
5482 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5483 in the loop.
5484 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5485 the loop. The increment amount across iterations is expected
5486 to be vector_size.
5487 BSI - location where the new update stmt is to be placed.
5488 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5489 BUMP - optional. The offset by which to bump the pointer. If not given,
5490 the offset is assumed to be vector_size.
5492 Output: Return NEW_DATAREF_PTR as illustrated above.
5496 tree
5497 bump_vector_ptr (vec_info *vinfo,
5498 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5499 stmt_vec_info stmt_info, tree bump)
5501 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5502 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5503 tree update = TYPE_SIZE_UNIT (vectype);
5504 gimple *incr_stmt;
5505 ssa_op_iter iter;
5506 use_operand_p use_p;
5507 tree new_dataref_ptr;
5509 if (bump)
5510 update = bump;
5512 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5513 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5514 else if (is_gimple_min_invariant (dataref_ptr))
5515 /* When possible avoid emitting a separate increment stmt that would
5516 force the addressed object to become addressable. */
5517 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5518 fold_build2 (MEM_REF,
5519 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5520 dataref_ptr,
5521 fold_convert (ptr_type_node, update)));
5522 else
5523 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5524 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5525 dataref_ptr, update);
5526 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5527 /* Fold the increment, avoiding excessively long use-def chains that
5528 lead to compile-time issues for passes run before the next forwprop
5529 pass, which would otherwise do this folding as well. */
5530 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5531 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5533 incr_stmt = gsi_stmt (fold_gsi);
5534 update_stmt (incr_stmt);
5537 /* Copy the points-to information if it exists. */
5538 if (DR_PTR_INFO (dr))
5540 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5541 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5544 if (!ptr_incr)
5545 return new_dataref_ptr;
5547 /* Update the vector-pointer's cross-iteration increment. */
5548 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5550 tree use = USE_FROM_PTR (use_p);
5552 if (use == dataref_ptr)
5553 SET_USE (use_p, new_dataref_ptr);
5554 else
5555 gcc_assert (operand_equal_p (use, update, 0));
5558 return new_dataref_ptr;
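
/* Editorial illustration (not part of GCC): the net effect of
   bump_vector_ptr on the pointer IV, as plain C.  The bumped pointer is a
   fresh name; the original cross-iteration increment (PTR_INCR) is rewired
   to start from it, so the advance across iterations stays the same.
   Helper name is hypothetical.  */

static const char *
bump_vector_ptr_model (const char *dataref_ptr, ptrdiff_t bump)
{
  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP; afterwards PTR_INCR computes
     p_2 = NEW_DATAREF_PTR + step instead of p_2 = DATAREF_PTR + step.  */
  return dataref_ptr + bump;
}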
5562 /* Copy memory reference info such as base/clique from the SRC reference
5563 to the DEST MEM_REF. */
5565 void
5566 vect_copy_ref_info (tree dest, tree src)
5568 if (TREE_CODE (dest) != MEM_REF)
5569 return;
5571 tree src_base = src;
5572 while (handled_component_p (src_base))
5573 src_base = TREE_OPERAND (src_base, 0);
5574 if (TREE_CODE (src_base) != MEM_REF
5575 && TREE_CODE (src_base) != TARGET_MEM_REF)
5576 return;
5578 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5579 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5583 /* Function vect_create_destination_var.
5585 Create a new temporary of type VECTYPE. */
5587 tree
5588 vect_create_destination_var (tree scalar_dest, tree vectype)
5590 tree vec_dest;
5591 const char *name;
5592 char *new_name;
5593 tree type;
5594 enum vect_var_kind kind;
5596 kind = vectype
5597 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5598 ? vect_mask_var
5599 : vect_simple_var
5600 : vect_scalar_var;
5601 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5603 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5605 name = get_name (scalar_dest);
5606 if (name)
5607 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5608 else
5609 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5610 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5611 free (new_name);
5613 return vec_dest;
5616 /* Function vect_grouped_store_supported.
5618 Returns TRUE if interleave high and interleave low permutations
5619 are supported, and FALSE otherwise. */
5621 bool
5622 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5624 machine_mode mode = TYPE_MODE (vectype);
5626 /* vect_permute_store_chain requires the group size to be equal to 3 or
5627 be a power of two. */
5628 if (count != 3 && exact_log2 (count) == -1)
5630 if (dump_enabled_p ())
5631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5632 "the size of the group of accesses"
5633 " is not a power of 2 or not eqaul to 3\n");
5634 return false;
5637 /* Check that the permutation is supported. */
5638 if (VECTOR_MODE_P (mode))
5640 unsigned int i;
5641 if (count == 3)
5643 unsigned int j0 = 0, j1 = 0, j2 = 0;
5644 unsigned int i, j;
5646 unsigned int nelt;
5647 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5651 "cannot handle groups of 3 stores for"
5652 " variable-length vectors\n");
5653 return false;
5656 vec_perm_builder sel (nelt, nelt, 1);
5657 sel.quick_grow (nelt);
5658 vec_perm_indices indices;
5659 for (j = 0; j < 3; j++)
5661 int nelt0 = ((3 - j) * nelt) % 3;
5662 int nelt1 = ((3 - j) * nelt + 1) % 3;
5663 int nelt2 = ((3 - j) * nelt + 2) % 3;
5664 for (i = 0; i < nelt; i++)
5666 if (3 * i + nelt0 < nelt)
5667 sel[3 * i + nelt0] = j0++;
5668 if (3 * i + nelt1 < nelt)
5669 sel[3 * i + nelt1] = nelt + j1++;
5670 if (3 * i + nelt2 < nelt)
5671 sel[3 * i + nelt2] = 0;
5673 indices.new_vector (sel, 2, nelt);
5674 if (!can_vec_perm_const_p (mode, mode, indices))
5676 if (dump_enabled_p ())
5677 dump_printf (MSG_MISSED_OPTIMIZATION,
5678 "permutation op not supported by target.\n");
5679 return false;
5682 for (i = 0; i < nelt; i++)
5684 if (3 * i + nelt0 < nelt)
5685 sel[3 * i + nelt0] = 3 * i + nelt0;
5686 if (3 * i + nelt1 < nelt)
5687 sel[3 * i + nelt1] = 3 * i + nelt1;
5688 if (3 * i + nelt2 < nelt)
5689 sel[3 * i + nelt2] = nelt + j2++;
5691 indices.new_vector (sel, 2, nelt);
5692 if (!can_vec_perm_const_p (mode, mode, indices))
5694 if (dump_enabled_p ())
5695 dump_printf (MSG_MISSED_OPTIMIZATION,
5696 "permutation op not supported by target.\n");
5697 return false;
5700 return true;
5702 else
5704 /* If length is not equal to 3 then only power of 2 is supported. */
5705 gcc_assert (pow2p_hwi (count));
5706 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5708 /* The encoding has 2 interleaved stepped patterns. */
5709 if (!multiple_p (nelt, 2))
5710 return false;
5711 vec_perm_builder sel (nelt, 2, 3);
5712 sel.quick_grow (6);
5713 for (i = 0; i < 3; i++)
5715 sel[i * 2] = i;
5716 sel[i * 2 + 1] = i + nelt;
5718 vec_perm_indices indices (sel, 2, nelt);
5719 if (can_vec_perm_const_p (mode, mode, indices))
5721 for (i = 0; i < 6; i++)
5722 sel[i] += exact_div (nelt, 2);
5723 indices.new_vector (sel, 2, nelt);
5724 if (can_vec_perm_const_p (mode, mode, indices))
5725 return true;
5730 if (dump_enabled_p ())
5731 dump_printf (MSG_MISSED_OPTIMIZATION,
5732 "permutation op not supported by target.\n");
5733 return false;
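
/* Editorial illustration (not part of GCC): the full masks that the
   2-pattern/3-element encoding checked above expands to for a constant
   NELT; e.g. NELT == 8 gives
     interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave_low:  { 4, 12, 5, 13, 6, 14, 7, 15 }.
   Helper name is hypothetical.  */

static void
build_interleave_masks (unsigned int nelt, unsigned int *high_sel,
			unsigned int *low_sel)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      high_sel[2 * i] = i;
      high_sel[2 * i + 1] = nelt + i;
      low_sel[2 * i] = nelt / 2 + i;
      low_sel[2 * i + 1] = nelt + nelt / 2 + i;
    }
}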
5736 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5737 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5739 internal_fn
5740 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5741 bool masked_p)
5743 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5744 vec_mask_len_store_lanes_optab, vectype,
5745 count))
5746 return IFN_MASK_LEN_STORE_LANES;
5747 else if (masked_p)
5749 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5750 vec_mask_store_lanes_optab, vectype,
5751 count))
5752 return IFN_MASK_STORE_LANES;
5754 else
5756 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5757 vec_store_lanes_optab, vectype, count))
5758 return IFN_STORE_LANES;
5760 return IFN_LAST;
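
/* Editorial illustration (not part of GCC): the memory effect of an
   unmasked store-lanes operation for COUNT == 3, modelled with scalars -
   the lanes of the COUNT vectors are stored interleaved, which is why
   store-lanes can replace the explicit permutations built by
   vect_permute_store_chain below.  Helper name is hypothetical.  */

static void
store_lanes_model (int *dst, const int *v0, const int *v1, const int *v2,
		   unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      dst[3 * i + 0] = v0[i];
      dst[3 * i + 1] = v1[i];
      dst[3 * i + 2] = v2[i];
    }
}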
5764 /* Function vect_permute_store_chain.
5766 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5767 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5768 the data correctly for the stores. Return the final references for stores
5769 in RESULT_CHAIN.
5771 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5772 The input is 4 vectors each containing 8 elements. We assign a number to
5773 each element, the input sequence is:
5775 1st vec: 0 1 2 3 4 5 6 7
5776 2nd vec: 8 9 10 11 12 13 14 15
5777 3rd vec: 16 17 18 19 20 21 22 23
5778 4th vec: 24 25 26 27 28 29 30 31
5780 The output sequence should be:
5782 1st vec: 0 8 16 24 1 9 17 25
5783 2nd vec: 2 10 18 26 3 11 19 27
5784 3rd vec: 4 12 20 28 5 13 21 29
5785 4th vec: 6 14 22 30 7 15 23 31
5787 i.e., we interleave the contents of the four vectors in their order.
5789 We use interleave_high/low instructions to create such output. The input of
5790 each interleave_high/low operation is two vectors:
5791 1st vec 2nd vec
5792 0 1 2 3 4 5 6 7
5793 the even elements of the result vector are obtained left-to-right from the
5794 high/low elements of the first vector. The odd elements of the result are
5795 obtained left-to-right from the high/low elements of the second vector.
5796 The output of interleave_high will be: 0 4 1 5
5797 and of interleave_low: 2 6 3 7
5800 The permutation is done in log LENGTH stages. In each stage interleave_high
5801 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5802 where the first argument is taken from the first half of DR_CHAIN and the
5803 second argument from its second half.
5804 In our example,
5806 I1: interleave_high (1st vec, 3rd vec)
5807 I2: interleave_low (1st vec, 3rd vec)
5808 I3: interleave_high (2nd vec, 4th vec)
5809 I4: interleave_low (2nd vec, 4th vec)
5811 The output for the first stage is:
5813 I1: 0 16 1 17 2 18 3 19
5814 I2: 4 20 5 21 6 22 7 23
5815 I3: 8 24 9 25 10 26 11 27
5816 I4: 12 28 13 29 14 30 15 31
5818 The output of the second stage, i.e. the final result is:
5820 I1: 0 8 16 24 1 9 17 25
5821 I2: 2 10 18 26 3 11 19 27
5822 I3: 4 12 20 28 5 13 21 29
5823 I4: 6 14 22 30 7 15 23 31. */
5825 void
5826 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5827 unsigned int length,
5828 stmt_vec_info stmt_info,
5829 gimple_stmt_iterator *gsi,
5830 vec<tree> *result_chain)
5832 tree vect1, vect2, high, low;
5833 gimple *perm_stmt;
5834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5835 tree perm_mask_low, perm_mask_high;
5836 tree data_ref;
5837 tree perm3_mask_low, perm3_mask_high;
5838 unsigned int i, j, n, log_length = exact_log2 (length);
5840 result_chain->quick_grow (length);
5841 memcpy (result_chain->address (), dr_chain.address (),
5842 length * sizeof (tree));
5844 if (length == 3)
5846 /* vect_grouped_store_supported ensures that this is constant. */
5847 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5848 unsigned int j0 = 0, j1 = 0, j2 = 0;
5850 vec_perm_builder sel (nelt, nelt, 1);
5851 sel.quick_grow (nelt);
5852 vec_perm_indices indices;
5853 for (j = 0; j < 3; j++)
5855 int nelt0 = ((3 - j) * nelt) % 3;
5856 int nelt1 = ((3 - j) * nelt + 1) % 3;
5857 int nelt2 = ((3 - j) * nelt + 2) % 3;
5859 for (i = 0; i < nelt; i++)
5861 if (3 * i + nelt0 < nelt)
5862 sel[3 * i + nelt0] = j0++;
5863 if (3 * i + nelt1 < nelt)
5864 sel[3 * i + nelt1] = nelt + j1++;
5865 if (3 * i + nelt2 < nelt)
5866 sel[3 * i + nelt2] = 0;
5868 indices.new_vector (sel, 2, nelt);
5869 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5871 for (i = 0; i < nelt; i++)
5873 if (3 * i + nelt0 < nelt)
5874 sel[3 * i + nelt0] = 3 * i + nelt0;
5875 if (3 * i + nelt1 < nelt)
5876 sel[3 * i + nelt1] = 3 * i + nelt1;
5877 if (3 * i + nelt2 < nelt)
5878 sel[3 * i + nelt2] = nelt + j2++;
5880 indices.new_vector (sel, 2, nelt);
5881 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5883 vect1 = dr_chain[0];
5884 vect2 = dr_chain[1];
5886 /* Create interleaving stmt:
5887 low = VEC_PERM_EXPR <vect1, vect2,
5888 {j, nelt, *, j + 1, nelt + j + 1, *,
5889 j + 2, nelt + j + 2, *, ...}> */
5890 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5891 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5892 vect2, perm3_mask_low);
5893 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5895 vect1 = data_ref;
5896 vect2 = dr_chain[2];
5897 /* Create interleaving stmt:
5898 low = VEC_PERM_EXPR <vect1, vect2,
5899 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5900 6, 7, nelt + j + 2, ...}> */
5901 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5902 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5903 vect2, perm3_mask_high);
5904 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5905 (*result_chain)[j] = data_ref;
5908 else
5910 /* If length is not equal to 3 then only power of 2 is supported. */
5911 gcc_assert (pow2p_hwi (length));
5913 /* The encoding has 2 interleaved stepped patterns. */
5914 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5915 vec_perm_builder sel (nelt, 2, 3);
5916 sel.quick_grow (6);
5917 for (i = 0; i < 3; i++)
5919 sel[i * 2] = i;
5920 sel[i * 2 + 1] = i + nelt;
5922 vec_perm_indices indices (sel, 2, nelt);
5923 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5925 for (i = 0; i < 6; i++)
5926 sel[i] += exact_div (nelt, 2);
5927 indices.new_vector (sel, 2, nelt);
5928 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5930 for (i = 0, n = log_length; i < n; i++)
5932 for (j = 0; j < length/2; j++)
5934 vect1 = dr_chain[j];
5935 vect2 = dr_chain[j+length/2];
5937 /* Create interleaving stmt:
5938 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5939 ...}> */
5940 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5941 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5942 vect2, perm_mask_high);
5943 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5944 (*result_chain)[2*j] = high;
5946 /* Create interleaving stmt:
5947 low = VEC_PERM_EXPR <vect1, vect2,
5948 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5949 ...}> */
5950 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5951 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5952 vect2, perm_mask_low);
5953 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5954 (*result_chain)[2*j+1] = low;
5956 memcpy (dr_chain.address (), result_chain->address (),
5957 length * sizeof (tree));
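
/* Editorial illustration (not part of GCC): a scalar model of the
   power-of-2 branch above.  log2(LEN) interleave_high/low stages turn LEN
   vectors of NELT elements (stored consecutively in VECS) into the
   interleaved order shown in the function comment.  SCRATCH must hold
   LEN * NELT elements; helper name and memcpy use are editorial.  */

static void
permute_store_chain_model (int *vecs, unsigned int len, unsigned int nelt,
			   int *scratch)
{
  for (unsigned int stage = 1; stage < len; stage *= 2)
    {
      for (unsigned int j = 0; j < len / 2; j++)
	{
	  const int *v1 = vecs + j * nelt;
	  const int *v2 = vecs + (j + len / 2) * nelt;
	  int *high = scratch + (2 * j) * nelt;
	  int *low = scratch + (2 * j + 1) * nelt;
	  for (unsigned int i = 0; i < nelt / 2; i++)
	    {
	      high[2 * i] = v1[i];		/* interleave_high  */
	      high[2 * i + 1] = v2[i];
	      low[2 * i] = v1[nelt / 2 + i];	/* interleave_low   */
	      low[2 * i + 1] = v2[nelt / 2 + i];
	    }
	}
      __builtin_memcpy (vecs, scratch, len * nelt * sizeof (int));
    }
}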
5962 /* Function vect_setup_realignment
5964 This function is called when vectorizing an unaligned load using
5965 the dr_explicit_realign[_optimized] scheme.
5966 This function generates the following code at the loop prolog:
5968 p = initial_addr;
5969 x msq_init = *(floor(p)); # prolog load
5970 realignment_token = call target_builtin;
5971 loop:
5972 x msq = phi (msq_init, ---)
5974 The stmts marked with x are generated only for the case of
5975 dr_explicit_realign_optimized.
5977 The code above sets up a new (vector) pointer, pointing to the first
5978 location accessed by STMT_INFO, and a "floor-aligned" load using that
5979 pointer. It also generates code to compute the "realignment-token"
5980 (if the relevant target hook was defined), and creates a phi-node at the
5981 loop-header bb whose arguments are the result of the prolog-load (created
5982 by this function) and the result of a load that takes place in the loop
5983 (to be created by the caller to this function).
5985 For the case of dr_explicit_realign_optimized:
5986 The caller to this function uses the phi-result (msq) to create the
5987 realignment code inside the loop, and sets up the missing phi argument,
5988 as follows:
5989 loop:
5990 msq = phi (msq_init, lsq)
5991 lsq = *(floor(p')); # load in loop
5992 result = realign_load (msq, lsq, realignment_token);
5994 For the case of dr_explicit_realign:
5995 loop:
5996 msq = *(floor(p)); # load in loop
5997 p' = p + (VS-1);
5998 lsq = *(floor(p')); # load in loop
5999 result = realign_load (msq, lsq, realignment_token);
6001 Input:
6002 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6003 a memory location that may be unaligned.
6004 BSI - place where new code is to be inserted.
6005 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6006 is used.
6008 Output:
6009 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6010 target hook, if defined.
6011 Return value - the result of the loop-header phi node. */
6013 tree
6014 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6015 gimple_stmt_iterator *gsi, tree *realignment_token,
6016 enum dr_alignment_support alignment_support_scheme,
6017 tree init_addr,
6018 class loop **at_loop)
6020 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6021 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6022 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6023 struct data_reference *dr = dr_info->dr;
6024 class loop *loop = NULL;
6025 edge pe = NULL;
6026 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6027 tree vec_dest;
6028 gimple *inc;
6029 tree ptr;
6030 tree data_ref;
6031 basic_block new_bb;
6032 tree msq_init = NULL_TREE;
6033 tree new_temp;
6034 gphi *phi_stmt;
6035 tree msq = NULL_TREE;
6036 gimple_seq stmts = NULL;
6037 bool compute_in_loop = false;
6038 bool nested_in_vect_loop = false;
6039 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6040 class loop *loop_for_initial_load = NULL;
6042 if (loop_vinfo)
6044 loop = LOOP_VINFO_LOOP (loop_vinfo);
6045 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6048 gcc_assert (alignment_support_scheme == dr_explicit_realign
6049 || alignment_support_scheme == dr_explicit_realign_optimized);
6051 /* We need to generate three things:
6052 1. the misalignment computation
6053 2. the extra vector load (for the optimized realignment scheme).
6054 3. the phi node for the two vectors from which the realignment is
6055 done (for the optimized realignment scheme). */
6057 /* 1. Determine where to generate the misalignment computation.
6059 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6060 calculation will be generated by this function, outside the loop (in the
6061 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6062 caller, inside the loop.
6064 Background: If the misalignment remains fixed throughout the iterations of
6065 the loop, then both realignment schemes are applicable, and also the
6066 misalignment computation can be done outside LOOP. This is because we are
6067 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6068 are a multiple of VS (the Vector Size), and therefore the misalignment in
6069 different vectorized LOOP iterations is always the same.
6070 The problem arises only if the memory access is in an inner-loop nested
6071 inside LOOP, which is now being vectorized using outer-loop vectorization.
6072 This is the only case when the misalignment of the memory access may not
6073 remain fixed throughout the iterations of the inner-loop (as explained in
6074 detail in vect_supportable_dr_alignment). In this case, not only is the
6075 optimized realignment scheme not applicable, but also the misalignment
6076 computation (and generation of the realignment token that is passed to
6077 REALIGN_LOAD) have to be done inside the loop.
6079 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6080 or not, which in turn determines if the misalignment is computed inside
6081 the inner-loop, or outside LOOP. */
6083 if (init_addr != NULL_TREE || !loop_vinfo)
6085 compute_in_loop = true;
6086 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6090 /* 2. Determine where to generate the extra vector load.
6092 For the optimized realignment scheme, instead of generating two vector
6093 loads in each iteration, we generate a single extra vector load in the
6094 preheader of the loop, and in each iteration reuse the result of the
6095 vector load from the previous iteration. In case the memory access is in
6096 an inner-loop nested inside LOOP, which is now being vectorized using
6097 outer-loop vectorization, we need to determine whether this initial vector
6098 load should be generated at the preheader of the inner-loop, or can be
6099 generated at the preheader of LOOP. If the memory access has no evolution
6100 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6101 to be generated inside LOOP (in the preheader of the inner-loop). */
6103 if (nested_in_vect_loop)
6105 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6106 bool invariant_in_outerloop =
6107 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6108 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6110 else
6111 loop_for_initial_load = loop;
6112 if (at_loop)
6113 *at_loop = loop_for_initial_load;
6115 tree vuse = NULL_TREE;
6116 if (loop_for_initial_load)
6118 pe = loop_preheader_edge (loop_for_initial_load);
6119 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6120 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6122 if (!vuse)
6123 vuse = gimple_vuse (gsi_stmt (*gsi));
6125 /* 3. For the case of the optimized realignment, create the first vector
6126 load at the loop preheader. */
6128 if (alignment_support_scheme == dr_explicit_realign_optimized)
6130 /* Create msq_init = *(floor(p1)) in the loop preheader */
6131 gassign *new_stmt;
6133 gcc_assert (!compute_in_loop);
6134 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6135 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6136 loop_for_initial_load, NULL_TREE,
6137 &init_addr, NULL, &inc, true);
6138 if (TREE_CODE (ptr) == SSA_NAME)
6139 new_temp = copy_ssa_name (ptr);
6140 else
6141 new_temp = make_ssa_name (TREE_TYPE (ptr));
6142 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6143 tree type = TREE_TYPE (ptr);
6144 new_stmt = gimple_build_assign
6145 (new_temp, BIT_AND_EXPR, ptr,
6146 fold_build2 (MINUS_EXPR, type,
6147 build_int_cst (type, 0),
6148 build_int_cst (type, align)));
6149 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6150 gcc_assert (!new_bb);
6151 data_ref
6152 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6153 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6154 vect_copy_ref_info (data_ref, DR_REF (dr));
6155 new_stmt = gimple_build_assign (vec_dest, data_ref);
6156 new_temp = make_ssa_name (vec_dest, new_stmt);
6157 gimple_assign_set_lhs (new_stmt, new_temp);
6158 gimple_set_vuse (new_stmt, vuse);
6159 if (pe)
6161 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6162 gcc_assert (!new_bb);
6164 else
6165 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6167 msq_init = gimple_assign_lhs (new_stmt);
6170 /* 4. Create realignment token using a target builtin, if available.
6171 It is done either inside the containing loop, or before LOOP (as
6172 determined above). */
6174 if (targetm.vectorize.builtin_mask_for_load)
6176 gcall *new_stmt;
6177 tree builtin_decl;
6179 /* Compute INIT_ADDR - the initial address accessed by this memref. */
6180 if (!init_addr)
6182 /* Generate the INIT_ADDR computation outside LOOP. */
6183 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6184 stmt_info, &stmts,
6185 NULL_TREE);
6186 if (loop)
6188 pe = loop_preheader_edge (loop);
6189 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6190 gcc_assert (!new_bb);
6192 else
6193 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6196 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6197 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6198 vec_dest =
6199 vect_create_destination_var (scalar_dest,
6200 gimple_call_return_type (new_stmt));
6201 new_temp = make_ssa_name (vec_dest, new_stmt);
6202 gimple_call_set_lhs (new_stmt, new_temp);
6204 if (compute_in_loop)
6205 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6206 else
6208 /* Generate the misalignment computation outside LOOP. */
6209 pe = loop_preheader_edge (loop);
6210 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6211 gcc_assert (!new_bb);
6214 *realignment_token = gimple_call_lhs (new_stmt);
6216 /* The result of the CALL_EXPR to this builtin is determined from
6217 the value of the parameter and no global variables are touched
6218 which makes the builtin a "const" function. Requiring the
6219 builtin to have the "const" attribute makes it unnecessary
6220 to call mark_call_clobbered. */
6221 gcc_assert (TREE_READONLY (builtin_decl));
6224 if (alignment_support_scheme == dr_explicit_realign)
6225 return msq;
6227 gcc_assert (!compute_in_loop);
6228 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6231 /* 5. Create msq = phi <msq_init, lsq> in loop */
6233 pe = loop_preheader_edge (containing_loop);
6234 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6235 msq = make_ssa_name (vec_dest);
6236 phi_stmt = create_phi_node (msq, containing_loop->header);
6237 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6239 return msq;
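
/* Editorial illustration (not part of GCC): a scalar model of the
   dr_explicit_realign scheme set up above, assuming 16-byte vectors of
   unsigned char.  Two aligned loads bracket the unaligned address and the
   realignment "token" (modelled here simply as the misalignment in bytes)
   selects the bytes of the unaligned vector; when P is already aligned the
   second load is not used.  Helper name is hypothetical.  */

static void
realign_load_model (const unsigned char *p, unsigned char result[16])
{
  const unsigned char *msq = (const unsigned char *)
    ((__UINTPTR_TYPE__) p & ~(__UINTPTR_TYPE__) 15);	/* msq = *(floor (p))      */
  unsigned int token = p - msq;				/* realignment_token       */
  const unsigned char *lsq = msq + 16;			/* lsq = *(floor (p + 15)) */
  for (unsigned int i = 0; i < 16; i++)			/* realign_load            */
    result[i] = i + token < 16 ? msq[i + token] : lsq[i + token - 16];
}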
6243 /* Function vect_grouped_load_supported.
6245 COUNT is the size of the load group (the number of statements plus the
6246 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6247 only one statement, with a gap of COUNT - 1.
6249 Returns true if a suitable permute exists. */
6251 bool
6252 vect_grouped_load_supported (tree vectype, bool single_element_p,
6253 unsigned HOST_WIDE_INT count)
6255 machine_mode mode = TYPE_MODE (vectype);
6257 /* If this is single-element interleaving with an element distance
6258 that leaves unused vector loads around punt - we at least create
6259 very sub-optimal code in that case (and blow up memory,
6260 see PR65518). */
6261 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6263 if (dump_enabled_p ())
6264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265 "single-element interleaving not supported "
6266 "for not adjacent vector loads\n");
6267 return false;
6270 /* vect_permute_load_chain requires the group size to be equal to 3 or
6271 be a power of two. */
6272 if (count != 3 && exact_log2 (count) == -1)
6274 if (dump_enabled_p ())
6275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6276 "the size of the group of accesses"
6277 " is not a power of 2 or not equal to 3\n");
6278 return false;
6281 /* Check that the permutation is supported. */
6282 if (VECTOR_MODE_P (mode))
6284 unsigned int i, j;
6285 if (count == 3)
6287 unsigned int nelt;
6288 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6292 "cannot handle groups of 3 loads for"
6293 " variable-length vectors\n");
6294 return false;
6297 vec_perm_builder sel (nelt, nelt, 1);
6298 sel.quick_grow (nelt);
6299 vec_perm_indices indices;
6300 unsigned int k;
6301 for (k = 0; k < 3; k++)
6303 for (i = 0; i < nelt; i++)
6304 if (3 * i + k < 2 * nelt)
6305 sel[i] = 3 * i + k;
6306 else
6307 sel[i] = 0;
6308 indices.new_vector (sel, 2, nelt);
6309 if (!can_vec_perm_const_p (mode, mode, indices))
6311 if (dump_enabled_p ())
6312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6313 "shuffle of 3 loads is not supported by"
6314 " target\n");
6315 return false;
6317 for (i = 0, j = 0; i < nelt; i++)
6318 if (3 * i + k < 2 * nelt)
6319 sel[i] = i;
6320 else
6321 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6322 indices.new_vector (sel, 2, nelt);
6323 if (!can_vec_perm_const_p (mode, mode, indices))
6325 if (dump_enabled_p ())
6326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6327 "shuffle of 3 loads is not supported by"
6328 " target\n");
6329 return false;
6332 return true;
6334 else
6336 /* If length is not equal to 3 then only power of 2 is supported. */
6337 gcc_assert (pow2p_hwi (count));
6338 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6340 /* The encoding has a single stepped pattern. */
6341 vec_perm_builder sel (nelt, 1, 3);
6342 sel.quick_grow (3);
6343 for (i = 0; i < 3; i++)
6344 sel[i] = i * 2;
6345 vec_perm_indices indices (sel, 2, nelt);
6346 if (can_vec_perm_const_p (mode, mode, indices))
6348 for (i = 0; i < 3; i++)
6349 sel[i] = i * 2 + 1;
6350 indices.new_vector (sel, 2, nelt);
6351 if (can_vec_perm_const_p (mode, mode, indices))
6352 return true;
6357 if (dump_enabled_p ())
6358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6359 "extract even/odd not supported by target\n");
6360 return false;
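
/* Editorial illustration (not part of GCC): the full masks that the single
   stepped pattern checked above expands to for a constant NELT; e.g.
   NELT == 8 gives
     extract_even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     extract_odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }
   i.e. the even/odd elements of the two-vector concatenation.  Helper name
   is hypothetical.  */

static void
build_extract_even_odd_masks (unsigned int nelt, unsigned int *even_sel,
			      unsigned int *odd_sel)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      even_sel[i] = 2 * i;
      odd_sel[i] = 2 * i + 1;
    }
}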
6363 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6364 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6366 internal_fn
6367 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6368 bool masked_p)
6370 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6371 vec_mask_len_load_lanes_optab, vectype,
6372 count))
6373 return IFN_MASK_LEN_LOAD_LANES;
6374 else if (masked_p)
6376 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6377 vec_mask_load_lanes_optab, vectype,
6378 count))
6379 return IFN_MASK_LOAD_LANES;
6381 else
6383 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6384 vectype, count))
6385 return IFN_LOAD_LANES;
6387 return IFN_LAST;
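
/* Editorial illustration (not part of GCC): the memory effect of an
   unmasked load-lanes operation for COUNT == 3, modelled with scalars -
   consecutive groups of COUNT elements are split into one lane per vector,
   which is why load-lanes can replace the permutations built by
   vect_permute_load_chain below.  Helper name is hypothetical.  */

static void
load_lanes_model (const int *src, int *v0, int *v1, int *v2,
		  unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      v0[i] = src[3 * i + 0];
      v1[i] = src[3 * i + 1];
      v2[i] = src[3 * i + 2];
    }
}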
6390 /* Function vect_permute_load_chain.
6392 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6393 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6394 the input data correctly. Return the final references for loads in
6395 RESULT_CHAIN.
6397 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6398 The input is 4 vectors each containing 8 elements. We assign a number to each
6399 element, the input sequence is:
6401 1st vec: 0 1 2 3 4 5 6 7
6402 2nd vec: 8 9 10 11 12 13 14 15
6403 3rd vec: 16 17 18 19 20 21 22 23
6404 4th vec: 24 25 26 27 28 29 30 31
6406 The output sequence should be:
6408 1st vec: 0 4 8 12 16 20 24 28
6409 2nd vec: 1 5 9 13 17 21 25 29
6410 3rd vec: 2 6 10 14 18 22 26 30
6411 4th vec: 3 7 11 15 19 23 27 31
6413 i.e., the first output vector should contain the first elements of each
6414 interleaving group, etc.
6416 We use extract_even/odd instructions to create such output. The input of
6417 each extract_even/odd operation is two vectors
6418 1st vec 2nd vec
6419 0 1 2 3 4 5 6 7
6421 and the output is the vector of extracted even/odd elements. The output of
6422 extract_even will be: 0 2 4 6
6423 and of extract_odd: 1 3 5 7
6426 The permutation is done in log LENGTH stages. In each stage extract_even
6427 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6428 their order. In our example,
6430 E1: extract_even (1st vec, 2nd vec)
6431 E2: extract_odd (1st vec, 2nd vec)
6432 E3: extract_even (3rd vec, 4th vec)
6433 E4: extract_odd (3rd vec, 4th vec)
6435 The output for the first stage will be:
6437 E1: 0 2 4 6 8 10 12 14
6438 E2: 1 3 5 7 9 11 13 15
6439 E3: 16 18 20 22 24 26 28 30
6440 E4: 17 19 21 23 25 27 29 31
6442 In order to proceed and create the correct sequence for the next stage (or
6443 for the correct output, if the second stage is the last one, as in our
6444 example), we first put the output of extract_even operation and then the
6445 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6446 The input for the second stage is:
6448 1st vec (E1): 0 2 4 6 8 10 12 14
6449 2nd vec (E3): 16 18 20 22 24 26 28 30
6450 3rd vec (E2): 1 3 5 7 9 11 13 15
6451 4th vec (E4): 17 19 21 23 25 27 29 31
6453 The output of the second stage:
6455 E1: 0 4 8 12 16 20 24 28
6456 E2: 2 6 10 14 18 22 26 30
6457 E3: 1 5 9 13 17 21 25 29
6458 E4: 3 7 11 15 19 23 27 31
6460 And RESULT_CHAIN after reordering:
6462 1st vec (E1): 0 4 8 12 16 20 24 28
6463 2nd vec (E3): 1 5 9 13 17 21 25 29
6464 3rd vec (E2): 2 6 10 14 18 22 26 30
6465 4th vec (E4): 3 7 11 15 19 23 27 31. */
6467 static void
6468 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6469 unsigned int length,
6470 stmt_vec_info stmt_info,
6471 gimple_stmt_iterator *gsi,
6472 vec<tree> *result_chain)
6474 tree data_ref, first_vect, second_vect;
6475 tree perm_mask_even, perm_mask_odd;
6476 tree perm3_mask_low, perm3_mask_high;
6477 gimple *perm_stmt;
6478 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6479 unsigned int i, j, log_length = exact_log2 (length);
6481 result_chain->quick_grow (length);
6482 memcpy (result_chain->address (), dr_chain.address (),
6483 length * sizeof (tree));
6485 if (length == 3)
6487 /* vect_grouped_load_supported ensures that this is constant. */
6488 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6489 unsigned int k;
6491 vec_perm_builder sel (nelt, nelt, 1);
6492 sel.quick_grow (nelt);
6493 vec_perm_indices indices;
6494 for (k = 0; k < 3; k++)
6496 for (i = 0; i < nelt; i++)
6497 if (3 * i + k < 2 * nelt)
6498 sel[i] = 3 * i + k;
6499 else
6500 sel[i] = 0;
6501 indices.new_vector (sel, 2, nelt);
6502 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6504 for (i = 0, j = 0; i < nelt; i++)
6505 if (3 * i + k < 2 * nelt)
6506 sel[i] = i;
6507 else
6508 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6509 indices.new_vector (sel, 2, nelt);
6510 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6512 first_vect = dr_chain[0];
6513 second_vect = dr_chain[1];
6515 /* Create interleaving stmt (low part of):
6516 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6517 ...}> */
6518 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6519 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6520 second_vect, perm3_mask_low);
6521 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6523 /* Create interleaving stmt (high part of):
6524 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6525 ...}> */
6526 first_vect = data_ref;
6527 second_vect = dr_chain[2];
6528 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6529 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6530 second_vect, perm3_mask_high);
6531 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6532 (*result_chain)[k] = data_ref;
6535 else
6537 /* If length is not equal to 3 then only power of 2 is supported. */
6538 gcc_assert (pow2p_hwi (length));
6540 /* The encoding has a single stepped pattern. */
6541 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6542 vec_perm_builder sel (nelt, 1, 3);
6543 sel.quick_grow (3);
6544 for (i = 0; i < 3; ++i)
6545 sel[i] = i * 2;
6546 vec_perm_indices indices (sel, 2, nelt);
6547 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6549 for (i = 0; i < 3; ++i)
6550 sel[i] = i * 2 + 1;
6551 indices.new_vector (sel, 2, nelt);
6552 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6554 for (i = 0; i < log_length; i++)
6556 for (j = 0; j < length; j += 2)
6558 first_vect = dr_chain[j];
6559 second_vect = dr_chain[j+1];
6561 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6562 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6563 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6564 first_vect, second_vect,
6565 perm_mask_even);
6566 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6567 (*result_chain)[j/2] = data_ref;
6569 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6572 first_vect, second_vect,
6573 perm_mask_odd);
6574 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6575 (*result_chain)[j/2+length/2] = data_ref;
6577 memcpy (dr_chain.address (), result_chain->address (),
6578 length * sizeof (tree));
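
/* Editorial illustration (not part of GCC): a scalar model of the
   power-of-2 branch above.  log2(LEN) extract_even/odd stages turn LEN
   interleaved vectors of NELT elements (stored consecutively in VECS) into
   the deinterleaved order shown in the function comment.  SCRATCH must hold
   LEN * NELT elements; helper name and memcpy use are editorial.  */

static void
permute_load_chain_model (int *vecs, unsigned int len, unsigned int nelt,
			  int *scratch)
{
  for (unsigned int stage = 1; stage < len; stage *= 2)
    {
      for (unsigned int j = 0; j < len; j += 2)
	{
	  const int *v1 = vecs + j * nelt;
	  const int *v2 = vecs + (j + 1) * nelt;
	  int *even = scratch + (j / 2) * nelt;
	  int *odd = scratch + (j / 2 + len / 2) * nelt;
	  for (unsigned int i = 0; i < nelt; i++)
	    {
	      /* Elements 2i and 2i+1 of the concatenation <v1, v2>.  */
	      even[i] = 2 * i < nelt ? v1[2 * i] : v2[2 * i - nelt];
	      odd[i] = 2 * i + 1 < nelt ? v1[2 * i + 1] : v2[2 * i + 1 - nelt];
	    }
	}
      __builtin_memcpy (vecs, scratch, len * nelt * sizeof (int));
    }
}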
6583 /* Function vect_shift_permute_load_chain.
6585 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6586 sequence of stmts to reorder the input data accordingly.
6587 Return the final references for loads in RESULT_CHAIN.
6588 Return true if successful, false otherwise.
6590 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6591 The input is 3 vectors each containing 8 elements. We assign a
6592 number to each element, the input sequence is:
6594 1st vec: 0 1 2 3 4 5 6 7
6595 2nd vec: 8 9 10 11 12 13 14 15
6596 3rd vec: 16 17 18 19 20 21 22 23
6598 The output sequence should be:
6600 1st vec: 0 3 6 9 12 15 18 21
6601 2nd vec: 1 4 7 10 13 16 19 22
6602 3rd vec: 2 5 8 11 14 17 20 23
6604 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6606 First we shuffle all 3 vectors to get correct elements order:
6608 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6609 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6610 3rd vec: (16 19 22) (17 20 23) (18 21)
6612 Next we unite and shift vector 3 times:
6614 1st step:
6615 shift right by 6 the concatenation of:
6616 "1st vec" and "2nd vec"
6617 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6618 "2nd vec" and "3rd vec"
6619 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6620 "3rd vec" and "1st vec"
6621 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6622 | New vectors |
6624 So that now new vectors are:
6626 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6627 2nd vec: (10 13) (16 19 22) (17 20 23)
6628 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6630 2nd step:
6631 shift right by 5 the concatenation of:
6632 "1st vec" and "3rd vec"
6633 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6634 "2nd vec" and "1st vec"
6635 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6636 "3rd vec" and "2nd vec"
6637 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6638 | New vectors |
6640 So that now new vectors are:
6642 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6643 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6644 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6646 3rd step:
6647 shift right by 5 the concatenation of:
6648 "1st vec" and "1st vec"
6649 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6650 shift right by 3 the concatenation of:
6651 "2nd vec" and "2nd vec"
6652 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6653 | New vectors |
6655 So that now all vectors are READY:
6656 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6657 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6658 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6660 This algorithm is faster than one in vect_permute_load_chain if:
6661 1. "shift of a concatination" is faster than general permutation.
6662 This is usually so.
6663 2. The TARGET machine can't execute vector instructions in parallel.
6664 This is because each step of the algorithm depends on the previous one.
6665 The algorithm in vect_permute_load_chain is much more parallel.
6667 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6670 static bool
6671 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6672 unsigned int length,
6673 stmt_vec_info stmt_info,
6674 gimple_stmt_iterator *gsi,
6675 vec<tree> *result_chain)
6677 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6678 tree perm2_mask1, perm2_mask2, perm3_mask;
6679 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6680 gimple *perm_stmt;
6682 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6683 machine_mode vmode = TYPE_MODE (vectype);
6684 unsigned int i;
6685 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6687 unsigned HOST_WIDE_INT nelt, vf;
6688 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6689 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6690 /* Not supported for variable-length vectors. */
6691 return false;
6693 vec_perm_builder sel (nelt, nelt, 1);
6694 sel.quick_grow (nelt);
6696 result_chain->quick_grow (length);
6697 memcpy (result_chain->address (), dr_chain.address (),
6698 length * sizeof (tree));
6700 if (pow2p_hwi (length) && vf > 4)
6702 unsigned int j, log_length = exact_log2 (length);
6703 for (i = 0; i < nelt / 2; ++i)
6704 sel[i] = i * 2;
6705 for (i = 0; i < nelt / 2; ++i)
6706 sel[nelt / 2 + i] = i * 2 + 1;
6707 vec_perm_indices indices (sel, 2, nelt);
6708 if (!can_vec_perm_const_p (vmode, vmode, indices))
6710 if (dump_enabled_p ())
6711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6712 "shuffle of 2 fields structure is not \
6713 supported by target\n");
6714 return false;
6716 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6718 for (i = 0; i < nelt / 2; ++i)
6719 sel[i] = i * 2 + 1;
6720 for (i = 0; i < nelt / 2; ++i)
6721 sel[nelt / 2 + i] = i * 2;
6722 indices.new_vector (sel, 2, nelt);
6723 if (!can_vec_perm_const_p (vmode, vmode, indices))
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "shuffle of 2 fields structure is not \
6728 supported by target\n");
6729 return false;
6731 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6733 /* Generating permutation constant to shift all elements.
6734 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6735 for (i = 0; i < nelt; i++)
6736 sel[i] = nelt / 2 + i;
6737 indices.new_vector (sel, 2, nelt);
6738 if (!can_vec_perm_const_p (vmode, vmode, indices))
6740 if (dump_enabled_p ())
6741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6742 "shift permutation is not supported by target\n");
6743 return false;
6745 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6747 /* Generating permutation constant to select vector from 2.
6748 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6749 for (i = 0; i < nelt / 2; i++)
6750 sel[i] = i;
6751 for (i = nelt / 2; i < nelt; i++)
6752 sel[i] = nelt + i;
6753 indices.new_vector (sel, 2, nelt);
6754 if (!can_vec_perm_const_p (vmode, vmode, indices))
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 "select is not supported by target\n");
6759 return false;
6761 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6763 for (i = 0; i < log_length; i++)
6765 for (j = 0; j < length; j += 2)
6767 first_vect = dr_chain[j];
6768 second_vect = dr_chain[j + 1];
6770 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6771 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6772 first_vect, first_vect,
6773 perm2_mask1);
6774 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6775 vect[0] = data_ref;
6777 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6778 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6779 second_vect, second_vect,
6780 perm2_mask2);
6781 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6782 vect[1] = data_ref;
6784 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6785 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6786 vect[0], vect[1], shift1_mask);
6787 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6788 (*result_chain)[j/2 + length/2] = data_ref;
6790 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6791 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6792 vect[0], vect[1], select_mask);
6793 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6794 (*result_chain)[j/2] = data_ref;
6796 memcpy (dr_chain.address (), result_chain->address (),
6797 length * sizeof (tree));
6799 return true;
6801 if (length == 3 && vf > 2)
6803 unsigned int k = 0, l = 0;
6805 /* Generating permutation constant to get all elements in the right order.
6806 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6807 for (i = 0; i < nelt; i++)
6809 if (3 * k + (l % 3) >= nelt)
6811 k = 0;
6812 l += (3 - (nelt % 3));
6814 sel[i] = 3 * k + (l % 3);
6815 k++;
6817 vec_perm_indices indices (sel, 2, nelt);
6818 if (!can_vec_perm_const_p (vmode, vmode, indices))
6820 if (dump_enabled_p ())
6821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6822 "shuffle of 3 fields structure is not \
6823 supported by target\n");
6824 return false;
6826 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6828 /* Generating permutation constant to shift all elements.
6829 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6830 for (i = 0; i < nelt; i++)
6831 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6832 indices.new_vector (sel, 2, nelt);
6833 if (!can_vec_perm_const_p (vmode, vmode, indices))
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "shift permutation is not supported by target\n");
6838 return false;
6840 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6842 /* Generating permutation constant to shift all elements.
6843 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6844 for (i = 0; i < nelt; i++)
6845 sel[i] = 2 * (nelt / 3) + 1 + i;
6846 indices.new_vector (sel, 2, nelt);
6847 if (!can_vec_perm_const_p (vmode, vmode, indices))
6849 if (dump_enabled_p ())
6850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6851 "shift permutation is not supported by target\n");
6852 return false;
6854 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6856 /* Generating permutation constant to shift all elements.
6857 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6858 for (i = 0; i < nelt; i++)
6859 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6860 indices.new_vector (sel, 2, nelt);
6861 if (!can_vec_perm_const_p (vmode, vmode, indices))
6863 if (dump_enabled_p ())
6864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6865 "shift permutation is not supported by target\n");
6866 return false;
6868 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6870 /* Generating permutation constant to shift all elements.
6871 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6872 for (i = 0; i < nelt; i++)
6873 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6874 indices.new_vector (sel, 2, nelt);
6875 if (!can_vec_perm_const_p (vmode, vmode, indices))
6877 if (dump_enabled_p ())
6878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6879 "shift permutation is not supported by target\n");
6880 return false;
6882 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6884 for (k = 0; k < 3; k++)
6886 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6887 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6888 dr_chain[k], dr_chain[k],
6889 perm3_mask);
6890 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6891 vect[k] = data_ref;
6894 for (k = 0; k < 3; k++)
6896 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6897 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6898 vect[k % 3], vect[(k + 1) % 3],
6899 shift1_mask);
6900 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6901 vect_shift[k] = data_ref;
6904 for (k = 0; k < 3; k++)
6905 {
6906 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6907 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6908 vect_shift[(4 - k) % 3],
6909 vect_shift[(3 - k) % 3],
6910 shift2_mask);
6911 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6912 vect[k] = data_ref;
6913 }
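/* Editorial note (not part of the original source): the first loop above
   de-interleaves each input vector in isolation using perm3_mask, while the
   second and third loops use the cross-vector shift1_mask/shift2_mask to
   combine pieces of the same field that ended up in different vectors.  The
   shift3_mask/shift4_mask permutations below only reorder elements within a
   single vector.  */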
6915 (*result_chain)[3 - (nelt % 3)] = vect[2];
6917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6919 vect[0], shift3_mask);
6920 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6921 (*result_chain)[nelt % 3] = data_ref;
6923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6925 vect[1], shift4_mask);
6926 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6927 (*result_chain)[0] = data_ref;
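/* Editorial note (not part of the original source): for nelt == 8, nelt % 3
   is 2, so the three results above are stored at (*result_chain)[1], [2] and
   [0] respectively.  */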
6928 return true;
6929 }
6930 return false;
6931 }
6933 /* Function vect_transform_grouped_load.
6935 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6936 to perform their permutation and attach the resulting vectorized statements
6937 to the scalar statements.
6938 */
6940 void
6941 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6942 vec<tree> dr_chain,
6943 int size, gimple_stmt_iterator *gsi)
6944 {
6945 machine_mode mode;
6946 vec<tree> result_chain = vNULL;
6948 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6949 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6950 permuted vectors that are ready for vector computation.  */
6951 result_chain.create (size);
6953 /* If the reassociation width for the vector type is 2 or greater, the target
6954 machine can execute two or more vector instructions in parallel.  Otherwise
6955 try to get the chain for the load group using vect_shift_permute_load_chain.  */
6956 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6957 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6958 || pow2p_hwi (size)
6959 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6960 gsi, &result_chain))
6961 vect_permute_load_chain (vinfo, dr_chain,
6962 size, stmt_info, gsi, &result_chain);
6963 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6964 result_chain.release ();
6965 }
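/* Editorial illustration (not part of the original source): for a load group
   of size 3, pow2p_hwi (size) is false, so on a target whose reassociation
   width for VEC_PERM_EXPR is 1 the shift-based vect_shift_permute_load_chain
   is tried first and vect_permute_load_chain is only used if it fails.  For a
   power-of-two size such as 4, or when the target can execute several vector
   permutes in parallel, the generic vect_permute_load_chain is used
   directly.  */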
6967 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6968 generated as part of the vectorization of STMT_INFO. Assign the statement
6969 for each vector to the associated scalar statement. */
6971 void
6972 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6973 vec<tree> result_chain)
6974 {
6975 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6976 unsigned int i, gap_count;
6977 tree tmp_data_ref;
6979 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6980 Since we scan the chain starting from its first node, their order
6981 corresponds to the order of data-refs in RESULT_CHAIN.  */
6982 stmt_vec_info next_stmt_info = first_stmt_info;
6983 gap_count = 1;
6984 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6985 {
6986 if (!next_stmt_info)
6987 break;
6989 /* Skip the gaps.  Loads created for the gaps will be removed later by the
6990 dead code elimination pass.  No need to check for the first stmt in
6991 the group, since it always exists.
6992 DR_GROUP_GAP is the number of steps in elements from the previous
6993 access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6994 correspond to the gaps.  */
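/* Editorial illustration (not part of the original source): for a group that
   accesses a[i] and a[i+3], DR_GROUP_GAP of the second member is 3, so the
   test below skips two entries of RESULT_CHAIN (gap_count 1 and 2) before the
   next vector is assigned; the skipped loads are dead and are removed later
   by DCE.  */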
6995 if (next_stmt_info != first_stmt_info
6996 && gap_count < DR_GROUP_GAP (next_stmt_info))
6997 {
6998 gap_count++;
6999 continue;
7000 }
7002 /* ??? The following needs cleanup after the removal of
7003 DR_GROUP_SAME_DR_STMT.  */
7004 if (next_stmt_info)
7005 {
7006 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7007 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7008 copies, and we put the new vector statement last.  */
7009 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7011 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7012 gap_count = 1;
7013 }
7014 }
7015 }
7017 /* Function vect_can_force_dr_alignment_p.
7019 Returns whether the alignment of a DECL can be forced to be aligned
7020 on an ALIGNMENT bit boundary.  */
7022 bool
7023 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7024 {
7025 if (!VAR_P (decl))
7026 return false;
7028 if (decl_in_symtab_p (decl)
7029 && !symtab_node::get (decl)->can_increase_alignment_p ())
7030 return false;
7032 if (TREE_STATIC (decl))
7033 return (known_le (alignment,
7034 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7035 else
7036 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7037 }
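/* Editorial sketch (hypothetical caller, not from this file; loosely modeled
   on the vectorizer's increase_alignment pass): the predicate above is
   typically used to gate raising DECL_ALIGN, e.g.

     if (vect_can_force_dr_alignment_p (decl, TYPE_ALIGN (vectype)))
       {
         SET_DECL_ALIGN (decl, TYPE_ALIGN (vectype));
         DECL_USER_ALIGN (decl) = 1;
       }

   where TYPE_ALIGN is in bits, matching the ALIGNMENT parameter above.  */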
7039 /* Return whether the data reference DR_INFO is supported with respect to its
7040 alignment.  VECTYPE is the vector type used to access it and MISALIGNMENT
7041 is the access's known misalignment: 0 if it is known to be aligned and
7042 DR_MISALIGNMENT_UNKNOWN if the misalignment could not be determined, in
7043 which case the access may or may not turn out to be aligned at run time.  */
7045 enum dr_alignment_support
7046 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7047 tree vectype, int misalignment)
7048 {
7049 data_reference *dr = dr_info->dr;
7050 stmt_vec_info stmt_info = dr_info->stmt;
7051 machine_mode mode = TYPE_MODE (vectype);
7052 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7053 class loop *vect_loop = NULL;
7054 bool nested_in_vect_loop = false;
7056 if (misalignment == 0)
7057 return dr_aligned;
7059 /* For now assume all conditional loads/stores support unaligned
7060 access without any special code. */
7061 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7062 if (gimple_call_internal_p (stmt)
7063 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7064 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7065 return dr_unaligned_supported;
7067 if (loop_vinfo)
7068 {
7069 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7070 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7071 }
7073 /* Possibly unaligned access. */
7075 /* We can choose between using the implicit realignment scheme (generating
7076 a misaligned_move stmt) and the explicit realignment scheme (generating
7077 aligned loads with a REALIGN_LOAD).  There are two variants of the
7078 explicit realignment scheme: optimized and unoptimized.
7079 We can optimize the realignment only if the step between consecutive
7080 vector loads is equal to the vector size. Since the vector memory
7081 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7082 is guaranteed that the misalignment amount remains the same throughout the
7083 execution of the vectorized loop. Therefore, we can create the
7084 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7085 at the loop preheader.
7087 However, in the case of outer-loop vectorization, when vectorizing a
7088 memory access in the inner-loop nested within the LOOP that is now being
7089 vectorized, while it is guaranteed that the misalignment of the
7090 vectorized memory access will remain the same in different outer-loop
7091 iterations, it is *not* guaranteed that it will remain the same throughout
7092 the execution of the inner-loop. This is because the inner-loop advances
7093 with the original scalar step (and not in steps of VS). If the inner-loop
7094 step happens to be a multiple of VS, then the misalignment remains fixed
7095 and we can use the optimized realignment scheme. For example:
7097 for (i=0; i<N; i++)
7098 for (j=0; j<M; j++)
7099 s += a[i+j];
7101 When vectorizing the i-loop in the above example, the step between
7102 consecutive vector loads is 1, and so the misalignment does not remain
7103 fixed across the execution of the inner-loop, and the realignment cannot
7104 be optimized (as illustrated in the following pseudo vectorized loop):
7106 for (i=0; i<N; i+=4)
7107 for (j=0; j<M; j++){
7108 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7109 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7110 // (assuming that we start from an aligned address).
7111 }
7113 We therefore have to use the unoptimized realignment scheme:
7115 for (i=0; i<N; i+=4)
7116 for (j=k; j<M; j+=4)
7117 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7118 // that the misalignment of the initial address is
7119 // 0).
7121 The loop can then be vectorized as follows:
7123 for (k=0; k<4; k++){
7124 rt = get_realignment_token (&vp[k]);
7125 for (i=0; i<N; i+=4){
7126 v1 = vp[i+k];
7127 for (j=k; j<M; j+=4){
7128 v2 = vp[i+j+VS-1];
7129 va = REALIGN_LOAD <v1,v2,rt>;
7130 vs += va;
7131 v1 = v2;
7132 }
7133 }
7134 } */
7136 if (DR_IS_READ (dr))
7137 {
7138 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7139 && (!targetm.vectorize.builtin_mask_for_load
7140 || targetm.vectorize.builtin_mask_for_load ()))
7141 {
7142 /* If we are doing SLP then the accesses need not have the
7143 same alignment; instead it depends on the SLP group size.  */
7144 if (loop_vinfo
7145 && STMT_SLP_TYPE (stmt_info)
7146 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7147 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7148 * (DR_GROUP_SIZE
7149 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7150 TYPE_VECTOR_SUBPARTS (vectype))))
7151 ;
7152 else if (!loop_vinfo
7153 || (nested_in_vect_loop
7154 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7155 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7156 return dr_explicit_realign;
7157 else
7158 return dr_explicit_realign_optimized;
7159 }
7160 }
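/* Editorial note (not part of the original source): dr_explicit_realign_optimized
   is only chosen when the misalignment is invariant across the vectorized
   loop, i.e. for loop accesses that are not in a nested inner loop, or for
   inner-loop accesses whose DR_STEP equals the vector size, as discussed in
   the large comment above; otherwise the per-iteration dr_explicit_realign
   scheme is used, and basic-block vectorization always falls back to
   dr_explicit_realign on this path.  */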
7162 bool is_packed = false;
7163 tree type = TREE_TYPE (DR_REF (dr));
7164 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7165 is_packed = not_size_aligned (DR_REF (dr));
7166 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7167 is_packed))
7168 return dr_unaligned_supported;
7170 /* Unsupported. */
7171 return dr_unaligned_unsupported;
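/* Editorial sketch (hypothetical caller, not from this file): users of this
   function typically switch over the returned classification, e.g.

     enum dr_alignment_support support
       = vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment);
     if (support == dr_unaligned_unsupported)
       ;  // reject, peel or version the loop for alignment
     else if (support == dr_explicit_realign
              || support == dr_explicit_realign_optimized)
       ;  // emit a REALIGN_LOAD-based sequence
     else
       ;  // dr_aligned or dr_unaligned_supported: plain (mis)aligned access

   which mirrors how the vectorizer's load/store handling consumes the
   result.  */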