Skip various cmp-mem-const tests on lp64 hppa*-*-*
[official-gcc.git] / gcc / tree-vect-data-refs.cc
blob5e86da394680956ee5140f01be72a203cba9ebee
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
204 /* For stores use the tyep of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* STMT_A and STMT_B belong to overlapping groups. All loads are
286 emitted at the position of the first scalar load.
287 Stores in a group are emitted at the position of the last scalar store.
288 Compute that position and check whether the resulting order matches
289 the current one. */
290 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
291 if (il_a)
293 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
294 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
295 s = DR_GROUP_NEXT_ELEMENT (s))
296 il_a = get_later_stmt (il_a, s);
297 else /* DR_IS_READ */
298 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
299 s = DR_GROUP_NEXT_ELEMENT (s))
300 if (get_later_stmt (il_a, s) == il_a)
301 il_a = s;
303 else
304 il_a = stmtinfo_a;
305 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
306 if (il_b)
308 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
309 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
310 s = DR_GROUP_NEXT_ELEMENT (s))
311 il_b = get_later_stmt (il_b, s);
312 else /* DR_IS_READ */
313 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
314 s = DR_GROUP_NEXT_ELEMENT (s))
315 if (get_later_stmt (il_b, s) == il_b)
316 il_b = s;
318 else
319 il_b = stmtinfo_b;
320 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
321 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
324 /* A subroutine of vect_analyze_data_ref_dependence. Handle
325 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
326 distances. These distances are conservatively correct but they don't
327 reflect a guaranteed dependence.
329 Return true if this function does all the work necessary to avoid
330 an alias or false if the caller should use the dependence distances
331 to limit the vectorization factor in the usual way. LOOP_DEPTH is
332 the depth of the loop described by LOOP_VINFO and the other arguments
333 are as for vect_analyze_data_ref_dependence. */
335 static bool
336 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
337 loop_vec_info loop_vinfo,
338 int loop_depth, unsigned int *max_vf)
340 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
341 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
343 int dist = dist_v[loop_depth];
344 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
346 /* If the user asserted safelen >= DIST consecutive iterations
347 can be executed concurrently, assume independence.
349 ??? An alternative would be to add the alias check even
350 in this case, and vectorize the fallback loop with the
351 maximum VF set to safelen. However, if the user has
352 explicitly given a length, it's less likely that that
353 would be a win. */
354 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
356 if ((unsigned int) loop->safelen < *max_vf)
357 *max_vf = loop->safelen;
358 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
359 continue;
362 /* For dependence distances of 2 or more, we have the option
363 of limiting VF or checking for an alias at runtime.
364 Prefer to check at runtime if we can, to avoid limiting
365 the VF unnecessarily when the bases are in fact independent.
367 Note that the alias checks will be removed if the VF ends up
368 being small enough. */
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
371 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
372 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
373 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
376 return true;
380 /* Function vect_analyze_data_ref_dependence.
382 FIXME: I needed to change the sense of the returned flag.
384 Return FALSE if there (might) exist a dependence between a memory-reference
385 DRA and a memory-reference DRB. When versioning for alias may check a
386 dependence at run-time, return TRUE. Adjust *MAX_VF according to
387 the data dependence. */
389 static opt_result
390 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
391 loop_vec_info loop_vinfo,
392 unsigned int *max_vf)
394 unsigned int i;
395 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
396 struct data_reference *dra = DDR_A (ddr);
397 struct data_reference *drb = DDR_B (ddr);
398 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
399 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
400 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
401 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
402 lambda_vector dist_v;
403 unsigned int loop_depth;
405 /* If user asserted safelen consecutive iterations can be
406 executed concurrently, assume independence. */
407 auto apply_safelen = [&]()
409 if (loop->safelen >= 2)
411 if ((unsigned int) loop->safelen < *max_vf)
412 *max_vf = loop->safelen;
413 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
414 return true;
416 return false;
419 /* In loop analysis all data references should be vectorizable. */
420 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
421 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
422 gcc_unreachable ();
424 /* Independent data accesses. */
425 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
426 return opt_result::success ();
428 if (dra == drb
429 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
430 return opt_result::success ();
432 /* We do not have to consider dependences between accesses that belong
433 to the same group, unless the stride could be smaller than the
434 group size. */
435 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
436 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
437 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
438 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
439 return opt_result::success ();
441 /* Even if we have an anti-dependence then, as the vectorized loop covers at
442 least two scalar iterations, there is always also a true dependence.
443 As the vectorizer does not re-order loads and stores we can ignore
444 the anti-dependence if TBAA can disambiguate both DRs similar to the
445 case with known negative distance anti-dependences (positive
446 distance anti-dependences would violate TBAA constraints). */
447 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
448 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
449 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
450 get_alias_set (DR_REF (drb))))
451 return opt_result::success ();
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 if (apply_safelen ())
457 return opt_result::success ();
459 return opt_result::failure_at
460 (stmtinfo_a->stmt,
461 "possible alias involving gather/scatter between %T and %T\n",
462 DR_REF (dra), DR_REF (drb));
465 /* Unknown data dependence. */
466 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
468 if (apply_safelen ())
469 return opt_result::success ();
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
473 "versioning for alias required: "
474 "can't determine dependence between %T and %T\n",
475 DR_REF (dra), DR_REF (drb));
477 /* Add to list of ddrs that need to be tested at run-time. */
478 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
481 /* Known data dependence. */
482 if (DDR_NUM_DIST_VECTS (ddr) == 0)
484 if (apply_safelen ())
485 return opt_result::success ();
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
489 "versioning for alias required: "
490 "bad dist vector for %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492 /* Add to list of ddrs that need to be tested at run-time. */
493 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
496 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
498 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
499 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
500 loop_depth, max_vf))
501 return opt_result::success ();
503 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
505 int dist = dist_v[loop_depth];
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "dependence distance = %d.\n", dist);
511 if (dist == 0)
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance == 0 between %T and %T\n",
516 DR_REF (dra), DR_REF (drb));
518 /* When we perform grouped accesses and perform implicit CSE
519 by detecting equal accesses and doing disambiguation with
520 runtime alias tests like for
521 .. = a[i];
522 .. = a[i+1];
523 a[i] = ..;
524 a[i+1] = ..;
525 *p = ..;
526 .. = a[i];
527 .. = a[i+1];
528 where we will end up loading { a[i], a[i+1] } once, make
529 sure that inserting group loads before the first load and
530 stores after the last store will do the right thing.
531 Similar for groups like
532 a[i] = ...;
533 ... = a[i];
534 a[i+1] = ...;
535 where loads from the group interleave with the store. */
536 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
537 return opt_result::failure_at (stmtinfo_a->stmt,
538 "READ_WRITE dependence"
539 " in interleaving.\n");
541 if (loop->safelen < 2)
543 tree indicator = dr_zero_step_indicator (dra);
544 if (!indicator || integer_zerop (indicator))
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "access also has a zero step\n");
547 else if (TREE_CODE (indicator) != INTEGER_CST)
548 vect_check_nonzero_value (loop_vinfo, indicator);
550 continue;
553 if (dist > 0 && DDR_REVERSED_P (ddr))
555 /* If DDR_REVERSED_P the order of the data-refs in DDR was
556 reversed (to make distance vector positive), and the actual
557 distance is negative. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "dependence distance negative.\n");
561 /* When doing outer loop vectorization, we need to check if there is
562 a backward dependence at the inner loop level if the dependence
563 at the outer loop is reversed. See PR81740. */
564 if (nested_in_vect_loop_p (loop, stmtinfo_a)
565 || nested_in_vect_loop_p (loop, stmtinfo_b))
567 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
568 DDR_LOOP_NEST (ddr));
569 if (dist_v[inner_depth] < 0)
570 return opt_result::failure_at (stmtinfo_a->stmt,
571 "not vectorized, dependence "
572 "between data-refs %T and %T\n",
573 DR_REF (dra), DR_REF (drb));
575 /* Record a negative dependence distance to later limit the
576 amount of stmt copying / unrolling we can perform.
577 Only need to handle read-after-write dependence. */
578 if (DR_IS_READ (drb)
579 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
580 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
581 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
582 continue;
585 unsigned int abs_dist = abs (dist);
586 if (abs_dist >= 2 && abs_dist < *max_vf)
588 /* The dependence distance requires reduction of the maximal
589 vectorization factor. */
590 *max_vf = abs_dist;
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "adjusting maximal vectorization factor to %i\n",
594 *max_vf);
597 if (abs_dist >= *max_vf)
599 /* Dependence distance does not create dependence, as far as
600 vectorization is concerned, in this case. */
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "dependence distance >= VF.\n");
604 continue;
607 return opt_result::failure_at (stmtinfo_a->stmt,
608 "not vectorized, possible dependence "
609 "between data-refs %T and %T\n",
610 DR_REF (dra), DR_REF (drb));
613 return opt_result::success ();
616 /* Funcion vect_analyze_early_break_dependences.
618 Examime all the data references in the loop and make sure that if we have
619 mulitple exits that we are able to safely move stores such that they become
620 safe for vectorization. The function also calculates the place where to move
621 the instructions to and computes what the new vUSE chain should be.
623 This works in tandem with the CFG that will be produced by
624 slpeel_tree_duplicate_loop_to_edge_cfg later on.
626 This function tries to validate whether an early break vectorization
627 is possible for the current instruction sequence. Returns True i
628 possible, otherwise False.
630 Requirements:
631 - Any memory access must be to a fixed size buffer.
632 - There must not be any loads and stores to the same object.
633 - Multiple loads are allowed as long as they don't alias.
635 NOTE:
636 This implemementation is very conservative. Any overlappig loads/stores
637 that take place before the early break statement gets rejected aside from
638 WAR dependencies.
640 i.e.:
642 a[i] = 8
643 c = a[i]
644 if (b[i])
647 is not allowed, but
649 c = a[i]
650 a[i] = 8
651 if (b[i])
654 is which is the common case. */
656 static opt_result
657 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
659 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
661 /* List of all load data references found during traversal. */
662 auto_vec<data_reference *> bases;
663 basic_block dest_bb = NULL;
665 hash_set <gimple *> visited;
666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
667 class loop *loop_nest = loop_outer (loop);
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "loop contains multiple exits, analyzing"
672 " statement dependencies.\n");
674 /* Since we don't support general control flow, the location we'll move the
675 side-effects to is always the latch connected exit. When we support
676 general control flow we can do better but for now this is fine. */
677 dest_bb = single_pred (loop->latch);
678 basic_block bb = dest_bb;
682 /* If the destination block is also the header then we have nothing to do. */
683 if (!single_pred_p (bb))
684 continue;
686 bb = single_pred (bb);
687 gimple_stmt_iterator gsi = gsi_last_bb (bb);
689 /* Now analyze all the remaining statements and try to determine which
690 instructions are allowed/needed to be moved. */
691 while (!gsi_end_p (gsi))
693 gimple *stmt = gsi_stmt (gsi);
694 gsi_prev (&gsi);
695 if (!gimple_has_ops (stmt)
696 || is_gimple_debug (stmt))
697 continue;
699 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
700 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
701 if (!dr_ref)
702 continue;
704 /* We currently only support statically allocated objects due to
705 not having first-faulting loads support or peeling for
706 alignment support. Compute the size of the referenced object
707 (it could be dynamically allocated). */
708 tree obj = DR_BASE_ADDRESS (dr_ref);
709 if (!obj || TREE_CODE (obj) != ADDR_EXPR)
711 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
713 "early breaks only supported on statically"
714 " allocated objects.\n");
715 return opt_result::failure_at (stmt,
716 "can't safely apply code motion to "
717 "dependencies of %G to vectorize "
718 "the early exit.\n", stmt);
721 tree refop = TREE_OPERAND (obj, 0);
722 tree refbase = get_base_address (refop);
723 if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
724 || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "early breaks only supported on"
729 " statically allocated objects.\n");
730 return opt_result::failure_at (stmt,
731 "can't safely apply code motion to "
732 "dependencies of %G to vectorize "
733 "the early exit.\n", stmt);
736 /* Check if vector accesses to the object will be within bounds.
737 must be a constant or assume loop will be versioned or niters
738 bounded by VF so accesses are within range. */
739 if (!ref_within_array_bound (stmt, DR_REF (dr_ref)))
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "early breaks not supported: vectorization "
744 "would %s beyond size of obj.",
745 DR_IS_READ (dr_ref) ? "read" : "write");
746 return opt_result::failure_at (stmt,
747 "can't safely apply code motion to "
748 "dependencies of %G to vectorize "
749 "the early exit.\n", stmt);
752 if (DR_IS_READ (dr_ref))
753 bases.safe_push (dr_ref);
754 else if (DR_IS_WRITE (dr_ref))
756 /* We are moving writes down in the CFG. To be sure that this
757 is valid after vectorization we have to check all the loads
758 we are sinking the stores past to see if any of them may
759 alias or are the same object.
761 Same objects will not be an issue because unless the store
762 is marked volatile the value can be forwarded. If the
763 store is marked volatile we don't vectorize the loop
764 anyway.
766 That leaves the check for aliasing. We don't really need
767 to care about the stores aliasing with each other since the
768 stores are moved in order so the effects are still observed
769 correctly. This leaves the check for WAR dependencies
770 which we would be introducing here if the DR can alias.
771 The check is quadratic in loads/stores but I have not found
772 a better API to do this. I believe all loads and stores
773 must be checked. We also must check them when we
774 encountered the store, since we don't care about loads past
775 the store. */
777 for (auto dr_read : bases)
778 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
780 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
782 vect_location,
783 "early breaks not supported: "
784 "overlapping loads and stores "
785 "found before the break "
786 "statement.\n");
788 return opt_result::failure_at (stmt,
789 "can't safely apply code motion to dependencies"
790 " to vectorize the early exit. %G may alias with"
791 " %G\n", stmt, dr_read->stmt);
795 if (gimple_vdef (stmt))
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE, vect_location,
799 "==> recording stmt %G", stmt);
801 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
803 else if (gimple_vuse (stmt))
805 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
806 if (dump_enabled_p ())
807 dump_printf_loc (MSG_NOTE, vect_location,
808 "marked statement for vUSE update: %G", stmt);
812 while (bb != loop->header);
814 /* We don't allow outer -> inner loop transitions which should have been
815 trapped already during loop form analysis. */
816 gcc_assert (dest_bb->loop_father == loop);
818 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
820 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
822 /* All uses shall be updated to that of the first load. Entries are
823 stored in reverse order. */
824 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
825 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location,
829 "will update use: %T, mem_ref: %G", vuse, g);
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE, vect_location,
835 "recorded statements to be moved to BB %d\n",
836 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
838 return opt_result::success ();
841 /* Function vect_analyze_data_ref_dependences.
843 Examine all the data references in the loop, and make sure there do not
844 exist any data dependences between them. Set *MAX_VF according to
845 the maximum vectorization factor the data dependences allow. */
847 opt_result
848 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
849 unsigned int *max_vf)
851 unsigned int i;
852 struct data_dependence_relation *ddr;
854 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
856 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
858 LOOP_VINFO_DDRS (loop_vinfo)
859 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
860 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
861 /* We do not need read-read dependences. */
862 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
863 &LOOP_VINFO_DDRS (loop_vinfo),
864 LOOP_VINFO_LOOP_NEST (loop_vinfo),
865 false);
866 gcc_assert (res);
869 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
871 /* For epilogues we either have no aliases or alias versioning
872 was applied to original loop. Therefore we may just get max_vf
873 using VF of original loop. */
874 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
875 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
876 else
877 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
879 opt_result res
880 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
881 if (!res)
882 return res;
885 /* If we have early break statements in the loop, check to see if they
886 are of a form we can vectorizer. */
887 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
888 return vect_analyze_early_break_dependences (loop_vinfo);
890 return opt_result::success ();
894 /* Function vect_slp_analyze_data_ref_dependence.
896 Return TRUE if there (might) exist a dependence between a memory-reference
897 DRA and a memory-reference DRB for VINFO. When versioning for alias
898 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
899 according to the data dependence. */
901 static bool
902 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
903 struct data_dependence_relation *ddr)
905 struct data_reference *dra = DDR_A (ddr);
906 struct data_reference *drb = DDR_B (ddr);
907 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
908 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
910 /* We need to check dependences of statements marked as unvectorizable
911 as well, they still can prohibit vectorization. */
913 /* Independent data accesses. */
914 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
915 return false;
917 if (dra == drb)
918 return false;
920 /* Read-read is OK. */
921 if (DR_IS_READ (dra) && DR_IS_READ (drb))
922 return false;
924 /* If dra and drb are part of the same interleaving chain consider
925 them independent. */
926 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
927 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
928 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
929 return false;
931 /* Unknown data dependence. */
932 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
934 if (dump_enabled_p ())
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "can't determine dependence between %T and %T\n",
937 DR_REF (dra), DR_REF (drb));
939 else if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "determined dependence between %T and %T\n",
942 DR_REF (dra), DR_REF (drb));
944 return true;
948 /* Analyze dependences involved in the transform of a store SLP NODE. */
950 static bool
951 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
953 /* This walks over all stmts involved in the SLP store done
954 in NODE verifying we can sink them up to the last stmt in the
955 group. */
956 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
957 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
959 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
961 stmt_vec_info access_info
962 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
963 if (access_info == last_access_info)
964 continue;
965 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
966 ao_ref ref;
967 bool ref_initialized_p = false;
968 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
969 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
971 gimple *stmt = gsi_stmt (gsi);
972 if (! gimple_vuse (stmt))
973 continue;
975 /* If we couldn't record a (single) data reference for this
976 stmt we have to resort to the alias oracle. */
977 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
978 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
979 if (!dr_b)
981 /* We are moving a store - this means
982 we cannot use TBAA for disambiguation. */
983 if (!ref_initialized_p)
984 ao_ref_init (&ref, DR_REF (dr_a));
985 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
986 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
987 return false;
988 continue;
991 gcc_assert (!gimple_visited_p (stmt));
993 ddr_p ddr = initialize_data_dependence_relation (dr_a,
994 dr_b, vNULL);
995 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
996 free_dependence_relation (ddr);
997 if (dependent)
998 return false;
1001 return true;
1004 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1005 contain the vector of scalar stores of this instance if we are
1006 disambiguating the loads. */
1008 static bool
1009 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1010 vec<stmt_vec_info> stores,
1011 stmt_vec_info last_store_info)
1013 /* This walks over all stmts involved in the SLP load done
1014 in NODE verifying we can hoist them up to the first stmt in the
1015 group. */
1016 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1017 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1019 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1021 stmt_vec_info access_info
1022 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1023 if (access_info == first_access_info)
1024 continue;
1025 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1026 ao_ref ref;
1027 bool ref_initialized_p = false;
1028 hash_set<stmt_vec_info> grp_visited;
1029 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1030 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1032 gimple *stmt = gsi_stmt (gsi);
1033 if (! gimple_vdef (stmt))
1034 continue;
1036 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1038 /* If we run into a store of this same instance (we've just
1039 marked those) then delay dependence checking until we run
1040 into the last store because this is where it will have
1041 been sunk to (and we verified that we can do that already). */
1042 if (gimple_visited_p (stmt))
1044 if (stmt_info != last_store_info)
1045 continue;
1047 for (stmt_vec_info &store_info : stores)
1049 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1050 ddr_p ddr = initialize_data_dependence_relation
1051 (dr_a, store_dr, vNULL);
1052 bool dependent
1053 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1054 free_dependence_relation (ddr);
1055 if (dependent)
1056 return false;
1058 continue;
1061 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1063 /* We are hoisting a load - this means we can use TBAA for
1064 disambiguation. */
1065 if (!ref_initialized_p)
1066 ao_ref_init (&ref, DR_REF (dr_a));
1067 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1069 /* If we couldn't record a (single) data reference for this
1070 stmt we have to give up now. */
1071 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1072 if (!dr_b)
1073 return false;
1074 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1075 dr_b, vNULL);
1076 bool dependent
1077 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1078 free_dependence_relation (ddr);
1079 if (dependent)
1080 return false;
1082 /* No dependence. */
1083 return true;
1085 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1087 /* When we run into a store group we have to honor
1088 that earlier stores might be moved here. We don't
1089 know exactly which and where to since we lack a
1090 back-mapping from DR to SLP node, so assume all
1091 earlier stores are sunk here. It's enough to
1092 consider the last stmt of a group for this.
1093 ??? Both this and the fact that we disregard that
1094 the conflicting instance might be removed later
1095 is overly conservative. */
1096 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1097 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1098 store_info != NULL;
1099 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1100 if ((store_info == stmt_info
1101 || get_later_stmt (store_info, stmt_info) == stmt_info)
1102 && !check_hoist (store_info))
1103 return false;
1105 else
1107 if (!check_hoist (stmt_info))
1108 return false;
1112 return true;
1116 /* Function vect_analyze_data_ref_dependences.
1118 Examine all the data references in the basic-block, and make sure there
1119 do not exist any data dependences between them. Set *MAX_VF according to
1120 the maximum vectorization factor the data dependences allow. */
1122 bool
1123 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1125 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1127 /* The stores of this instance are at the root of the SLP tree. */
1128 slp_tree store = NULL;
1129 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1130 store = SLP_INSTANCE_TREE (instance);
1132 /* Verify we can sink stores to the vectorized stmt insert location. */
1133 stmt_vec_info last_store_info = NULL;
1134 if (store)
1136 if (! vect_slp_analyze_store_dependences (vinfo, store))
1137 return false;
1139 /* Mark stores in this instance and remember the last one. */
1140 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1141 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1142 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1145 bool res = true;
1147 /* Verify we can sink loads to the vectorized stmt insert location,
1148 special-casing stores of this instance. */
1149 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1150 if (! vect_slp_analyze_load_dependences (vinfo, load,
1151 store
1152 ? SLP_TREE_SCALAR_STMTS (store)
1153 : vNULL, last_store_info))
1155 res = false;
1156 break;
1159 /* Unset the visited flag. */
1160 if (store)
1161 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1162 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1164 return res;
1167 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1168 applied. */
1171 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1173 HOST_WIDE_INT diff = 0;
1174 /* Alignment is only analyzed for the first element of a DR group,
1175 use that but adjust misalignment by the offset of the access. */
1176 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1178 dr_vec_info *first_dr
1179 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1180 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1181 INTEGER_CSTs and the first element in the group has the lowest
1182 address. */
1183 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1184 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1185 gcc_assert (diff >= 0);
1186 dr_info = first_dr;
1189 int misalign = dr_info->misalignment;
1190 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1191 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1192 return misalign;
1194 /* If the access is only aligned for a vector type with smaller alignment
1195 requirement the access has unknown misalignment. */
1196 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1197 targetm.vectorize.preferred_vector_alignment (vectype)))
1198 return DR_MISALIGNMENT_UNKNOWN;
1200 /* Apply the offset from the DR group start and the externally supplied
1201 offset which can for example result from a negative stride access. */
1202 poly_int64 misalignment = misalign + diff + offset;
1204 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1205 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1206 unsigned HOST_WIDE_INT target_alignment_c
1207 = dr_info->target_alignment.to_constant ();
1208 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1209 return DR_MISALIGNMENT_UNKNOWN;
1210 return misalign;
1213 /* Record the base alignment guarantee given by DRB, which occurs
1214 in STMT_INFO. */
1216 static void
1217 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1218 innermost_loop_behavior *drb)
1220 bool existed;
1221 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1222 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1223 if (!existed || entry.second->base_alignment < drb->base_alignment)
1225 entry = std::make_pair (stmt_info, drb);
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "recording new base alignment for %T\n"
1229 " alignment: %d\n"
1230 " misalignment: %d\n"
1231 " based on: %G",
1232 drb->base_address,
1233 drb->base_alignment,
1234 drb->base_misalignment,
1235 stmt_info->stmt);
1239 /* If the region we're going to vectorize is reached, all unconditional
1240 data references occur at least once. We can therefore pool the base
1241 alignment guarantees from each unconditional reference. Do this by
1242 going through all the data references in VINFO and checking whether
1243 the containing statement makes the reference unconditionally. If so,
1244 record the alignment of the base address in VINFO so that it can be
1245 used for all other references with the same base. */
1247 void
1248 vect_record_base_alignments (vec_info *vinfo)
1250 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1251 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1252 for (data_reference *dr : vinfo->shared->datarefs)
1254 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1255 stmt_vec_info stmt_info = dr_info->stmt;
1256 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1257 && STMT_VINFO_VECTORIZABLE (stmt_info)
1258 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1260 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1262 /* If DR is nested in the loop that is being vectorized, we can also
1263 record the alignment of the base wrt the outer loop. */
1264 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1265 vect_record_base_alignment
1266 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1271 /* Function vect_compute_data_ref_alignment
1273 Compute the misalignment of the data reference DR_INFO when vectorizing
1274 with VECTYPE.
1276 Output:
1277 1. initialized misalignment info for DR_INFO
1279 FOR NOW: No analysis is actually performed. Misalignment is calculated
1280 only for trivial cases. TODO. */
1282 static void
1283 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1284 tree vectype)
1286 stmt_vec_info stmt_info = dr_info->stmt;
1287 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1288 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1289 class loop *loop = NULL;
1290 tree ref = DR_REF (dr_info->dr);
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_NOTE, vect_location,
1294 "vect_compute_data_ref_alignment:\n");
1296 if (loop_vinfo)
1297 loop = LOOP_VINFO_LOOP (loop_vinfo);
1299 /* Initialize misalignment to unknown. */
1300 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1302 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1303 return;
1305 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1306 bool step_preserves_misalignment_p;
1308 poly_uint64 vector_alignment
1309 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1310 BITS_PER_UNIT);
1311 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1313 /* If the main loop has peeled for alignment we have no way of knowing
1314 whether the data accesses in the epilogues are aligned. We can't at
1315 compile time answer the question whether we have entered the main loop or
1316 not. Fixes PR 92351. */
1317 if (loop_vinfo)
1319 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1320 if (orig_loop_vinfo
1321 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1322 return;
1325 unsigned HOST_WIDE_INT vect_align_c;
1326 if (!vector_alignment.is_constant (&vect_align_c))
1327 return;
1329 /* No step for BB vectorization. */
1330 if (!loop)
1332 gcc_assert (integer_zerop (drb->step));
1333 step_preserves_misalignment_p = true;
1336 /* In case the dataref is in an inner-loop of the loop that is being
1337 vectorized (LOOP), we use the base and misalignment information
1338 relative to the outer-loop (LOOP). This is ok only if the misalignment
1339 stays the same throughout the execution of the inner-loop, which is why
1340 we have to check that the stride of the dataref in the inner-loop evenly
1341 divides by the vector alignment. */
1342 else if (nested_in_vect_loop_p (loop, stmt_info))
1344 step_preserves_misalignment_p
1345 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1347 if (dump_enabled_p ())
1349 if (step_preserves_misalignment_p)
1350 dump_printf_loc (MSG_NOTE, vect_location,
1351 "inner step divides the vector alignment.\n");
1352 else
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "inner step doesn't divide the vector"
1355 " alignment.\n");
1359 /* Similarly we can only use base and misalignment information relative to
1360 an innermost loop if the misalignment stays the same throughout the
1361 execution of the loop. As above, this is the case if the stride of
1362 the dataref evenly divides by the alignment. */
1363 else
1365 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1366 step_preserves_misalignment_p
1367 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1369 if (!step_preserves_misalignment_p && dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "step doesn't divide the vector alignment.\n");
1374 unsigned int base_alignment = drb->base_alignment;
1375 unsigned int base_misalignment = drb->base_misalignment;
1377 /* Calculate the maximum of the pooled base address alignment and the
1378 alignment that we can compute for DR itself. */
1379 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1380 = base_alignments->get (drb->base_address);
1381 if (entry
1382 && base_alignment < (*entry).second->base_alignment
1383 && (loop_vinfo
1384 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1385 gimple_bb (entry->first->stmt))
1386 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1387 || (entry->first->dr_aux.group <= dr_info->group)))))
1389 base_alignment = entry->second->base_alignment;
1390 base_misalignment = entry->second->base_misalignment;
1393 if (drb->offset_alignment < vect_align_c
1394 || !step_preserves_misalignment_p
1395 /* We need to know whether the step wrt the vectorized loop is
1396 negative when computing the starting misalignment below. */
1397 || TREE_CODE (drb->step) != INTEGER_CST)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "Unknown alignment for access: %T\n", ref);
1402 return;
1405 if (base_alignment < vect_align_c)
1407 unsigned int max_alignment;
1408 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1409 if (max_alignment < vect_align_c
1410 || !vect_can_force_dr_alignment_p (base,
1411 vect_align_c * BITS_PER_UNIT))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "can't force alignment of ref: %T\n", ref);
1416 return;
1419 /* Force the alignment of the decl.
1420 NOTE: This is the only change to the code we make during
1421 the analysis phase, before deciding to vectorize the loop. */
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "force alignment of %T\n", ref);
1426 dr_info->base_decl = base;
1427 dr_info->base_misaligned = true;
1428 base_misalignment = 0;
1430 poly_int64 misalignment
1431 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1433 unsigned int const_misalignment;
1434 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "Non-constant misalignment for access: %T\n", ref);
1439 return;
1442 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1444 if (dump_enabled_p ())
1445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446 "misalign = %d bytes of ref %T\n",
1447 const_misalignment, ref);
1449 return;
1452 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1453 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1454 is made aligned via peeling. */
1456 static bool
1457 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1458 dr_vec_info *dr_peel_info)
1460 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1461 DR_TARGET_ALIGNMENT (dr_info)))
1463 poly_offset_int diff
1464 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1465 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1466 if (known_eq (diff, 0)
1467 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1468 return true;
1470 return false;
1473 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1474 aligned via peeling. */
1476 static bool
1477 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1478 dr_vec_info *dr_peel_info)
1480 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1481 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1482 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1483 DR_OFFSET (dr_peel_info->dr), 0)
1484 || !operand_equal_p (DR_STEP (dr_info->dr),
1485 DR_STEP (dr_peel_info->dr), 0))
1486 return false;
1488 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1491 /* Compute the value for dr_info->misalign so that the access appears
1492 aligned. This is used by peeling to compensate for dr_misalignment
1493 applying the offset for negative step. */
1496 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1498 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1499 return 0;
1501 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1502 poly_int64 misalignment
1503 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1504 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1506 unsigned HOST_WIDE_INT target_alignment_c;
1507 int misalign;
1508 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1509 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1510 return DR_MISALIGNMENT_UNKNOWN;
1511 return misalign;
1514 /* Function vect_update_misalignment_for_peel.
1515 Sets DR_INFO's misalignment
1516 - to 0 if it has the same alignment as DR_PEEL_INFO,
1517 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1518 - to -1 (unknown) otherwise.
1520 DR_INFO - the data reference whose misalignment is to be adjusted.
1521 DR_PEEL_INFO - the data reference whose misalignment is being made
1522 zero in the vector loop by the peel.
1523 NPEEL - the number of iterations in the peel loop if the misalignment
1524 of DR_PEEL_INFO is known at compile time. */
1526 static void
1527 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1528 dr_vec_info *dr_peel_info, int npeel)
1530 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1531 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1533 SET_DR_MISALIGNMENT (dr_info,
1534 vect_dr_misalign_for_aligned_access (dr_peel_info));
1535 return;
1538 unsigned HOST_WIDE_INT alignment;
1539 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1540 && known_alignment_for_access_p (dr_info,
1541 STMT_VINFO_VECTYPE (dr_info->stmt))
1542 && known_alignment_for_access_p (dr_peel_info,
1543 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1545 int misal = dr_info->misalignment;
1546 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1547 misal &= alignment - 1;
1548 set_dr_misalignment (dr_info, misal);
1549 return;
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1554 "to unknown (-1).\n");
1555 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1558 /* Return true if alignment is relevant for DR_INFO. */
1560 static bool
1561 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1563 stmt_vec_info stmt_info = dr_info->stmt;
1565 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1566 return false;
1568 /* For interleaving, only the alignment of the first access matters. */
1569 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1570 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1571 return false;
1573 /* Scatter-gather and invariant accesses continue to address individual
1574 scalars, so vector-level alignment is irrelevant. */
1575 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1576 || integer_zerop (DR_STEP (dr_info->dr)))
1577 return false;
1579 /* Strided accesses perform only component accesses, alignment is
1580 irrelevant for them. */
1581 if (STMT_VINFO_STRIDED_P (stmt_info)
1582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1583 return false;
1585 return true;
1588 /* Given an memory reference EXP return whether its alignment is less
1589 than its size. */
1591 static bool
1592 not_size_aligned (tree exp)
1594 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1595 return true;
1597 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1598 > get_object_alignment (exp));
1601 /* Function vector_alignment_reachable_p
1603 Return true if vector alignment for DR_INFO is reachable by peeling
1604 a few loop iterations. Return false otherwise. */
1606 static bool
1607 vector_alignment_reachable_p (dr_vec_info *dr_info)
1609 stmt_vec_info stmt_info = dr_info->stmt;
1610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1614 /* For interleaved access we peel only if number of iterations in
1615 the prolog loop ({VF - misalignment}), is a multiple of the
1616 number of the interleaved accesses. */
1617 int elem_size, mis_in_elements;
1619 /* FORNOW: handle only known alignment. */
1620 if (!known_alignment_for_access_p (dr_info, vectype))
1621 return false;
1623 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1624 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1625 elem_size = vector_element_size (vector_size, nelements);
1626 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1628 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1629 return false;
1632 /* If misalignment is known at the compile time then allow peeling
1633 only if natural alignment is reachable through peeling. */
1634 if (known_alignment_for_access_p (dr_info, vectype)
1635 && !aligned_access_p (dr_info, vectype))
1637 HOST_WIDE_INT elmsize =
1638 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1639 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_NOTE, vect_location,
1642 "data size = %wd. misalignment = %d.\n", elmsize,
1643 dr_misalignment (dr_info, vectype));
1645 if (dr_misalignment (dr_info, vectype) % elmsize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 "data size does not divide the misalignment.\n");
1650 return false;
1654 if (!known_alignment_for_access_p (dr_info, vectype))
1656 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1657 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "Unknown misalignment, %snaturally aligned\n",
1661 is_packed ? "not " : "");
1662 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1665 return true;
1669 /* Calculate the cost of the memory access represented by DR_INFO. */
1671 static void
1672 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1673 dr_alignment_support alignment_support_scheme,
1674 int misalignment,
1675 unsigned int *inside_cost,
1676 unsigned int *outside_cost,
1677 stmt_vector_for_cost *body_cost_vec,
1678 stmt_vector_for_cost *prologue_cost_vec)
1680 stmt_vec_info stmt_info = dr_info->stmt;
1681 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1682 int ncopies;
1684 if (PURE_SLP_STMT (stmt_info))
1685 ncopies = 1;
1686 else
1687 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1689 if (DR_IS_READ (dr_info->dr))
1690 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1691 misalignment, true, inside_cost,
1692 outside_cost, prologue_cost_vec, body_cost_vec, false);
1693 else
1694 vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1695 misalignment, inside_cost, body_cost_vec);
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_NOTE, vect_location,
1699 "vect_get_data_access_cost: inside_cost = %d, "
1700 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1704 typedef struct _vect_peel_info
1706 dr_vec_info *dr_info;
1707 int npeel;
1708 unsigned int count;
1709 } *vect_peel_info;
1711 typedef struct _vect_peel_extended_info
1713 vec_info *vinfo;
1714 struct _vect_peel_info peel_info;
1715 unsigned int inside_cost;
1716 unsigned int outside_cost;
1717 } *vect_peel_extended_info;
1720 /* Peeling hashtable helpers. */
1722 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1724 static inline hashval_t hash (const _vect_peel_info *);
1725 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1728 inline hashval_t
1729 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1731 return (hashval_t) peel_info->npeel;
1734 inline bool
1735 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1737 return (a->npeel == b->npeel);
1741 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1743 static void
1744 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1745 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1746 int npeel, bool supportable_if_not_aligned)
1748 struct _vect_peel_info elem, *slot;
1749 _vect_peel_info **new_slot;
1751 elem.npeel = npeel;
1752 slot = peeling_htab->find (&elem);
1753 if (slot)
1754 slot->count++;
1755 else
1757 slot = XNEW (struct _vect_peel_info);
1758 slot->npeel = npeel;
1759 slot->dr_info = dr_info;
1760 slot->count = 1;
1761 new_slot = peeling_htab->find_slot (slot, INSERT);
1762 *new_slot = slot;
1765 /* If this DR is not supported with unknown misalignment then bias
1766 this slot when the cost model is disabled. */
1767 if (!supportable_if_not_aligned
1768 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1769 slot->count += VECT_MAX_COST;
1773 /* Traverse peeling hash table to find peeling option that aligns maximum
1774 number of data accesses. */
1777 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1778 _vect_peel_extended_info *max)
1780 vect_peel_info elem = *slot;
1782 if (elem->count > max->peel_info.count
1783 || (elem->count == max->peel_info.count
1784 && max->peel_info.npeel > elem->npeel))
1786 max->peel_info.npeel = elem->npeel;
1787 max->peel_info.count = elem->count;
1788 max->peel_info.dr_info = elem->dr_info;
1791 return 1;
1794 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1795 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1796 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1797 after peeling. */
1799 static void
1800 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1801 dr_vec_info *dr0_info,
1802 unsigned int *inside_cost,
1803 unsigned int *outside_cost,
1804 stmt_vector_for_cost *body_cost_vec,
1805 stmt_vector_for_cost *prologue_cost_vec,
1806 unsigned int npeel)
1808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1810 bool dr0_alignment_known_p
1811 = (dr0_info
1812 && known_alignment_for_access_p (dr0_info,
1813 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1815 for (data_reference *dr : datarefs)
1817 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1818 if (!vect_relevant_for_alignment_p (dr_info))
1819 continue;
1821 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1822 dr_alignment_support alignment_support_scheme;
1823 int misalignment;
1824 unsigned HOST_WIDE_INT alignment;
1826 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1827 size_zero_node) < 0;
1828 poly_int64 off = 0;
1829 if (negative)
1830 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1831 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1833 if (npeel == 0)
1834 misalignment = dr_misalignment (dr_info, vectype, off);
1835 else if (dr_info == dr0_info
1836 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1837 misalignment = 0;
1838 else if (!dr0_alignment_known_p
1839 || !known_alignment_for_access_p (dr_info, vectype)
1840 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1841 misalignment = DR_MISALIGNMENT_UNKNOWN;
1842 else
1844 misalignment = dr_misalignment (dr_info, vectype, off);
1845 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1846 misalignment &= alignment - 1;
1848 alignment_support_scheme
1849 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1850 misalignment);
1852 vect_get_data_access_cost (loop_vinfo, dr_info,
1853 alignment_support_scheme, misalignment,
1854 inside_cost, outside_cost,
1855 body_cost_vec, prologue_cost_vec);
1859 /* Traverse peeling hash table and calculate cost for each peeling option.
1860 Find the one with the lowest cost. */
1863 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1864 _vect_peel_extended_info *min)
1866 vect_peel_info elem = *slot;
1867 int dummy;
1868 unsigned int inside_cost = 0, outside_cost = 0;
1869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1870 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1871 epilogue_cost_vec;
1873 prologue_cost_vec.create (2);
1874 body_cost_vec.create (2);
1875 epilogue_cost_vec.create (2);
1877 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1878 &outside_cost, &body_cost_vec,
1879 &prologue_cost_vec, elem->npeel);
1881 body_cost_vec.release ();
1883 outside_cost += vect_get_known_peeling_cost
1884 (loop_vinfo, elem->npeel, &dummy,
1885 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1886 &prologue_cost_vec, &epilogue_cost_vec);
1888 /* Prologue and epilogue costs are added to the target model later.
1889 These costs depend only on the scalar iteration cost, the
1890 number of peeling iterations finally chosen, and the number of
1891 misaligned statements. So discard the information found here. */
1892 prologue_cost_vec.release ();
1893 epilogue_cost_vec.release ();
1895 if (inside_cost < min->inside_cost
1896 || (inside_cost == min->inside_cost
1897 && outside_cost < min->outside_cost))
1899 min->inside_cost = inside_cost;
1900 min->outside_cost = outside_cost;
1901 min->peel_info.dr_info = elem->dr_info;
1902 min->peel_info.npeel = elem->npeel;
1903 min->peel_info.count = elem->count;
1906 return 1;
1910 /* Choose best peeling option by traversing peeling hash table and either
1911 choosing an option with the lowest cost (if cost model is enabled) or the
1912 option that aligns as many accesses as possible. */
1914 static struct _vect_peel_extended_info
1915 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1916 loop_vec_info loop_vinfo)
1918 struct _vect_peel_extended_info res;
1920 res.peel_info.dr_info = NULL;
1921 res.vinfo = loop_vinfo;
1923 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1925 res.inside_cost = INT_MAX;
1926 res.outside_cost = INT_MAX;
1927 peeling_htab->traverse <_vect_peel_extended_info *,
1928 vect_peeling_hash_get_lowest_cost> (&res);
1930 else
1932 res.peel_info.count = 0;
1933 peeling_htab->traverse <_vect_peel_extended_info *,
1934 vect_peeling_hash_get_most_frequent> (&res);
1935 res.inside_cost = 0;
1936 res.outside_cost = 0;
1939 return res;
1942 /* Return true if the new peeling NPEEL is supported. */
1944 static bool
1945 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1946 unsigned npeel)
1948 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1949 enum dr_alignment_support supportable_dr_alignment;
1951 bool dr0_alignment_known_p
1952 = known_alignment_for_access_p (dr0_info,
1953 STMT_VINFO_VECTYPE (dr0_info->stmt));
1955 /* Ensure that all data refs can be vectorized after the peel. */
1956 for (data_reference *dr : datarefs)
1958 if (dr == dr0_info->dr)
1959 continue;
1961 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1962 if (!vect_relevant_for_alignment_p (dr_info)
1963 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1964 continue;
1966 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1967 int misalignment;
1968 unsigned HOST_WIDE_INT alignment;
1969 if (!dr0_alignment_known_p
1970 || !known_alignment_for_access_p (dr_info, vectype)
1971 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1972 misalignment = DR_MISALIGNMENT_UNKNOWN;
1973 else
1975 misalignment = dr_misalignment (dr_info, vectype);
1976 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1977 misalignment &= alignment - 1;
1979 supportable_dr_alignment
1980 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1981 misalignment);
1982 if (supportable_dr_alignment == dr_unaligned_unsupported)
1983 return false;
1986 return true;
1989 /* Compare two data-references DRA and DRB to group them into chunks
1990 with related alignment. */
1992 static int
1993 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1995 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1996 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1997 int cmp;
1999 /* Stabilize sort. */
2000 if (dra == drb)
2001 return 0;
2003 /* Ordering of DRs according to base. */
2004 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2005 DR_BASE_ADDRESS (drb));
2006 if (cmp != 0)
2007 return cmp;
2009 /* And according to DR_OFFSET. */
2010 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2011 if (cmp != 0)
2012 return cmp;
2014 /* And after step. */
2015 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2016 if (cmp != 0)
2017 return cmp;
2019 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2020 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2021 if (cmp == 0)
2022 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2023 return cmp;
2026 /* Function vect_enhance_data_refs_alignment
2028 This pass will use loop versioning and loop peeling in order to enhance
2029 the alignment of data references in the loop.
2031 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2032 original loop is to be vectorized. Any other loops that are created by
2033 the transformations performed in this pass - are not supposed to be
2034 vectorized. This restriction will be relaxed.
2036 This pass will require a cost model to guide it whether to apply peeling
2037 or versioning or a combination of the two. For example, the scheme that
2038 intel uses when given a loop with several memory accesses, is as follows:
2039 choose one memory access ('p') which alignment you want to force by doing
2040 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2041 other accesses are not necessarily aligned, or (2) use loop versioning to
2042 generate one loop in which all accesses are aligned, and another loop in
2043 which only 'p' is necessarily aligned.
2045 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2046 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
2047 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2049 Devising a cost model is the most critical aspect of this work. It will
2050 guide us on which access to peel for, whether to use loop versioning, how
2051 many versions to create, etc. The cost model will probably consist of
2052 generic considerations as well as target specific considerations (on
2053 powerpc for example, misaligned stores are more painful than misaligned
2054 loads).
2056 Here are the general steps involved in alignment enhancements:
2058 -- original loop, before alignment analysis:
2059 for (i=0; i<N; i++){
2060 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2061 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2064 -- After vect_compute_data_refs_alignment:
2065 for (i=0; i<N; i++){
2066 x = q[i]; # DR_MISALIGNMENT(q) = 3
2067 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2070 -- Possibility 1: we do loop versioning:
2071 if (p is aligned) {
2072 for (i=0; i<N; i++){ # loop 1A
2073 x = q[i]; # DR_MISALIGNMENT(q) = 3
2074 p[i] = y; # DR_MISALIGNMENT(p) = 0
2077 else {
2078 for (i=0; i<N; i++){ # loop 1B
2079 x = q[i]; # DR_MISALIGNMENT(q) = 3
2080 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2084 -- Possibility 2: we do loop peeling:
2085 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2086 x = q[i];
2087 p[i] = y;
2089 for (i = 3; i < N; i++){ # loop 2A
2090 x = q[i]; # DR_MISALIGNMENT(q) = 0
2091 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2094 -- Possibility 3: combination of loop peeling and versioning:
2095 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2096 x = q[i];
2097 p[i] = y;
2099 if (p is aligned) {
2100 for (i = 3; i<N; i++){ # loop 3A
2101 x = q[i]; # DR_MISALIGNMENT(q) = 0
2102 p[i] = y; # DR_MISALIGNMENT(p) = 0
2105 else {
2106 for (i = 3; i<N; i++){ # loop 3B
2107 x = q[i]; # DR_MISALIGNMENT(q) = 0
2108 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2112 These loops are later passed to loop_transform to be vectorized. The
2113 vectorizer will use the alignment information to guide the transformation
2114 (whether to generate regular loads/stores, or with special handling for
2115 misalignment). */
2117 opt_result
2118 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2120 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2121 dr_vec_info *first_store = NULL;
2122 dr_vec_info *dr0_info = NULL;
2123 struct data_reference *dr;
2124 unsigned int i;
2125 bool do_peeling = false;
2126 bool do_versioning = false;
2127 unsigned int npeel = 0;
2128 bool one_misalignment_known = false;
2129 bool one_misalignment_unknown = false;
2130 bool one_dr_unsupportable = false;
2131 dr_vec_info *unsupportable_dr_info = NULL;
2132 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2133 hash_table<peel_info_hasher> peeling_htab (1);
2135 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2137 /* Reset data so we can safely be called multiple times. */
2138 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2139 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2141 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2142 return opt_result::success ();
2144 /* Sort the vector of datarefs so DRs that have the same or dependent
2145 alignment are next to each other. */
2146 auto_vec<data_reference_p> datarefs
2147 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2148 datarefs.qsort (dr_align_group_sort_cmp);
2150 /* Compute the number of DRs that become aligned when we peel
2151 a dataref so it becomes aligned. */
2152 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2153 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2154 unsigned i0;
2155 for (i0 = 0; i0 < datarefs.length (); ++i0)
2156 if (DR_BASE_ADDRESS (datarefs[i0]))
2157 break;
2158 for (i = i0 + 1; i <= datarefs.length (); ++i)
2160 if (i == datarefs.length ()
2161 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2162 DR_BASE_ADDRESS (datarefs[i]), 0)
2163 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2164 DR_OFFSET (datarefs[i]), 0)
2165 || !operand_equal_p (DR_STEP (datarefs[i0]),
2166 DR_STEP (datarefs[i]), 0))
2168 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2169 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2170 will get known misalignment if we align one of the refs
2171 with the largest DR_TARGET_ALIGNMENT. */
2172 for (unsigned j = i0; j < i; ++j)
2174 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2175 for (unsigned k = i0; k < i; ++k)
2177 if (k == j)
2178 continue;
2179 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2180 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2181 dr_infoj))
2182 n_same_align_refs[j]++;
2185 i0 = i;
2189 /* While cost model enhancements are expected in the future, the high level
2190 view of the code at this time is as follows:
2192 A) If there is a misaligned access then see if peeling to align
2193 this access can make all data references satisfy
2194 vect_supportable_dr_alignment. If so, update data structures
2195 as needed and return true.
2197 B) If peeling wasn't possible and there is a data reference with an
2198 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2199 then see if loop versioning checks can be used to make all data
2200 references satisfy vect_supportable_dr_alignment. If so, update
2201 data structures as needed and return true.
2203 C) If neither peeling nor versioning were successful then return false if
2204 any data reference does not satisfy vect_supportable_dr_alignment.
2206 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2208 Note, Possibility 3 above (which is peeling and versioning together) is not
2209 being done at this time. */
2211 /* (1) Peeling to force alignment. */
2213 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2214 Considerations:
2215 + How many accesses will become aligned due to the peeling
2216 - How many accesses will become unaligned due to the peeling,
2217 and the cost of misaligned accesses.
2218 - The cost of peeling (the extra runtime checks, the increase
2219 in code size). */
2221 FOR_EACH_VEC_ELT (datarefs, i, dr)
2223 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2224 if (!vect_relevant_for_alignment_p (dr_info))
2225 continue;
2227 stmt_vec_info stmt_info = dr_info->stmt;
2228 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2229 do_peeling = vector_alignment_reachable_p (dr_info);
2230 if (do_peeling)
2232 if (known_alignment_for_access_p (dr_info, vectype))
2234 unsigned int npeel_tmp = 0;
2235 bool negative = tree_int_cst_compare (DR_STEP (dr),
2236 size_zero_node) < 0;
2238 /* If known_alignment_for_access_p then we have set
2239 DR_MISALIGNMENT which is only done if we know it at compiler
2240 time, so it is safe to assume target alignment is constant.
2242 unsigned int target_align =
2243 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2244 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2245 poly_int64 off = 0;
2246 if (negative)
2247 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2248 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 if (mis != 0)
2251 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2253 /* For multiple types, it is possible that the bigger type access
2254 will have more than one peeling option. E.g., a loop with two
2255 types: one of size (vector size / 4), and the other one of
2256 size (vector size / 8). Vectorization factor will 8. If both
2257 accesses are misaligned by 3, the first one needs one scalar
2258 iteration to be aligned, and the second one needs 5. But the
2259 first one will be aligned also by peeling 5 scalar
2260 iterations, and in that case both accesses will be aligned.
2261 Hence, except for the immediate peeling amount, we also want
2262 to try to add full vector size, while we don't exceed
2263 vectorization factor.
2264 We do this automatically for cost model, since we calculate
2265 cost for every peeling option. */
2266 poly_uint64 nscalars = npeel_tmp;
2267 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2269 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2270 nscalars = (STMT_SLP_TYPE (stmt_info)
2271 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2274 /* Save info about DR in the hash table. Also include peeling
2275 amounts according to the explanation above. Indicate
2276 the alignment status when the ref is not aligned.
2277 ??? Rather than using unknown alignment here we should
2278 prune all entries from the peeling hashtable which cause
2279 DRs to be not supported. */
2280 bool supportable_if_not_aligned
2281 = vect_supportable_dr_alignment
2282 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2283 while (known_le (npeel_tmp, nscalars))
2285 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2286 dr_info, npeel_tmp,
2287 supportable_if_not_aligned);
2288 npeel_tmp += MAX (1, target_align / dr_size);
2291 one_misalignment_known = true;
2293 else
2295 /* If we don't know any misalignment values, we prefer
2296 peeling for data-ref that has the maximum number of data-refs
2297 with the same alignment, unless the target prefers to align
2298 stores over load. */
2299 unsigned same_align_drs = n_same_align_refs[i];
2300 if (!dr0_info
2301 || dr0_same_align_drs < same_align_drs)
2303 dr0_same_align_drs = same_align_drs;
2304 dr0_info = dr_info;
2306 /* For data-refs with the same number of related
2307 accesses prefer the one where the misalign
2308 computation will be invariant in the outermost loop. */
2309 else if (dr0_same_align_drs == same_align_drs)
2311 class loop *ivloop0, *ivloop;
2312 ivloop0 = outermost_invariant_loop_for_expr
2313 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2314 ivloop = outermost_invariant_loop_for_expr
2315 (loop, DR_BASE_ADDRESS (dr));
2316 if ((ivloop && !ivloop0)
2317 || (ivloop && ivloop0
2318 && flow_loop_nested_p (ivloop, ivloop0)))
2319 dr0_info = dr_info;
2322 one_misalignment_unknown = true;
2324 /* Check for data refs with unsupportable alignment that
2325 can be peeled. */
2326 enum dr_alignment_support supportable_dr_alignment
2327 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2328 DR_MISALIGNMENT_UNKNOWN);
2329 if (supportable_dr_alignment == dr_unaligned_unsupported)
2331 one_dr_unsupportable = true;
2332 unsupportable_dr_info = dr_info;
2335 if (!first_store && DR_IS_WRITE (dr))
2337 first_store = dr_info;
2338 first_store_same_align_drs = same_align_drs;
2342 else
2344 if (!aligned_access_p (dr_info, vectype))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 "vector alignment may not be reachable\n");
2349 break;
2354 /* Check if we can possibly peel the loop. */
2355 if (!vect_can_advance_ivs_p (loop_vinfo)
2356 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2357 LOOP_VINFO_IV_EXIT (loop_vinfo))
2358 || loop->inner)
2359 do_peeling = false;
2361 struct _vect_peel_extended_info peel_for_known_alignment;
2362 struct _vect_peel_extended_info peel_for_unknown_alignment;
2363 struct _vect_peel_extended_info best_peel;
2365 peel_for_unknown_alignment.inside_cost = INT_MAX;
2366 peel_for_unknown_alignment.outside_cost = INT_MAX;
2367 peel_for_unknown_alignment.peel_info.count = 0;
2369 if (do_peeling
2370 && one_misalignment_unknown)
2372 /* Check if the target requires to prefer stores over loads, i.e., if
2373 misaligned stores are more expensive than misaligned loads (taking
2374 drs with same alignment into account). */
2375 unsigned int load_inside_cost = 0;
2376 unsigned int load_outside_cost = 0;
2377 unsigned int store_inside_cost = 0;
2378 unsigned int store_outside_cost = 0;
2379 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2381 stmt_vector_for_cost dummy;
2382 dummy.create (2);
2383 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2384 &load_inside_cost,
2385 &load_outside_cost,
2386 &dummy, &dummy, estimated_npeels);
2387 dummy.release ();
2389 if (first_store)
2391 dummy.create (2);
2392 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2393 &store_inside_cost,
2394 &store_outside_cost,
2395 &dummy, &dummy,
2396 estimated_npeels);
2397 dummy.release ();
2399 else
2401 store_inside_cost = INT_MAX;
2402 store_outside_cost = INT_MAX;
2405 if (load_inside_cost > store_inside_cost
2406 || (load_inside_cost == store_inside_cost
2407 && load_outside_cost > store_outside_cost))
2409 dr0_info = first_store;
2410 dr0_same_align_drs = first_store_same_align_drs;
2411 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2412 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2414 else
2416 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2417 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2420 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2421 prologue_cost_vec.create (2);
2422 epilogue_cost_vec.create (2);
2424 int dummy2;
2425 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2426 (loop_vinfo, estimated_npeels, &dummy2,
2427 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2428 &prologue_cost_vec, &epilogue_cost_vec);
2430 prologue_cost_vec.release ();
2431 epilogue_cost_vec.release ();
2433 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2436 peel_for_unknown_alignment.peel_info.npeel = 0;
2437 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2439 best_peel = peel_for_unknown_alignment;
2441 peel_for_known_alignment.inside_cost = INT_MAX;
2442 peel_for_known_alignment.outside_cost = INT_MAX;
2443 peel_for_known_alignment.peel_info.count = 0;
2444 peel_for_known_alignment.peel_info.dr_info = NULL;
2446 if (do_peeling && one_misalignment_known)
2448 /* Peeling is possible, but there is no data access that is not supported
2449 unless aligned. So we try to choose the best possible peeling from
2450 the hash table. */
2451 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2452 (&peeling_htab, loop_vinfo);
2455 /* Compare costs of peeling for known and unknown alignment. */
2456 if (peel_for_known_alignment.peel_info.dr_info != NULL
2457 && peel_for_unknown_alignment.inside_cost
2458 >= peel_for_known_alignment.inside_cost)
2460 best_peel = peel_for_known_alignment;
2462 /* If the best peeling for known alignment has NPEEL == 0, perform no
2463 peeling at all except if there is an unsupportable dr that we can
2464 align. */
2465 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2466 do_peeling = false;
2469 /* If there is an unsupportable data ref, prefer this over all choices so far
2470 since we'd have to discard a chosen peeling except when it accidentally
2471 aligned the unsupportable data ref. */
2472 if (one_dr_unsupportable)
2473 dr0_info = unsupportable_dr_info;
2474 else if (do_peeling)
2476 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2477 TODO: Use nopeel_outside_cost or get rid of it? */
2478 unsigned nopeel_inside_cost = 0;
2479 unsigned nopeel_outside_cost = 0;
2481 stmt_vector_for_cost dummy;
2482 dummy.create (2);
2483 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2484 &nopeel_outside_cost, &dummy, &dummy, 0);
2485 dummy.release ();
2487 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2488 costs will be recorded. */
2489 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2490 prologue_cost_vec.create (2);
2491 epilogue_cost_vec.create (2);
2493 int dummy2;
2494 nopeel_outside_cost += vect_get_known_peeling_cost
2495 (loop_vinfo, 0, &dummy2,
2496 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2497 &prologue_cost_vec, &epilogue_cost_vec);
2499 prologue_cost_vec.release ();
2500 epilogue_cost_vec.release ();
2502 npeel = best_peel.peel_info.npeel;
2503 dr0_info = best_peel.peel_info.dr_info;
2505 /* If no peeling is not more expensive than the best peeling we
2506 have so far, don't perform any peeling. */
2507 if (nopeel_inside_cost <= best_peel.inside_cost)
2508 do_peeling = false;
2511 if (do_peeling)
2513 stmt_vec_info stmt_info = dr0_info->stmt;
2514 if (known_alignment_for_access_p (dr0_info,
2515 STMT_VINFO_VECTYPE (stmt_info)))
2517 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2518 size_zero_node) < 0;
2519 if (!npeel)
2521 /* Since it's known at compile time, compute the number of
2522 iterations in the peeled loop (the peeling factor) for use in
2523 updating DR_MISALIGNMENT values. The peeling factor is the
2524 vectorization factor minus the misalignment as an element
2525 count. */
2526 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2527 poly_int64 off = 0;
2528 if (negative)
2529 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2530 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2531 unsigned int mis
2532 = dr_misalignment (dr0_info, vectype, off);
2533 mis = negative ? mis : -mis;
2534 /* If known_alignment_for_access_p then we have set
2535 DR_MISALIGNMENT which is only done if we know it at compiler
2536 time, so it is safe to assume target alignment is constant.
2538 unsigned int target_align =
2539 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2540 npeel = ((mis & (target_align - 1))
2541 / vect_get_scalar_dr_size (dr0_info));
2544 /* For interleaved data access every iteration accesses all the
2545 members of the group, therefore we divide the number of iterations
2546 by the group size. */
2547 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2548 npeel /= DR_GROUP_SIZE (stmt_info);
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_NOTE, vect_location,
2552 "Try peeling by %d\n", npeel);
2555 /* Ensure that all datarefs can be vectorized after the peel. */
2556 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2557 do_peeling = false;
2559 /* Check if all datarefs are supportable and log. */
2560 if (do_peeling
2561 && npeel == 0
2562 && known_alignment_for_access_p (dr0_info,
2563 STMT_VINFO_VECTYPE (stmt_info)))
2564 return opt_result::success ();
2566 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2567 if (do_peeling)
2569 unsigned max_allowed_peel
2570 = param_vect_max_peeling_for_alignment;
2571 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2572 max_allowed_peel = 0;
2573 if (max_allowed_peel != (unsigned)-1)
2575 unsigned max_peel = npeel;
2576 if (max_peel == 0)
2578 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2579 unsigned HOST_WIDE_INT target_align_c;
2580 if (target_align.is_constant (&target_align_c))
2581 max_peel =
2582 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2583 else
2585 do_peeling = false;
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_NOTE, vect_location,
2588 "Disable peeling, max peels set and vector"
2589 " alignment unknown\n");
2592 if (max_peel > max_allowed_peel)
2594 do_peeling = false;
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_NOTE, vect_location,
2597 "Disable peeling, max peels reached: %d\n", max_peel);
2602 /* Cost model #2 - if peeling may result in a remaining loop not
2603 iterating enough to be vectorized then do not peel. Since this
2604 is a cost heuristic rather than a correctness decision, use the
2605 most likely runtime value for variable vectorization factors. */
2606 if (do_peeling
2607 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2609 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2610 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2611 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2612 < assumed_vf + max_peel)
2613 do_peeling = false;
2616 if (do_peeling)
2618 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2619 If the misalignment of DR_i is identical to that of dr0 then set
2620 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2621 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2622 by the peeling factor times the element size of DR_i (MOD the
2623 vectorization factor times the size). Otherwise, the
2624 misalignment of DR_i must be set to unknown. */
2625 FOR_EACH_VEC_ELT (datarefs, i, dr)
2626 if (dr != dr0_info->dr)
2628 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2629 if (!vect_relevant_for_alignment_p (dr_info))
2630 continue;
2632 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2635 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2636 if (npeel)
2637 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2638 else
2639 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2640 SET_DR_MISALIGNMENT (dr0_info,
2641 vect_dr_misalign_for_aligned_access (dr0_info));
2642 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location,
2645 "Alignment of access forced using peeling.\n");
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "Peeling for alignment will be applied.\n");
2650 /* The inside-loop cost will be accounted for in vectorizable_load
2651 and vectorizable_store correctly with adjusted alignments.
2652 Drop the body_cst_vec on the floor here. */
2653 return opt_result::success ();
2657 /* (2) Versioning to force alignment. */
2659 /* Try versioning if:
2660 1) optimize loop for speed and the cost-model is not cheap
2661 2) there is at least one unsupported misaligned data ref with an unknown
2662 misalignment, and
2663 3) all misaligned data refs with a known misalignment are supported, and
2664 4) the number of runtime alignment checks is within reason. */
2666 do_versioning
2667 = (optimize_loop_nest_for_speed_p (loop)
2668 && !loop->inner /* FORNOW */
2669 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2671 if (do_versioning)
2673 FOR_EACH_VEC_ELT (datarefs, i, dr)
2675 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2676 if (!vect_relevant_for_alignment_p (dr_info))
2677 continue;
2679 stmt_vec_info stmt_info = dr_info->stmt;
2680 if (STMT_VINFO_STRIDED_P (stmt_info))
2682 do_versioning = false;
2683 break;
2686 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2687 bool negative = tree_int_cst_compare (DR_STEP (dr),
2688 size_zero_node) < 0;
2689 poly_int64 off = 0;
2690 if (negative)
2691 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2692 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2693 int misalignment;
2694 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2695 continue;
2697 enum dr_alignment_support supportable_dr_alignment
2698 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2699 misalignment);
2700 if (supportable_dr_alignment == dr_unaligned_unsupported)
2702 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2703 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2704 >= (unsigned) param_vect_max_version_for_alignment_checks))
2706 do_versioning = false;
2707 break;
2710 /* At present we don't support versioning for alignment
2711 with variable VF, since there's no guarantee that the
2712 VF is a power of two. We could relax this if we added
2713 a way of enforcing a power-of-two size. */
2714 unsigned HOST_WIDE_INT size;
2715 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2717 do_versioning = false;
2718 break;
2721 /* Forcing alignment in the first iteration is no good if
2722 we don't keep it across iterations. For now, just disable
2723 versioning in this case.
2724 ?? We could actually unroll the loop to achieve the required
2725 overall step alignment, and forcing the alignment could be
2726 done by doing some iterations of the non-vectorized loop. */
2727 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2728 * DR_STEP_ALIGNMENT (dr),
2729 DR_TARGET_ALIGNMENT (dr_info)))
2731 do_versioning = false;
2732 break;
2735 /* The rightmost bits of an aligned address must be zeros.
2736 Construct the mask needed for this test. For example,
2737 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2738 mask must be 15 = 0xf. */
2739 int mask = size - 1;
2741 /* FORNOW: use the same mask to test all potentially unaligned
2742 references in the loop. */
2743 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2744 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2746 do_versioning = false;
2747 break;
2750 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2751 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2755 /* Versioning requires at least one misaligned data reference. */
2756 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2757 do_versioning = false;
2758 else if (!do_versioning)
2759 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2762 if (do_versioning)
2764 const vec<stmt_vec_info> &may_misalign_stmts
2765 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2766 stmt_vec_info stmt_info;
2768 /* It can now be assumed that the data references in the statements
2769 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2770 of the loop being vectorized. */
2771 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2773 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2774 SET_DR_MISALIGNMENT (dr_info,
2775 vect_dr_misalign_for_aligned_access (dr_info));
2776 if (dump_enabled_p ())
2777 dump_printf_loc (MSG_NOTE, vect_location,
2778 "Alignment of access forced using versioning.\n");
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_NOTE, vect_location,
2783 "Versioning for alignment will be applied.\n");
2785 /* Peeling and versioning can't be done together at this time. */
2786 gcc_assert (! (do_peeling && do_versioning));
2788 return opt_result::success ();
2791 /* This point is reached if neither peeling nor versioning is being done. */
2792 gcc_assert (! (do_peeling || do_versioning));
2794 return opt_result::success ();
2798 /* Function vect_analyze_data_refs_alignment
2800 Analyze the alignment of the data-references in the loop.
2801 Return FALSE if a data reference is found that cannot be vectorized. */
2803 opt_result
2804 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2806 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2809 struct data_reference *dr;
2810 unsigned int i;
2812 vect_record_base_alignments (loop_vinfo);
2813 FOR_EACH_VEC_ELT (datarefs, i, dr)
2815 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2816 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2818 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2819 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2820 continue;
2821 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2822 STMT_VINFO_VECTYPE (dr_info->stmt));
2826 return opt_result::success ();
2830 /* Analyze alignment of DRs of stmts in NODE. */
2832 static bool
2833 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2835 /* Alignment is maintained in the first element of the group. */
2836 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2837 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2838 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2839 tree vectype = SLP_TREE_VECTYPE (node);
2840 poly_uint64 vector_alignment
2841 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2842 BITS_PER_UNIT);
2843 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2844 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2845 /* Re-analyze alignment when we're facing a vectorization with a bigger
2846 alignment requirement. */
2847 else if (known_lt (dr_info->target_alignment, vector_alignment))
2849 poly_uint64 old_target_alignment = dr_info->target_alignment;
2850 int old_misalignment = dr_info->misalignment;
2851 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2852 /* But keep knowledge about a smaller alignment. */
2853 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2854 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2856 dr_info->target_alignment = old_target_alignment;
2857 dr_info->misalignment = old_misalignment;
2860 /* When we ever face unordered target alignments the first one wins in terms
2861 of analyzing and the other will become unknown in dr_misalignment. */
2862 return true;
2865 /* Function vect_slp_analyze_instance_alignment
2867 Analyze the alignment of the data-references in the SLP instance.
2868 Return FALSE if a data reference is found that cannot be vectorized. */
2870 bool
2871 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2872 slp_instance instance)
2874 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2876 slp_tree node;
2877 unsigned i;
2878 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2879 if (! vect_slp_analyze_node_alignment (vinfo, node))
2880 return false;
2882 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2883 && ! vect_slp_analyze_node_alignment
2884 (vinfo, SLP_INSTANCE_TREE (instance)))
2885 return false;
2887 return true;
2891 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2892 accesses of legal size, step, etc. Detect gaps, single element
2893 interleaving, and other special cases. Set grouped access info.
2894 Collect groups of strided stores for further use in SLP analysis.
2895 Worker for vect_analyze_group_access. */
2897 static bool
2898 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2900 data_reference *dr = dr_info->dr;
2901 tree step = DR_STEP (dr);
2902 tree scalar_type = TREE_TYPE (DR_REF (dr));
2903 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2904 stmt_vec_info stmt_info = dr_info->stmt;
2905 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2906 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2907 HOST_WIDE_INT dr_step = -1;
2908 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2909 bool slp_impossible = false;
2911 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2912 size of the interleaving group (including gaps). */
2913 if (tree_fits_shwi_p (step))
2915 dr_step = tree_to_shwi (step);
2916 /* Check that STEP is a multiple of type size. Otherwise there is
2917 a non-element-sized gap at the end of the group which we
2918 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2919 ??? As we can handle non-constant step fine here we should
2920 simply remove uses of DR_GROUP_GAP between the last and first
2921 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2922 simply not include that gap. */
2923 if ((dr_step % type_size) != 0)
2925 if (dump_enabled_p ())
2926 dump_printf_loc (MSG_NOTE, vect_location,
2927 "Step %T is not a multiple of the element size"
2928 " for %T\n",
2929 step, DR_REF (dr));
2930 return false;
2932 groupsize = absu_hwi (dr_step) / type_size;
2934 else
2935 groupsize = 0;
2937 /* Not consecutive access is possible only if it is a part of interleaving. */
2938 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2940 /* Check if it this DR is a part of interleaving, and is a single
2941 element of the group that is accessed in the loop. */
2943 /* Gaps are supported only for loads. STEP must be a multiple of the type
2944 size. */
2945 if (DR_IS_READ (dr)
2946 && (dr_step % type_size) == 0
2947 && groupsize > 0
2948 /* This could be UINT_MAX but as we are generating code in a very
2949 inefficient way we have to cap earlier.
2950 See PR91403 for example. */
2951 && groupsize <= 4096)
2953 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2954 DR_GROUP_SIZE (stmt_info) = groupsize;
2955 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2956 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_NOTE, vect_location,
2958 "Detected single element interleaving %T"
2959 " step %T\n",
2960 DR_REF (dr), step);
2962 return true;
2965 if (dump_enabled_p ())
2966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2967 "not consecutive access %G", stmt_info->stmt);
2969 if (bb_vinfo)
2971 /* Mark the statement as unvectorizable. */
2972 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2973 return true;
2976 if (dump_enabled_p ())
2977 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2978 STMT_VINFO_STRIDED_P (stmt_info) = true;
2979 return true;
2982 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2984 /* First stmt in the interleaving chain. Check the chain. */
2985 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2986 struct data_reference *data_ref = dr;
2987 unsigned int count = 1;
2988 tree prev_init = DR_INIT (data_ref);
2989 HOST_WIDE_INT diff, gaps = 0;
2991 /* By construction, all group members have INTEGER_CST DR_INITs. */
2992 while (next)
2994 /* We never have the same DR multiple times. */
2995 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2996 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2998 data_ref = STMT_VINFO_DATA_REF (next);
3000 /* All group members have the same STEP by construction. */
3001 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3003 /* Check that the distance between two accesses is equal to the type
3004 size. Otherwise, we have gaps. */
3005 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3006 - TREE_INT_CST_LOW (prev_init)) / type_size;
3007 if (diff < 1 || diff > UINT_MAX)
3009 /* For artificial testcases with array accesses with large
3010 constant indices we can run into overflow issues which
3011 can end up fooling the groupsize constraint below so
3012 check the individual gaps (which are represented as
3013 unsigned int) as well. */
3014 if (dump_enabled_p ())
3015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3016 "interleaved access with gap larger "
3017 "than representable\n");
3018 return false;
3020 if (diff != 1)
3022 /* FORNOW: SLP of accesses with gaps is not supported. */
3023 slp_impossible = true;
3024 if (DR_IS_WRITE (data_ref))
3026 if (dump_enabled_p ())
3027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3028 "interleaved store with gaps\n");
3029 return false;
3032 gaps += diff - 1;
3035 last_accessed_element += diff;
3037 /* Store the gap from the previous member of the group. If there is no
3038 gap in the access, DR_GROUP_GAP is always 1. */
3039 DR_GROUP_GAP (next) = diff;
3041 prev_init = DR_INIT (data_ref);
3042 next = DR_GROUP_NEXT_ELEMENT (next);
3043 /* Count the number of data-refs in the chain. */
3044 count++;
3047 if (groupsize == 0)
3048 groupsize = count + gaps;
3050 /* This could be UINT_MAX but as we are generating code in a very
3051 inefficient way we have to cap earlier. See PR78699 for example. */
3052 if (groupsize > 4096)
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3056 "group is too large\n");
3057 return false;
3060 /* Check that the size of the interleaving is equal to count for stores,
3061 i.e., that there are no gaps. */
3062 if (groupsize != count
3063 && !DR_IS_READ (dr))
3065 groupsize = count;
3066 STMT_VINFO_STRIDED_P (stmt_info) = true;
3069 /* If there is a gap after the last load in the group it is the
3070 difference between the groupsize and the last accessed
3071 element.
3072 When there is no gap, this difference should be 0. */
3073 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3075 DR_GROUP_SIZE (stmt_info) = groupsize;
3076 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "Detected interleaving ");
3080 if (DR_IS_READ (dr))
3081 dump_printf (MSG_NOTE, "load ");
3082 else if (STMT_VINFO_STRIDED_P (stmt_info))
3083 dump_printf (MSG_NOTE, "strided store ");
3084 else
3085 dump_printf (MSG_NOTE, "store ");
3086 dump_printf (MSG_NOTE, "of size %u\n",
3087 (unsigned)groupsize);
3088 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3089 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3090 while (next)
3092 if (DR_GROUP_GAP (next) != 1)
3093 dump_printf_loc (MSG_NOTE, vect_location,
3094 "\t<gap of %d elements>\n",
3095 DR_GROUP_GAP (next) - 1);
3096 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3097 next = DR_GROUP_NEXT_ELEMENT (next);
3099 if (DR_GROUP_GAP (stmt_info) != 0)
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "\t<gap of %d elements>\n",
3102 DR_GROUP_GAP (stmt_info));
3105 /* SLP: create an SLP data structure for every interleaving group of
3106 stores for further analysis in vect_analyse_slp. */
3107 if (DR_IS_WRITE (dr) && !slp_impossible)
3109 if (loop_vinfo)
3110 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3111 if (bb_vinfo)
3112 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3116 return true;
3119 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3120 accesses of legal size, step, etc. Detect gaps, single element
3121 interleaving, and other special cases. Set grouped access info.
3122 Collect groups of strided stores for further use in SLP analysis. */
3124 static bool
3125 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3127 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3129 /* Dissolve the group if present. */
3130 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3131 while (stmt_info)
3133 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3134 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3135 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3136 stmt_info = next;
3138 return false;
3140 return true;
3143 /* Analyze the access pattern of the data-reference DR_INFO.
3144 In case of non-consecutive accesses call vect_analyze_group_access() to
3145 analyze groups of accesses. */
3147 static bool
3148 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3150 data_reference *dr = dr_info->dr;
3151 tree step = DR_STEP (dr);
3152 tree scalar_type = TREE_TYPE (DR_REF (dr));
3153 stmt_vec_info stmt_info = dr_info->stmt;
3154 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3155 class loop *loop = NULL;
3157 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3158 return true;
3160 if (loop_vinfo)
3161 loop = LOOP_VINFO_LOOP (loop_vinfo);
3163 if (loop_vinfo && !step)
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3167 "bad data-ref access in loop\n");
3168 return false;
3171 /* Allow loads with zero step in inner-loop vectorization. */
3172 if (loop_vinfo && integer_zerop (step))
3174 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3175 if (!nested_in_vect_loop_p (loop, stmt_info))
3176 return DR_IS_READ (dr);
3177 /* Allow references with zero step for outer loops marked
3178 with pragma omp simd only - it guarantees absence of
3179 loop-carried dependencies between inner loop iterations. */
3180 if (loop->safelen < 2)
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "zero step in inner loop of nest\n");
3185 return false;
3189 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3191 /* Interleaved accesses are not yet supported within outer-loop
3192 vectorization for references in the inner-loop. */
3193 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3195 /* For the rest of the analysis we use the outer-loop step. */
3196 step = STMT_VINFO_DR_STEP (stmt_info);
3197 if (integer_zerop (step))
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_NOTE, vect_location,
3201 "zero step in outer loop.\n");
3202 return DR_IS_READ (dr);
3206 /* Consecutive? */
3207 if (TREE_CODE (step) == INTEGER_CST)
3209 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3210 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3211 || (dr_step < 0
3212 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3214 /* Mark that it is not interleaving. */
3215 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3216 return true;
3220 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3222 if (dump_enabled_p ())
3223 dump_printf_loc (MSG_NOTE, vect_location,
3224 "grouped access in outer loop.\n");
3225 return false;
3229 /* Assume this is a DR handled by non-constant strided load case. */
3230 if (TREE_CODE (step) != INTEGER_CST)
3231 return (STMT_VINFO_STRIDED_P (stmt_info)
3232 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3233 || vect_analyze_group_access (vinfo, dr_info)));
3235 /* Not consecutive access - check if it's a part of interleaving group. */
3236 return vect_analyze_group_access (vinfo, dr_info);
3239 /* Compare two data-references DRA and DRB to group them into chunks
3240 suitable for grouping. */
3242 static int
3243 dr_group_sort_cmp (const void *dra_, const void *drb_)
3245 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3246 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3247 data_reference_p dra = dra_info->dr;
3248 data_reference_p drb = drb_info->dr;
3249 int cmp;
3251 /* Stabilize sort. */
3252 if (dra == drb)
3253 return 0;
3255 /* Different group IDs lead never belong to the same group. */
3256 if (dra_info->group != drb_info->group)
3257 return dra_info->group < drb_info->group ? -1 : 1;
3259 /* Ordering of DRs according to base. */
3260 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3261 DR_BASE_ADDRESS (drb));
3262 if (cmp != 0)
3263 return cmp;
3265 /* And according to DR_OFFSET. */
3266 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3267 if (cmp != 0)
3268 return cmp;
3270 /* Put reads before writes. */
3271 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3272 return DR_IS_READ (dra) ? -1 : 1;
3274 /* Then sort after access size. */
3275 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3276 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3277 if (cmp != 0)
3278 return cmp;
3280 /* And after step. */
3281 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3282 if (cmp != 0)
3283 return cmp;
3285 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3286 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3287 if (cmp == 0)
3288 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3289 return cmp;
3292 /* If OP is the result of a conversion, return the unconverted value,
3293 otherwise return null. */
3295 static tree
3296 strip_conversion (tree op)
3298 if (TREE_CODE (op) != SSA_NAME)
3299 return NULL_TREE;
3300 gimple *stmt = SSA_NAME_DEF_STMT (op);
3301 if (!is_gimple_assign (stmt)
3302 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3303 return NULL_TREE;
3304 return gimple_assign_rhs1 (stmt);
3307 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3308 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3309 be grouped in SLP mode. */
3311 static bool
3312 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3313 bool allow_slp_p)
3315 if (gimple_assign_single_p (stmt1_info->stmt))
3316 return gimple_assign_single_p (stmt2_info->stmt);
3318 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3319 if (call1 && gimple_call_internal_p (call1))
3321 /* Check for two masked loads or two masked stores. */
3322 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3323 if (!call2 || !gimple_call_internal_p (call2))
3324 return false;
3325 internal_fn ifn = gimple_call_internal_fn (call1);
3326 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3327 return false;
3328 if (ifn != gimple_call_internal_fn (call2))
3329 return false;
3331 /* Check that the masks are the same. Cope with casts of masks,
3332 like those created by build_mask_conversion. */
3333 tree mask1 = gimple_call_arg (call1, 2);
3334 tree mask2 = gimple_call_arg (call2, 2);
3335 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3337 mask1 = strip_conversion (mask1);
3338 if (!mask1)
3339 return false;
3340 mask2 = strip_conversion (mask2);
3341 if (!mask2)
3342 return false;
3343 if (!operand_equal_p (mask1, mask2, 0))
3344 return false;
3346 return true;
3349 return false;
3352 /* Function vect_analyze_data_ref_accesses.
3354 Analyze the access pattern of all the data references in the loop.
3356 FORNOW: the only access pattern that is considered vectorizable is a
3357 simple step 1 (consecutive) access.
3359 FORNOW: handle only arrays and pointer accesses. */
3361 opt_result
3362 vect_analyze_data_ref_accesses (vec_info *vinfo,
3363 vec<int> *dataref_groups)
3365 unsigned int i;
3366 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3368 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3370 if (datarefs.is_empty ())
3371 return opt_result::success ();
3373 /* Sort the array of datarefs to make building the interleaving chains
3374 linear. Don't modify the original vector's order, it is needed for
3375 determining what dependencies are reversed. */
3376 vec<dr_vec_info *> datarefs_copy;
3377 datarefs_copy.create (datarefs.length ());
3378 for (unsigned i = 0; i < datarefs.length (); i++)
3380 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3381 /* If the caller computed DR grouping use that, otherwise group by
3382 basic blocks. */
3383 if (dataref_groups)
3384 dr_info->group = (*dataref_groups)[i];
3385 else
3386 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3387 datarefs_copy.quick_push (dr_info);
3389 datarefs_copy.qsort (dr_group_sort_cmp);
3390 hash_set<stmt_vec_info> to_fixup;
3392 /* Build the interleaving chains. */
3393 for (i = 0; i < datarefs_copy.length () - 1;)
3395 dr_vec_info *dr_info_a = datarefs_copy[i];
3396 data_reference_p dra = dr_info_a->dr;
3397 int dra_group_id = dr_info_a->group;
3398 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3399 stmt_vec_info lastinfo = NULL;
3400 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3401 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3403 ++i;
3404 continue;
3406 for (i = i + 1; i < datarefs_copy.length (); ++i)
3408 dr_vec_info *dr_info_b = datarefs_copy[i];
3409 data_reference_p drb = dr_info_b->dr;
3410 int drb_group_id = dr_info_b->group;
3411 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3412 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3413 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3414 break;
3416 /* ??? Imperfect sorting (non-compatible types, non-modulo
3417 accesses, same accesses) can lead to a group to be artificially
3418 split here as we don't just skip over those. If it really
3419 matters we can push those to a worklist and re-iterate
3420 over them. The we can just skip ahead to the next DR here. */
3422 /* DRs in a different DR group should not be put into the same
3423 interleaving group. */
3424 if (dra_group_id != drb_group_id)
3425 break;
3427 /* Check that the data-refs have same first location (except init)
3428 and they are both either store or load (not load and store,
3429 not masked loads or stores). */
3430 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3431 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3432 DR_BASE_ADDRESS (drb)) != 0
3433 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3434 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3435 break;
3437 /* Check that the data-refs have the same constant size. */
3438 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3439 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3440 if (!tree_fits_uhwi_p (sza)
3441 || !tree_fits_uhwi_p (szb)
3442 || !tree_int_cst_equal (sza, szb))
3443 break;
3445 /* Check that the data-refs have the same step. */
3446 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3447 break;
3449 /* Check the types are compatible.
3450 ??? We don't distinguish this during sorting. */
3451 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3452 TREE_TYPE (DR_REF (drb))))
3453 break;
3455 /* Check that the DR_INITs are compile-time constants. */
3456 if (!tree_fits_shwi_p (DR_INIT (dra))
3457 || !tree_fits_shwi_p (DR_INIT (drb)))
3458 break;
3460 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3461 just hold extra information. */
3462 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3463 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3464 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3465 break;
3467 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3468 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3469 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3470 HOST_WIDE_INT init_prev
3471 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3472 gcc_assert (init_a <= init_b
3473 && init_a <= init_prev
3474 && init_prev <= init_b);
3476 /* Do not place the same access in the interleaving chain twice. */
3477 if (init_b == init_prev)
3479 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3480 < gimple_uid (DR_STMT (drb)));
3481 /* Simply link in duplicates and fix up the chain below. */
3483 else
3485 /* If init_b == init_a + the size of the type * k, we have an
3486 interleaving, and DRA is accessed before DRB. */
3487 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3488 if (type_size_a == 0
3489 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3490 % type_size_a != 0))
3491 break;
3493 /* If we have a store, the accesses are adjacent. This splits
3494 groups into chunks we support (we don't support vectorization
3495 of stores with gaps). */
3496 if (!DR_IS_READ (dra)
3497 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3498 != type_size_a))
3499 break;
3501 /* If the step (if not zero or non-constant) is smaller than the
3502 difference between data-refs' inits this splits groups into
3503 suitable sizes. */
3504 if (tree_fits_shwi_p (DR_STEP (dra)))
3506 unsigned HOST_WIDE_INT step
3507 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3508 if (step != 0
3509 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3510 break;
3514 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE, vect_location,
3516 DR_IS_READ (dra)
3517 ? "Detected interleaving load %T and %T\n"
3518 : "Detected interleaving store %T and %T\n",
3519 DR_REF (dra), DR_REF (drb));
3521 /* Link the found element into the group list. */
3522 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3524 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3525 lastinfo = stmtinfo_a;
3527 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3528 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3529 lastinfo = stmtinfo_b;
3531 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3532 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3534 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3535 dump_printf_loc (MSG_NOTE, vect_location,
3536 "Load suitable for SLP vectorization only.\n");
3538 if (init_b == init_prev
3539 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3540 && dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "Queuing group with duplicate access for fixup\n");
3546 /* Fixup groups with duplicate entries by splitting it. */
3547 while (1)
3549 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3550 if (!(it != to_fixup.end ()))
3551 break;
3552 stmt_vec_info grp = *it;
3553 to_fixup.remove (grp);
3555 /* Find the earliest duplicate group member. */
3556 unsigned first_duplicate = -1u;
3557 stmt_vec_info next, g = grp;
3558 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3560 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3561 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3562 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3563 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3564 g = next;
3566 if (first_duplicate == -1U)
3567 continue;
3569 /* Then move all stmts after the first duplicate to a new group.
3570 Note this is a heuristic but one with the property that *it
3571 is fixed up completely. */
3572 g = grp;
3573 stmt_vec_info newgroup = NULL, ng = grp;
3574 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3576 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3578 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3579 if (!newgroup)
3580 newgroup = next;
3581 else
3582 DR_GROUP_NEXT_ELEMENT (ng) = next;
3583 ng = next;
3584 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3586 else
3587 g = DR_GROUP_NEXT_ELEMENT (g);
3589 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3591 /* Fixup the new group which still may contain duplicates. */
3592 to_fixup.add (newgroup);
3595 dr_vec_info *dr_info;
3596 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3598 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3599 && !vect_analyze_data_ref_access (vinfo, dr_info))
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "not vectorized: complicated access pattern.\n");
3605 if (is_a <bb_vec_info> (vinfo))
3607 /* Mark the statement as not vectorizable. */
3608 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3609 continue;
3611 else
3613 datarefs_copy.release ();
3614 return opt_result::failure_at (dr_info->stmt->stmt,
3615 "not vectorized:"
3616 " complicated access pattern.\n");
3621 datarefs_copy.release ();
3622 return opt_result::success ();
3625 /* Function vect_vfa_segment_size.
3627 Input:
3628 DR_INFO: The data reference.
3629 LENGTH_FACTOR: segment length to consider.
3631 Return a value suitable for the dr_with_seg_len::seg_len field.
3632 This is the "distance travelled" by the pointer from the first
3633 iteration in the segment to the last. Note that it does not include
3634 the size of the access; in effect it only describes the first byte. */
3636 static tree
3637 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3639 length_factor = size_binop (MINUS_EXPR,
3640 fold_convert (sizetype, length_factor),
3641 size_one_node);
3642 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3643 length_factor);
3646 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3647 gives the worst-case number of bytes covered by the segment. */
3649 static unsigned HOST_WIDE_INT
3650 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3652 stmt_vec_info stmt_vinfo = dr_info->stmt;
3653 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3654 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3655 unsigned HOST_WIDE_INT access_size = ref_size;
3656 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3658 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3659 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3661 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3662 int misalignment;
3663 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3664 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3665 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3666 == dr_explicit_realign_optimized))
3668 /* We might access a full vector's worth. */
3669 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3671 return access_size;
3674 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3675 describes. */
3677 static unsigned int
3678 vect_vfa_align (dr_vec_info *dr_info)
3680 return dr_alignment (dr_info->dr);
3683 /* Function vect_no_alias_p.
3685 Given data references A and B with equal base and offset, see whether
3686 the alias relation can be decided at compilation time. Return 1 if
3687 it can and the references alias, 0 if it can and the references do
3688 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3689 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3690 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3692 static int
3693 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3694 tree segment_length_a, tree segment_length_b,
3695 unsigned HOST_WIDE_INT access_size_a,
3696 unsigned HOST_WIDE_INT access_size_b)
3698 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3699 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3700 poly_uint64 const_length_a;
3701 poly_uint64 const_length_b;
3703 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3704 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3705 [a, a+12) */
3706 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3708 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3709 offset_a -= const_length_a;
3711 else
3712 const_length_a = tree_to_poly_uint64 (segment_length_a);
3713 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3715 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3716 offset_b -= const_length_b;
3718 else
3719 const_length_b = tree_to_poly_uint64 (segment_length_b);
3721 const_length_a += access_size_a;
3722 const_length_b += access_size_b;
3724 if (ranges_known_overlap_p (offset_a, const_length_a,
3725 offset_b, const_length_b))
3726 return 1;
3728 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3729 offset_b, const_length_b))
3730 return 0;
3732 return -1;
3735 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3736 in DDR is >= VF. */
3738 static bool
3739 dependence_distance_ge_vf (data_dependence_relation *ddr,
3740 unsigned int loop_depth, poly_uint64 vf)
3742 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3743 || DDR_NUM_DIST_VECTS (ddr) == 0)
3744 return false;
3746 /* If the dependence is exact, we should have limited the VF instead. */
3747 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3749 unsigned int i;
3750 lambda_vector dist_v;
3751 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3753 HOST_WIDE_INT dist = dist_v[loop_depth];
3754 if (dist != 0
3755 && !(dist > 0 && DDR_REVERSED_P (ddr))
3756 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3757 return false;
3760 if (dump_enabled_p ())
3761 dump_printf_loc (MSG_NOTE, vect_location,
3762 "dependence distance between %T and %T is >= VF\n",
3763 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3765 return true;
3768 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3770 static void
3771 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3773 dump_printf (dump_kind, "%s (%T) >= ",
3774 lower_bound.unsigned_p ? "unsigned" : "abs",
3775 lower_bound.expr);
3776 dump_dec (dump_kind, lower_bound.min_value);
3779 /* Record that the vectorized loop requires the vec_lower_bound described
3780 by EXPR, UNSIGNED_P and MIN_VALUE. */
3782 static void
3783 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3784 poly_uint64 min_value)
3786 vec<vec_lower_bound> &lower_bounds
3787 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3788 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3789 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3791 unsigned_p &= lower_bounds[i].unsigned_p;
3792 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3793 if (lower_bounds[i].unsigned_p != unsigned_p
3794 || maybe_lt (lower_bounds[i].min_value, min_value))
3796 lower_bounds[i].unsigned_p = unsigned_p;
3797 lower_bounds[i].min_value = min_value;
3798 if (dump_enabled_p ())
3800 dump_printf_loc (MSG_NOTE, vect_location,
3801 "updating run-time check to ");
3802 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3803 dump_printf (MSG_NOTE, "\n");
3806 return;
3809 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3810 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3813 dump_lower_bound (MSG_NOTE, lower_bound);
3814 dump_printf (MSG_NOTE, "\n");
3816 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3819 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3820 will span fewer than GAP bytes. */
3822 static bool
3823 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3824 poly_int64 gap)
3826 stmt_vec_info stmt_info = dr_info->stmt;
3827 HOST_WIDE_INT count
3828 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3829 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3830 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3831 return (estimated_poly_value (gap)
3832 <= count * vect_get_scalar_dr_size (dr_info));
3835 /* Return true if we know that there is no alias between DR_INFO_A and
3836 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3837 When returning true, set *LOWER_BOUND_OUT to this N. */
3839 static bool
3840 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3841 poly_uint64 *lower_bound_out)
3843 /* Check that there is a constant gap of known sign between DR_A
3844 and DR_B. */
3845 data_reference *dr_a = dr_info_a->dr;
3846 data_reference *dr_b = dr_info_b->dr;
3847 poly_int64 init_a, init_b;
3848 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3849 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3850 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3851 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3852 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3853 || !ordered_p (init_a, init_b))
3854 return false;
3856 /* Sort DR_A and DR_B by the address they access. */
3857 if (maybe_lt (init_b, init_a))
3859 std::swap (init_a, init_b);
3860 std::swap (dr_info_a, dr_info_b);
3861 std::swap (dr_a, dr_b);
3864 /* If the two accesses could be dependent within a scalar iteration,
3865 make sure that we'd retain their order. */
3866 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3867 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3868 return false;
3870 /* There is no alias if abs (DR_STEP) is greater than or equal to
3871 the bytes spanned by the combination of the two accesses. */
3872 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3873 return true;
3876 /* Function vect_prune_runtime_alias_test_list.
3878 Prune a list of ddrs to be tested at run-time by versioning for alias.
3879 Merge several alias checks into one if possible.
3880 Return FALSE if resulting list of ddrs is longer then allowed by
3881 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3883 opt_result
3884 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3886 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3887 hash_set <tree_pair_hash> compared_objects;
3889 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3890 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3891 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3892 const vec<vec_object_pair> &check_unequal_addrs
3893 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3894 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3895 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3897 ddr_p ddr;
3898 unsigned int i;
3899 tree length_factor;
3901 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3903 /* Step values are irrelevant for aliasing if the number of vector
3904 iterations is equal to the number of scalar iterations (which can
3905 happen for fully-SLP loops). */
3906 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3908 if (!vf_one_p)
3910 /* Convert the checks for nonzero steps into bound tests. */
3911 tree value;
3912 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3913 vect_check_lower_bound (loop_vinfo, value, true, 1);
3916 if (may_alias_ddrs.is_empty ())
3917 return opt_result::success ();
3919 comp_alias_ddrs.create (may_alias_ddrs.length ());
3921 unsigned int loop_depth
3922 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3923 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3925 /* First, we collect all data ref pairs for aliasing checks. */
3926 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3928 poly_uint64 lower_bound;
3929 tree segment_length_a, segment_length_b;
3930 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3931 unsigned int align_a, align_b;
3933 /* Ignore the alias if the VF we chose ended up being no greater
3934 than the dependence distance. */
3935 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3936 continue;
3938 if (DDR_OBJECT_A (ddr))
3940 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3941 if (!compared_objects.add (new_pair))
3943 if (dump_enabled_p ())
3944 dump_printf_loc (MSG_NOTE, vect_location,
3945 "checking that %T and %T"
3946 " have different addresses\n",
3947 new_pair.first, new_pair.second);
3948 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3950 continue;
3953 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3954 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3956 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3957 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3959 bool preserves_scalar_order_p
3960 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3961 bool ignore_step_p
3962 = (vf_one_p
3963 && (preserves_scalar_order_p
3964 || operand_equal_p (DR_STEP (dr_info_a->dr),
3965 DR_STEP (dr_info_b->dr))));
3967 /* Skip the pair if inter-iteration dependencies are irrelevant
3968 and intra-iteration dependencies are guaranteed to be honored. */
3969 if (ignore_step_p
3970 && (preserves_scalar_order_p
3971 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3972 &lower_bound)))
3974 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_NOTE, vect_location,
3976 "no need for alias check between "
3977 "%T and %T when VF is 1\n",
3978 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3979 continue;
3982 /* See whether we can handle the alias using a bounds check on
3983 the step, and whether that's likely to be the best approach.
3984 (It might not be, for example, if the minimum step is much larger
3985 than the number of bytes handled by one vector iteration.) */
3986 if (!ignore_step_p
3987 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3988 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3989 &lower_bound)
3990 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3991 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3993 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3994 if (dump_enabled_p ())
3996 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3997 "%T and %T when the step %T is outside ",
3998 DR_REF (dr_info_a->dr),
3999 DR_REF (dr_info_b->dr),
4000 DR_STEP (dr_info_a->dr));
4001 if (unsigned_p)
4002 dump_printf (MSG_NOTE, "[0");
4003 else
4005 dump_printf (MSG_NOTE, "(");
4006 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4008 dump_printf (MSG_NOTE, ", ");
4009 dump_dec (MSG_NOTE, lower_bound);
4010 dump_printf (MSG_NOTE, ")\n");
4012 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4013 unsigned_p, lower_bound);
4014 continue;
4017 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4018 if (dr_group_first_a)
4020 stmt_info_a = dr_group_first_a;
4021 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4024 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4025 if (dr_group_first_b)
4027 stmt_info_b = dr_group_first_b;
4028 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4031 if (ignore_step_p)
4033 segment_length_a = size_zero_node;
4034 segment_length_b = size_zero_node;
4036 else
4038 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4039 DR_STEP (dr_info_b->dr), 0))
4040 length_factor = scalar_loop_iters;
4041 else
4042 length_factor = size_int (vect_factor);
4043 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4044 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4046 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4047 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4048 align_a = vect_vfa_align (dr_info_a);
4049 align_b = vect_vfa_align (dr_info_b);
4051 /* See whether the alias is known at compilation time. */
4052 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4053 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4054 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4055 DR_OFFSET (dr_info_b->dr), 0)
4056 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4057 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4058 && poly_int_tree_p (segment_length_a)
4059 && poly_int_tree_p (segment_length_b))
4061 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4062 segment_length_a,
4063 segment_length_b,
4064 access_size_a,
4065 access_size_b);
4066 if (res >= 0 && dump_enabled_p ())
4068 dump_printf_loc (MSG_NOTE, vect_location,
4069 "can tell at compile time that %T and %T",
4070 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4071 if (res == 0)
4072 dump_printf (MSG_NOTE, " do not alias\n");
4073 else
4074 dump_printf (MSG_NOTE, " alias\n");
4077 if (res == 0)
4078 continue;
4080 if (res == 1)
4081 return opt_result::failure_at (stmt_info_b->stmt,
4082 "not vectorized:"
4083 " compilation time alias: %G%G",
4084 stmt_info_a->stmt,
4085 stmt_info_b->stmt);
4088 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4089 access_size_a, align_a);
4090 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4091 access_size_b, align_b);
4092 /* Canonicalize the order to be the one that's needed for accurate
4093 RAW, WAR and WAW flags, in cases where the data references are
4094 well-ordered. The order doesn't really matter otherwise,
4095 but we might as well be consistent. */
4096 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4097 std::swap (dr_a, dr_b);
4099 dr_with_seg_len_pair_t dr_with_seg_len_pair
4100 (dr_a, dr_b, (preserves_scalar_order_p
4101 ? dr_with_seg_len_pair_t::WELL_ORDERED
4102 : dr_with_seg_len_pair_t::REORDERED));
4104 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4107 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4109 unsigned int count = (comp_alias_ddrs.length ()
4110 + check_unequal_addrs.length ());
4112 if (count
4113 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4114 == VECT_COST_MODEL_VERY_CHEAP))
4115 return opt_result::failure_at
4116 (vect_location, "would need a runtime alias check\n");
4118 if (dump_enabled_p ())
4119 dump_printf_loc (MSG_NOTE, vect_location,
4120 "improved number of alias checks from %d to %d\n",
4121 may_alias_ddrs.length (), count);
4122 unsigned limit = param_vect_max_version_for_alias_checks;
4123 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4124 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4125 if (count > limit)
4126 return opt_result::failure_at
4127 (vect_location,
4128 "number of versioning for alias run-time tests exceeds %d "
4129 "(--param vect-max-version-for-alias-checks)\n", limit);
4131 return opt_result::success ();
4134 /* Check whether we can use an internal function for a gather load
4135 or scatter store. READ_P is true for loads and false for stores.
4136 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4137 the type of the memory elements being loaded or stored. OFFSET_TYPE
4138 is the type of the offset that is being applied to the invariant
4139 base address. SCALE is the amount by which the offset should
4140 be multiplied *after* it has been converted to address width.
4142 Return true if the function is supported, storing the function id in
4143 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
4145 bool
4146 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4147 tree vectype, tree memory_type, tree offset_type,
4148 int scale, internal_fn *ifn_out,
4149 tree *offset_vectype_out)
4151 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4152 unsigned int element_bits = vector_element_bits (vectype);
4153 if (element_bits != memory_bits)
4154 /* For now the vector elements must be the same width as the
4155 memory elements. */
4156 return false;
4158 /* Work out which function we need. */
4159 internal_fn ifn, alt_ifn, alt_ifn2;
4160 if (read_p)
4162 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4163 alt_ifn = IFN_MASK_GATHER_LOAD;
4164 /* When target supports MASK_LEN_GATHER_LOAD, we always
4165 use MASK_LEN_GATHER_LOAD regardless whether len and
4166 mask are valid or not. */
4167 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4169 else
4171 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4172 alt_ifn = IFN_MASK_SCATTER_STORE;
4173 /* When target supports MASK_LEN_SCATTER_STORE, we always
4174 use MASK_LEN_SCATTER_STORE regardless whether len and
4175 mask are valid or not. */
4176 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4179 for (;;)
4181 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4182 if (!offset_vectype)
4183 return false;
4185 /* Test whether the target supports this combination. */
4186 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4187 offset_vectype, scale))
4189 *ifn_out = ifn;
4190 *offset_vectype_out = offset_vectype;
4191 return true;
4193 else if (!masked_p
4194 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4195 memory_type,
4196 offset_vectype,
4197 scale))
4199 *ifn_out = alt_ifn;
4200 *offset_vectype_out = offset_vectype;
4201 return true;
4203 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4204 memory_type,
4205 offset_vectype, scale))
4207 *ifn_out = alt_ifn2;
4208 *offset_vectype_out = offset_vectype;
4209 return true;
4212 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4213 && TYPE_PRECISION (offset_type) >= element_bits)
4214 return false;
4216 offset_type = build_nonstandard_integer_type
4217 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4221 /* STMT_INFO is a call to an internal gather load or scatter store function.
4222 Describe the operation in INFO. */
4224 static void
4225 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4226 gather_scatter_info *info)
4228 gcall *call = as_a <gcall *> (stmt_info->stmt);
4229 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4230 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4232 info->ifn = gimple_call_internal_fn (call);
4233 info->decl = NULL_TREE;
4234 info->base = gimple_call_arg (call, 0);
4235 info->offset = gimple_call_arg (call, 1);
4236 info->offset_dt = vect_unknown_def_type;
4237 info->offset_vectype = NULL_TREE;
4238 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4239 info->element_type = TREE_TYPE (vectype);
4240 info->memory_type = TREE_TYPE (DR_REF (dr));
4243 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4244 gather load or scatter store. Describe the operation in *INFO if so. */
4246 bool
4247 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4248 gather_scatter_info *info)
4250 HOST_WIDE_INT scale = 1;
4251 poly_int64 pbitpos, pbitsize;
4252 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4253 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4254 tree offtype = NULL_TREE;
4255 tree decl = NULL_TREE, base, off;
4256 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4257 tree memory_type = TREE_TYPE (DR_REF (dr));
4258 machine_mode pmode;
4259 int punsignedp, reversep, pvolatilep = 0;
4260 internal_fn ifn;
4261 tree offset_vectype;
4262 bool masked_p = false;
4264 /* See whether this is already a call to a gather/scatter internal function.
4265 If not, see whether it's a masked load or store. */
4266 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4267 if (call && gimple_call_internal_p (call))
4269 ifn = gimple_call_internal_fn (call);
4270 if (internal_gather_scatter_fn_p (ifn))
4272 vect_describe_gather_scatter_call (stmt_info, info);
4273 return true;
4275 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4278 /* True if we should aim to use internal functions rather than
4279 built-in functions. */
4280 bool use_ifn_p = (DR_IS_READ (dr)
4281 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4282 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4284 base = DR_REF (dr);
4285 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4286 see if we can use the def stmt of the address. */
4287 if (masked_p
4288 && TREE_CODE (base) == MEM_REF
4289 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4290 && integer_zerop (TREE_OPERAND (base, 1))
4291 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4293 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4294 if (is_gimple_assign (def_stmt)
4295 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4296 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4299 /* The gather and scatter builtins need address of the form
4300 loop_invariant + vector * {1, 2, 4, 8}
4302 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4303 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4304 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4305 multiplications and additions in it. To get a vector, we need
4306 a single SSA_NAME that will be defined in the loop and will
4307 contain everything that is not loop invariant and that can be
4308 vectorized. The following code attempts to find such a preexistng
4309 SSA_NAME OFF and put the loop invariants into a tree BASE
4310 that can be gimplified before the loop. */
4311 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4312 &punsignedp, &reversep, &pvolatilep);
4313 if (reversep)
4314 return false;
4316 /* PR 107346. Packed structs can have fields at offsets that are not
4317 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4318 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4319 return false;
4321 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4323 if (TREE_CODE (base) == MEM_REF)
4325 if (!integer_zerop (TREE_OPERAND (base, 1)))
4327 if (off == NULL_TREE)
4328 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4329 else
4330 off = size_binop (PLUS_EXPR, off,
4331 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4333 base = TREE_OPERAND (base, 0);
4335 else
4336 base = build_fold_addr_expr (base);
4338 if (off == NULL_TREE)
4339 off = size_zero_node;
4341 /* If base is not loop invariant, either off is 0, then we start with just
4342 the constant offset in the loop invariant BASE and continue with base
4343 as OFF, otherwise give up.
4344 We could handle that case by gimplifying the addition of base + off
4345 into some SSA_NAME and use that as off, but for now punt. */
4346 if (!expr_invariant_in_loop_p (loop, base))
4348 if (!integer_zerop (off))
4349 return false;
4350 off = base;
4351 base = size_int (pbytepos);
4353 /* Otherwise put base + constant offset into the loop invariant BASE
4354 and continue with OFF. */
4355 else
4357 base = fold_convert (sizetype, base);
4358 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4361 /* OFF at this point may be either a SSA_NAME or some tree expression
4362 from get_inner_reference. Try to peel off loop invariants from it
4363 into BASE as long as possible. */
4364 STRIP_NOPS (off);
4365 while (offtype == NULL_TREE)
4367 enum tree_code code;
4368 tree op0, op1, add = NULL_TREE;
4370 if (TREE_CODE (off) == SSA_NAME)
4372 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4374 if (expr_invariant_in_loop_p (loop, off))
4375 return false;
4377 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4378 break;
4380 op0 = gimple_assign_rhs1 (def_stmt);
4381 code = gimple_assign_rhs_code (def_stmt);
4382 op1 = gimple_assign_rhs2 (def_stmt);
4384 else
4386 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4387 return false;
4388 code = TREE_CODE (off);
4389 extract_ops_from_tree (off, &code, &op0, &op1);
4391 switch (code)
4393 case POINTER_PLUS_EXPR:
4394 case PLUS_EXPR:
4395 if (expr_invariant_in_loop_p (loop, op0))
4397 add = op0;
4398 off = op1;
4399 do_add:
4400 add = fold_convert (sizetype, add);
4401 if (scale != 1)
4402 add = size_binop (MULT_EXPR, add, size_int (scale));
4403 base = size_binop (PLUS_EXPR, base, add);
4404 continue;
4406 if (expr_invariant_in_loop_p (loop, op1))
4408 add = op1;
4409 off = op0;
4410 goto do_add;
4412 break;
4413 case MINUS_EXPR:
4414 if (expr_invariant_in_loop_p (loop, op1))
4416 add = fold_convert (sizetype, op1);
4417 add = size_binop (MINUS_EXPR, size_zero_node, add);
4418 off = op0;
4419 goto do_add;
4421 break;
4422 case MULT_EXPR:
4423 if (scale == 1 && tree_fits_shwi_p (op1))
4425 int new_scale = tree_to_shwi (op1);
4426 /* Only treat this as a scaling operation if the target
4427 supports it for at least some offset type. */
4428 if (use_ifn_p
4429 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4430 masked_p, vectype, memory_type,
4431 signed_char_type_node,
4432 new_scale, &ifn,
4433 &offset_vectype)
4434 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4435 masked_p, vectype, memory_type,
4436 unsigned_char_type_node,
4437 new_scale, &ifn,
4438 &offset_vectype))
4439 break;
4440 scale = new_scale;
4441 off = op0;
4442 continue;
4444 break;
4445 case SSA_NAME:
4446 off = op0;
4447 continue;
4448 CASE_CONVERT:
4449 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4450 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4451 break;
4453 /* Don't include the conversion if the target is happy with
4454 the current offset type. */
4455 if (use_ifn_p
4456 && TREE_CODE (off) == SSA_NAME
4457 && !POINTER_TYPE_P (TREE_TYPE (off))
4458 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4459 masked_p, vectype, memory_type,
4460 TREE_TYPE (off), scale, &ifn,
4461 &offset_vectype))
4462 break;
4464 if (TYPE_PRECISION (TREE_TYPE (op0))
4465 == TYPE_PRECISION (TREE_TYPE (off)))
4467 off = op0;
4468 continue;
4471 /* Include the conversion if it is widening and we're using
4472 the IFN path or the target can handle the converted from
4473 offset or the current size is not already the same as the
4474 data vector element size. */
4475 if ((TYPE_PRECISION (TREE_TYPE (op0))
4476 < TYPE_PRECISION (TREE_TYPE (off)))
4477 && (use_ifn_p
4478 || (DR_IS_READ (dr)
4479 ? (targetm.vectorize.builtin_gather
4480 && targetm.vectorize.builtin_gather (vectype,
4481 TREE_TYPE (op0),
4482 scale))
4483 : (targetm.vectorize.builtin_scatter
4484 && targetm.vectorize.builtin_scatter (vectype,
4485 TREE_TYPE (op0),
4486 scale)))
4487 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4488 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4490 off = op0;
4491 offtype = TREE_TYPE (off);
4492 STRIP_NOPS (off);
4493 continue;
4495 break;
4496 default:
4497 break;
4499 break;
4502 /* If at the end OFF still isn't a SSA_NAME or isn't
4503 defined in the loop, punt. */
4504 if (TREE_CODE (off) != SSA_NAME
4505 || expr_invariant_in_loop_p (loop, off))
4506 return false;
4508 if (offtype == NULL_TREE)
4509 offtype = TREE_TYPE (off);
4511 if (use_ifn_p)
4513 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4514 vectype, memory_type, offtype, scale,
4515 &ifn, &offset_vectype))
4516 ifn = IFN_LAST;
4517 decl = NULL_TREE;
4519 else
4521 if (DR_IS_READ (dr))
4523 if (targetm.vectorize.builtin_gather)
4524 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4526 else
4528 if (targetm.vectorize.builtin_scatter)
4529 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4531 ifn = IFN_LAST;
4532 /* The offset vector type will be read from DECL when needed. */
4533 offset_vectype = NULL_TREE;
4536 info->ifn = ifn;
4537 info->decl = decl;
4538 info->base = base;
4539 info->offset = off;
4540 info->offset_dt = vect_unknown_def_type;
4541 info->offset_vectype = offset_vectype;
4542 info->scale = scale;
4543 info->element_type = TREE_TYPE (vectype);
4544 info->memory_type = memory_type;
4545 return true;
4548 /* Find the data references in STMT, analyze them with respect to LOOP and
4549 append them to DATAREFS. Return false if datarefs in this stmt cannot
4550 be handled. */
4552 opt_result
4553 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4554 vec<data_reference_p> *datarefs,
4555 vec<int> *dataref_groups, int group_id)
4557 /* We can ignore clobbers for dataref analysis - they are removed during
4558 loop vectorization and BB vectorization checks dependences with a
4559 stmt walk. */
4560 if (gimple_clobber_p (stmt))
4561 return opt_result::success ();
4563 if (gimple_has_volatile_ops (stmt))
4564 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4565 stmt);
4567 if (stmt_can_throw_internal (cfun, stmt))
4568 return opt_result::failure_at (stmt,
4569 "not vectorized:"
4570 " statement can throw an exception: %G",
4571 stmt);
4573 auto_vec<data_reference_p, 2> refs;
4574 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4575 if (!res)
4576 return res;
4578 if (refs.is_empty ())
4579 return opt_result::success ();
4581 if (refs.length () > 1)
4583 while (!refs.is_empty ())
4584 free_data_ref (refs.pop ());
4585 return opt_result::failure_at (stmt,
4586 "not vectorized: more than one "
4587 "data ref in stmt: %G", stmt);
4590 data_reference_p dr = refs.pop ();
4591 if (gcall *call = dyn_cast <gcall *> (stmt))
4592 if (!gimple_call_internal_p (call)
4593 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4594 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4596 free_data_ref (dr);
4597 return opt_result::failure_at (stmt,
4598 "not vectorized: dr in a call %G", stmt);
4601 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4602 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4604 free_data_ref (dr);
4605 return opt_result::failure_at (stmt,
4606 "not vectorized:"
4607 " statement is an unsupported"
4608 " bitfield access %G", stmt);
4611 if (DR_BASE_ADDRESS (dr)
4612 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4614 free_data_ref (dr);
4615 return opt_result::failure_at (stmt,
4616 "not vectorized:"
4617 " base addr of dr is a constant\n");
4620 /* Check whether this may be a SIMD lane access and adjust the
4621 DR to make it easier for us to handle it. */
4622 if (loop
4623 && loop->simduid
4624 && (!DR_BASE_ADDRESS (dr)
4625 || !DR_OFFSET (dr)
4626 || !DR_INIT (dr)
4627 || !DR_STEP (dr)))
4629 struct data_reference *newdr
4630 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4631 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4632 if (DR_BASE_ADDRESS (newdr)
4633 && DR_OFFSET (newdr)
4634 && DR_INIT (newdr)
4635 && DR_STEP (newdr)
4636 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4637 && integer_zerop (DR_STEP (newdr)))
4639 tree base_address = DR_BASE_ADDRESS (newdr);
4640 tree off = DR_OFFSET (newdr);
4641 tree step = ssize_int (1);
4642 if (integer_zerop (off)
4643 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4645 off = TREE_OPERAND (base_address, 1);
4646 base_address = TREE_OPERAND (base_address, 0);
4648 STRIP_NOPS (off);
4649 if (TREE_CODE (off) == MULT_EXPR
4650 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4652 step = TREE_OPERAND (off, 1);
4653 off = TREE_OPERAND (off, 0);
4654 STRIP_NOPS (off);
4656 if (CONVERT_EXPR_P (off)
4657 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4658 < TYPE_PRECISION (TREE_TYPE (off))))
4659 off = TREE_OPERAND (off, 0);
4660 if (TREE_CODE (off) == SSA_NAME)
4662 gimple *def = SSA_NAME_DEF_STMT (off);
4663 /* Look through widening conversion. */
4664 if (is_gimple_assign (def)
4665 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4667 tree rhs1 = gimple_assign_rhs1 (def);
4668 if (TREE_CODE (rhs1) == SSA_NAME
4669 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4670 && (TYPE_PRECISION (TREE_TYPE (off))
4671 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4672 def = SSA_NAME_DEF_STMT (rhs1);
4674 if (is_gimple_call (def)
4675 && gimple_call_internal_p (def)
4676 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4678 tree arg = gimple_call_arg (def, 0);
4679 tree reft = TREE_TYPE (DR_REF (newdr));
4680 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4681 arg = SSA_NAME_VAR (arg);
4682 if (arg == loop->simduid
4683 /* For now. */
4684 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4686 DR_BASE_ADDRESS (newdr) = base_address;
4687 DR_OFFSET (newdr) = ssize_int (0);
4688 DR_STEP (newdr) = step;
4689 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4690 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4691 /* Mark as simd-lane access. */
4692 tree arg2 = gimple_call_arg (def, 1);
4693 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4694 free_data_ref (dr);
4695 datarefs->safe_push (newdr);
4696 if (dataref_groups)
4697 dataref_groups->safe_push (group_id);
4698 return opt_result::success ();
4703 free_data_ref (newdr);
4706 datarefs->safe_push (dr);
4707 if (dataref_groups)
4708 dataref_groups->safe_push (group_id);
4709 return opt_result::success ();
4712 /* Function vect_analyze_data_refs.
4714 Find all the data references in the loop or basic block.
4716 The general structure of the analysis of data refs in the vectorizer is as
4717 follows:
4718 1- vect_analyze_data_refs(loop/bb): call
4719 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4720 in the loop/bb and their dependences.
4721 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4722 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4723 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4727 opt_result
4728 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4730 class loop *loop = NULL;
4731 unsigned int i;
4732 struct data_reference *dr;
4733 tree scalar_type;
4735 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4737 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4738 loop = LOOP_VINFO_LOOP (loop_vinfo);
4740 /* Go through the data-refs, check that the analysis succeeded. Update
4741 pointer from stmt_vec_info struct to DR and vectype. */
4743 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4744 FOR_EACH_VEC_ELT (datarefs, i, dr)
4746 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4747 poly_uint64 vf;
4749 gcc_assert (DR_REF (dr));
4750 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4751 gcc_assert (!stmt_info->dr_aux.dr);
4752 stmt_info->dr_aux.dr = dr;
4753 stmt_info->dr_aux.stmt = stmt_info;
4755 /* Check that analysis of the data-ref succeeded. */
4756 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4757 || !DR_STEP (dr))
4759 bool maybe_gather
4760 = DR_IS_READ (dr)
4761 && !TREE_THIS_VOLATILE (DR_REF (dr));
4762 bool maybe_scatter
4763 = DR_IS_WRITE (dr)
4764 && !TREE_THIS_VOLATILE (DR_REF (dr));
4766 /* If target supports vector gather loads or scatter stores,
4767 see if they can't be used. */
4768 if (is_a <loop_vec_info> (vinfo)
4769 && !nested_in_vect_loop_p (loop, stmt_info))
4771 if (maybe_gather || maybe_scatter)
4773 if (maybe_gather)
4774 gatherscatter = GATHER;
4775 else
4776 gatherscatter = SCATTER;
4780 if (gatherscatter == SG_NONE)
4782 if (dump_enabled_p ())
4783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4784 "not vectorized: data ref analysis "
4785 "failed %G", stmt_info->stmt);
4786 if (is_a <bb_vec_info> (vinfo))
4788 /* In BB vectorization the ref can still participate
4789 in dependence analysis, we just can't vectorize it. */
4790 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4791 continue;
4793 return opt_result::failure_at (stmt_info->stmt,
4794 "not vectorized:"
4795 " data ref analysis failed: %G",
4796 stmt_info->stmt);
4800 /* See if this was detected as SIMD lane access. */
4801 if (dr->aux == (void *)-1
4802 || dr->aux == (void *)-2
4803 || dr->aux == (void *)-3
4804 || dr->aux == (void *)-4)
4806 if (nested_in_vect_loop_p (loop, stmt_info))
4807 return opt_result::failure_at (stmt_info->stmt,
4808 "not vectorized:"
4809 " data ref analysis failed: %G",
4810 stmt_info->stmt);
4811 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4812 = -(uintptr_t) dr->aux;
4815 tree base = get_base_address (DR_REF (dr));
4816 if (base && VAR_P (base) && DECL_NONALIASED (base))
4818 if (dump_enabled_p ())
4819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4820 "not vectorized: base object not addressable "
4821 "for stmt: %G", stmt_info->stmt);
4822 if (is_a <bb_vec_info> (vinfo))
4824 /* In BB vectorization the ref can still participate
4825 in dependence analysis, we just can't vectorize it. */
4826 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4827 continue;
4829 return opt_result::failure_at (stmt_info->stmt,
4830 "not vectorized: base object not"
4831 " addressable for stmt: %G",
4832 stmt_info->stmt);
4835 if (is_a <loop_vec_info> (vinfo)
4836 && DR_STEP (dr)
4837 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4839 if (nested_in_vect_loop_p (loop, stmt_info))
4840 return opt_result::failure_at (stmt_info->stmt,
4841 "not vectorized: "
4842 "not suitable for strided load %G",
4843 stmt_info->stmt);
4844 STMT_VINFO_STRIDED_P (stmt_info) = true;
4847 /* Update DR field in stmt_vec_info struct. */
4849 /* If the dataref is in an inner-loop of the loop that is considered for
4850 for vectorization, we also want to analyze the access relative to
4851 the outer-loop (DR contains information only relative to the
4852 inner-most enclosing loop). We do that by building a reference to the
4853 first location accessed by the inner-loop, and analyze it relative to
4854 the outer-loop. */
4855 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4857 /* Build a reference to the first location accessed by the
4858 inner loop: *(BASE + INIT + OFFSET). By construction,
4859 this address must be invariant in the inner loop, so we
4860 can consider it as being used in the outer loop. */
4861 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4862 tree offset = unshare_expr (DR_OFFSET (dr));
4863 tree init = unshare_expr (DR_INIT (dr));
4864 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4865 init, offset);
4866 tree init_addr = fold_build_pointer_plus (base, init_offset);
4867 tree init_ref = build_fold_indirect_ref (init_addr);
4869 if (dump_enabled_p ())
4870 dump_printf_loc (MSG_NOTE, vect_location,
4871 "analyze in outer loop: %T\n", init_ref);
4873 opt_result res
4874 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4875 init_ref, loop, stmt_info->stmt);
4876 if (!res)
4877 /* dr_analyze_innermost already explained the failure. */
4878 return res;
4880 if (dump_enabled_p ())
4881 dump_printf_loc (MSG_NOTE, vect_location,
4882 "\touter base_address: %T\n"
4883 "\touter offset from base address: %T\n"
4884 "\touter constant offset from base address: %T\n"
4885 "\touter step: %T\n"
4886 "\touter base alignment: %d\n\n"
4887 "\touter base misalignment: %d\n"
4888 "\touter offset alignment: %d\n"
4889 "\touter step alignment: %d\n",
4890 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4891 STMT_VINFO_DR_OFFSET (stmt_info),
4892 STMT_VINFO_DR_INIT (stmt_info),
4893 STMT_VINFO_DR_STEP (stmt_info),
4894 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4895 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4896 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4897 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4900 /* Set vectype for STMT. */
4901 scalar_type = TREE_TYPE (DR_REF (dr));
4902 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4903 if (!vectype)
4905 if (dump_enabled_p ())
4907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4908 "not vectorized: no vectype for stmt: %G",
4909 stmt_info->stmt);
4910 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4911 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4912 scalar_type);
4913 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4916 if (is_a <bb_vec_info> (vinfo))
4918 /* No vector type is fine, the ref can still participate
4919 in dependence analysis, we just can't vectorize it. */
4920 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4921 continue;
4923 if (fatal)
4924 *fatal = false;
4925 return opt_result::failure_at (stmt_info->stmt,
4926 "not vectorized:"
4927 " no vectype for stmt: %G"
4928 " scalar_type: %T\n",
4929 stmt_info->stmt, scalar_type);
4931 else
4933 if (dump_enabled_p ())
4934 dump_printf_loc (MSG_NOTE, vect_location,
4935 "got vectype for stmt: %G%T\n",
4936 stmt_info->stmt, vectype);
4939 /* Adjust the minimal vectorization factor according to the
4940 vector type. */
4941 vf = TYPE_VECTOR_SUBPARTS (vectype);
4942 *min_vf = upper_bound (*min_vf, vf);
4944 /* Leave the BB vectorizer to pick the vector type later, based on
4945 the final dataref group size and SLP node size. */
4946 if (is_a <loop_vec_info> (vinfo))
4947 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4949 if (gatherscatter != SG_NONE)
4951 gather_scatter_info gs_info;
4952 if (!vect_check_gather_scatter (stmt_info,
4953 as_a <loop_vec_info> (vinfo),
4954 &gs_info)
4955 || !get_vectype_for_scalar_type (vinfo,
4956 TREE_TYPE (gs_info.offset)))
4958 if (fatal)
4959 *fatal = false;
4960 return opt_result::failure_at
4961 (stmt_info->stmt,
4962 (gatherscatter == GATHER)
4963 ? "not vectorized: not suitable for gather load %G"
4964 : "not vectorized: not suitable for scatter store %G",
4965 stmt_info->stmt);
4967 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4971 /* We used to stop processing and prune the list here. Verify we no
4972 longer need to. */
4973 gcc_assert (i == datarefs.length ());
4975 return opt_result::success ();
4979 /* Function vect_get_new_vect_var.
4981 Returns a name for a new variable. The current naming scheme appends the
4982 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4983 the name of vectorizer generated variables, and appends that to NAME if
4984 provided. */
4986 tree
4987 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4989 const char *prefix;
4990 tree new_vect_var;
4992 switch (var_kind)
4994 case vect_simple_var:
4995 prefix = "vect";
4996 break;
4997 case vect_scalar_var:
4998 prefix = "stmp";
4999 break;
5000 case vect_mask_var:
5001 prefix = "mask";
5002 break;
5003 case vect_pointer_var:
5004 prefix = "vectp";
5005 break;
5006 default:
5007 gcc_unreachable ();
5010 if (name)
5012 char* tmp = concat (prefix, "_", name, NULL);
5013 new_vect_var = create_tmp_reg (type, tmp);
5014 free (tmp);
5016 else
5017 new_vect_var = create_tmp_reg (type, prefix);
5019 return new_vect_var;
5022 /* Like vect_get_new_vect_var but return an SSA name. */
5024 tree
5025 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5027 const char *prefix;
5028 tree new_vect_var;
5030 switch (var_kind)
5032 case vect_simple_var:
5033 prefix = "vect";
5034 break;
5035 case vect_scalar_var:
5036 prefix = "stmp";
5037 break;
5038 case vect_pointer_var:
5039 prefix = "vectp";
5040 break;
5041 default:
5042 gcc_unreachable ();
5045 if (name)
5047 char* tmp = concat (prefix, "_", name, NULL);
5048 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5049 free (tmp);
5051 else
5052 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5054 return new_vect_var;
5057 /* Duplicate points-to info on NAME from DR_INFO. */
5059 static void
5060 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5062 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5063 /* DR_PTR_INFO is for a base SSA name, not including constant or
5064 variable offsets in the ref so its alignment info does not apply. */
5065 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5068 /* Function vect_create_addr_base_for_vector_ref.
5070 Create an expression that computes the address of the first memory location
5071 that will be accessed for a data reference.
5073 Input:
5074 STMT_INFO: The statement containing the data reference.
5075 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5076 OFFSET: Optional. If supplied, it is be added to the initial address.
5077 LOOP: Specify relative to which loop-nest should the address be computed.
5078 For example, when the dataref is in an inner-loop nested in an
5079 outer-loop that is now being vectorized, LOOP can be either the
5080 outer-loop, or the inner-loop. The first memory location accessed
5081 by the following dataref ('in' points to short):
5083 for (i=0; i<N; i++)
5084 for (j=0; j<M; j++)
5085 s += in[i+j]
5087 is as follows:
5088 if LOOP=i_loop: &in (relative to i_loop)
5089 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5091 Output:
5092 1. Return an SSA_NAME whose value is the address of the memory location of
5093 the first vector of the data reference.
5094 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5095 these statement(s) which define the returned SSA_NAME.
5097 FORNOW: We are only handling array accesses with step 1. */
5099 tree
5100 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5101 gimple_seq *new_stmt_list,
5102 tree offset)
5104 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5105 struct data_reference *dr = dr_info->dr;
5106 const char *base_name;
5107 tree addr_base;
5108 tree dest;
5109 gimple_seq seq = NULL;
5110 tree vect_ptr_type;
5111 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5112 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5114 tree data_ref_base = unshare_expr (drb->base_address);
5115 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5116 tree init = unshare_expr (drb->init);
5118 if (loop_vinfo)
5119 base_name = get_name (data_ref_base);
5120 else
5122 base_offset = ssize_int (0);
5123 init = ssize_int (0);
5124 base_name = get_name (DR_REF (dr));
5127 /* Create base_offset */
5128 base_offset = size_binop (PLUS_EXPR,
5129 fold_convert (sizetype, base_offset),
5130 fold_convert (sizetype, init));
5132 if (offset)
5134 offset = fold_convert (sizetype, offset);
5135 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5136 base_offset, offset);
5139 /* base + base_offset */
5140 if (loop_vinfo)
5141 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5142 else
5143 addr_base = build1 (ADDR_EXPR,
5144 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5145 /* Strip zero offset components since we don't need
5146 them and they can confuse late diagnostics if
5147 we CSE them wrongly. See PR106904 for example. */
5148 unshare_expr (strip_zero_offset_components
5149 (DR_REF (dr))));
5151 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5152 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5153 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5154 gimple_seq_add_seq (new_stmt_list, seq);
5156 if (DR_PTR_INFO (dr)
5157 && TREE_CODE (addr_base) == SSA_NAME
5158 /* We should only duplicate pointer info to newly created SSA names. */
5159 && SSA_NAME_VAR (addr_base) == dest)
5161 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5162 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5165 if (dump_enabled_p ())
5166 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5168 return addr_base;
5172 /* Function vect_create_data_ref_ptr.
5174 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5175 location accessed in the loop by STMT_INFO, along with the def-use update
5176 chain to appropriately advance the pointer through the loop iterations.
5177 Also set aliasing information for the pointer. This pointer is used by
5178 the callers to this function to create a memory reference expression for
5179 vector load/store access.
5181 Input:
5182 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5183 GIMPLE_ASSIGN <name, data-ref> or
5184 GIMPLE_ASSIGN <data-ref, name>.
5185 2. AGGR_TYPE: the type of the reference, which should be either a vector
5186 or an array.
5187 3. AT_LOOP: the loop where the vector memref is to be created.
5188 4. OFFSET (optional): a byte offset to be added to the initial address
5189 accessed by the data-ref in STMT_INFO.
5190 5. BSI: location where the new stmts are to be placed if there is no loop
5191 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5192 pointing to the initial address.
5193 8. IV_STEP (optional, defaults to NULL): the amount that should be added
5194 to the IV during each iteration of the loop. NULL says to move
5195 by one copy of AGGR_TYPE up or down, depending on the step of the
5196 data reference.
5198 Output:
5199 1. Declare a new ptr to vector_type, and have it point to the base of the
5200 data reference (initial addressed accessed by the data reference).
5201 For example, for vector of type V8HI, the following code is generated:
5203 v8hi *ap;
5204 ap = (v8hi *)initial_address;
5206 if OFFSET is not supplied:
5207 initial_address = &a[init];
5208 if OFFSET is supplied:
5209 initial_address = &a[init] + OFFSET;
5210 if BYTE_OFFSET is supplied:
5211 initial_address = &a[init] + BYTE_OFFSET;
5213 Return the initial_address in INITIAL_ADDRESS.
5215 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5216 update the pointer in each iteration of the loop.
5218 Return the increment stmt that updates the pointer in PTR_INCR.
5220 3. Return the pointer. */
5222 tree
5223 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5224 tree aggr_type, class loop *at_loop, tree offset,
5225 tree *initial_address, gimple_stmt_iterator *gsi,
5226 gimple **ptr_incr, bool only_init,
5227 tree iv_step)
5229 const char *base_name;
5230 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5231 class loop *loop = NULL;
5232 bool nested_in_vect_loop = false;
5233 class loop *containing_loop = NULL;
5234 tree aggr_ptr_type;
5235 tree aggr_ptr;
5236 tree new_temp;
5237 gimple_seq new_stmt_list = NULL;
5238 edge pe = NULL;
5239 basic_block new_bb;
5240 tree aggr_ptr_init;
5241 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5242 struct data_reference *dr = dr_info->dr;
5243 tree aptr;
5244 gimple_stmt_iterator incr_gsi;
5245 bool insert_after;
5246 tree indx_before_incr, indx_after_incr;
5247 gimple *incr;
5248 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5250 gcc_assert (iv_step != NULL_TREE
5251 || TREE_CODE (aggr_type) == ARRAY_TYPE
5252 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5254 if (loop_vinfo)
5256 loop = LOOP_VINFO_LOOP (loop_vinfo);
5257 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5258 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5259 pe = loop_preheader_edge (loop);
5261 else
5263 gcc_assert (bb_vinfo);
5264 only_init = true;
5265 *ptr_incr = NULL;
5268 /* Create an expression for the first address accessed by this load
5269 in LOOP. */
5270 base_name = get_name (DR_BASE_ADDRESS (dr));
5272 if (dump_enabled_p ())
5274 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5275 dump_printf_loc (MSG_NOTE, vect_location,
5276 "create %s-pointer variable to type: %T",
5277 get_tree_code_name (TREE_CODE (aggr_type)),
5278 aggr_type);
5279 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5280 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5281 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5282 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5283 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5284 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5285 else
5286 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5287 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5290 /* (1) Create the new aggregate-pointer variable.
5291 Vector and array types inherit the alias set of their component
5292 type by default so we need to use a ref-all pointer if the data
5293 reference does not conflict with the created aggregated data
5294 reference because it is not addressable. */
5295 bool need_ref_all = false;
5296 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5297 get_alias_set (DR_REF (dr))))
5298 need_ref_all = true;
5299 /* Likewise for any of the data references in the stmt group. */
5300 else if (DR_GROUP_SIZE (stmt_info) > 1)
5302 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5305 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5306 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5307 get_alias_set (DR_REF (sdr))))
5309 need_ref_all = true;
5310 break;
5312 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5314 while (sinfo);
5316 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5317 need_ref_all);
5318 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5321 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5322 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5323 def-use update cycles for the pointer: one relative to the outer-loop
5324 (LOOP), which is what steps (3) and (4) below do. The other is relative
5325 to the inner-loop (which is the inner-most loop containing the dataref),
5326 and this is done be step (5) below.
5328 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5329 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5330 redundant. Steps (3),(4) create the following:
5332 vp0 = &base_addr;
5333 LOOP: vp1 = phi(vp0,vp2)
5336 vp2 = vp1 + step
5337 goto LOOP
5339 If there is an inner-loop nested in loop, then step (5) will also be
5340 applied, and an additional update in the inner-loop will be created:
5342 vp0 = &base_addr;
5343 LOOP: vp1 = phi(vp0,vp2)
5345 inner: vp3 = phi(vp1,vp4)
5346 vp4 = vp3 + inner_step
5347 if () goto inner
5349 vp2 = vp1 + step
5350 if () goto LOOP */
5352 /* (2) Calculate the initial address of the aggregate-pointer, and set
5353 the aggregate-pointer to point to it before the loop. */
5355 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5357 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5358 stmt_info, &new_stmt_list,
5359 offset);
5360 if (new_stmt_list)
5362 if (pe)
5364 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5365 gcc_assert (!new_bb);
5367 else
5368 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5371 *initial_address = new_temp;
5372 aggr_ptr_init = new_temp;
5374 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5375 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5376 inner-loop nested in LOOP (during outer-loop vectorization). */
5378 /* No update in loop is required. */
5379 if (only_init && (!loop_vinfo || at_loop == loop))
5380 aptr = aggr_ptr_init;
5381 else
5383 /* Accesses to invariant addresses should be handled specially
5384 by the caller. */
5385 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5386 gcc_assert (!integer_zerop (step));
5388 if (iv_step == NULL_TREE)
5390 /* The step of the aggregate pointer is the type size,
5391 negated for downward accesses. */
5392 iv_step = TYPE_SIZE_UNIT (aggr_type);
5393 if (tree_int_cst_sgn (step) == -1)
5394 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5397 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5399 create_iv (aggr_ptr_init, PLUS_EXPR,
5400 fold_convert (aggr_ptr_type, iv_step),
5401 aggr_ptr, loop, &incr_gsi, insert_after,
5402 &indx_before_incr, &indx_after_incr);
5403 incr = gsi_stmt (incr_gsi);
5405 /* Copy the points-to information if it exists. */
5406 if (DR_PTR_INFO (dr))
5408 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5409 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5411 if (ptr_incr)
5412 *ptr_incr = incr;
5414 aptr = indx_before_incr;
5417 if (!nested_in_vect_loop || only_init)
5418 return aptr;
5421 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5422 nested in LOOP, if exists. */
5424 gcc_assert (nested_in_vect_loop);
5425 if (!only_init)
5427 standard_iv_increment_position (containing_loop, &incr_gsi,
5428 &insert_after);
5429 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5430 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5431 &indx_before_incr, &indx_after_incr);
5432 incr = gsi_stmt (incr_gsi);
5434 /* Copy the points-to information if it exists. */
5435 if (DR_PTR_INFO (dr))
5437 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5438 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5440 if (ptr_incr)
5441 *ptr_incr = incr;
5443 return indx_before_incr;
5445 else
5446 gcc_unreachable ();
5450 /* Function bump_vector_ptr
5452 Increment a pointer (to a vector type) by vector-size. If requested,
5453 i.e. if PTR-INCR is given, then also connect the new increment stmt
5454 to the existing def-use update-chain of the pointer, by modifying
5455 the PTR_INCR as illustrated below:
5457 The pointer def-use update-chain before this function:
5458 DATAREF_PTR = phi (p_0, p_2)
5459 ....
5460 PTR_INCR: p_2 = DATAREF_PTR + step
5462 The pointer def-use update-chain after this function:
5463 DATAREF_PTR = phi (p_0, p_2)
5464 ....
5465 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5466 ....
5467 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5469 Input:
5470 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5471 in the loop.
5472 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5473 the loop. The increment amount across iterations is expected
5474 to be vector_size.
5475 BSI - location where the new update stmt is to be placed.
5476 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5477 BUMP - optional. The offset by which to bump the pointer. If not given,
5478 the offset is assumed to be vector_size.
5480 Output: Return NEW_DATAREF_PTR as illustrated above.
5484 tree
5485 bump_vector_ptr (vec_info *vinfo,
5486 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5487 stmt_vec_info stmt_info, tree bump)
5489 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5490 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5491 tree update = TYPE_SIZE_UNIT (vectype);
5492 gimple *incr_stmt;
5493 ssa_op_iter iter;
5494 use_operand_p use_p;
5495 tree new_dataref_ptr;
5497 if (bump)
5498 update = bump;
5500 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5501 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5502 else if (is_gimple_min_invariant (dataref_ptr))
5503 /* When possible avoid emitting a separate increment stmt that will
5504 force the addressed object addressable. */
5505 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5506 fold_build2 (MEM_REF,
5507 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5508 dataref_ptr,
5509 fold_convert (ptr_type_node, update)));
5510 else
5511 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5512 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5513 dataref_ptr, update);
5514 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5515 /* Fold the increment, avoiding excessive chains use-def chains of
5516 those, leading to compile-time issues for passes until the next
5517 forwprop pass which would do this as well. */
5518 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5519 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5521 incr_stmt = gsi_stmt (fold_gsi);
5522 update_stmt (incr_stmt);
5525 /* Copy the points-to information if it exists. */
5526 if (DR_PTR_INFO (dr))
5528 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5529 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5532 if (!ptr_incr)
5533 return new_dataref_ptr;
5535 /* Update the vector-pointer's cross-iteration increment. */
5536 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5538 tree use = USE_FROM_PTR (use_p);
5540 if (use == dataref_ptr)
5541 SET_USE (use_p, new_dataref_ptr);
5542 else
5543 gcc_assert (operand_equal_p (use, update, 0));
5546 return new_dataref_ptr;
5550 /* Copy memory reference info such as base/clique from the SRC reference
5551 to the DEST MEM_REF. */
5553 void
5554 vect_copy_ref_info (tree dest, tree src)
5556 if (TREE_CODE (dest) != MEM_REF)
5557 return;
5559 tree src_base = src;
5560 while (handled_component_p (src_base))
5561 src_base = TREE_OPERAND (src_base, 0);
5562 if (TREE_CODE (src_base) != MEM_REF
5563 && TREE_CODE (src_base) != TARGET_MEM_REF)
5564 return;
5566 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5567 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5571 /* Function vect_create_destination_var.
5573 Create a new temporary of type VECTYPE. */
5575 tree
5576 vect_create_destination_var (tree scalar_dest, tree vectype)
5578 tree vec_dest;
5579 const char *name;
5580 char *new_name;
5581 tree type;
5582 enum vect_var_kind kind;
5584 kind = vectype
5585 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5586 ? vect_mask_var
5587 : vect_simple_var
5588 : vect_scalar_var;
5589 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5591 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5593 name = get_name (scalar_dest);
5594 if (name)
5595 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5596 else
5597 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5598 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5599 free (new_name);
5601 return vec_dest;
5604 /* Function vect_grouped_store_supported.
5606 Returns TRUE if interleave high and interleave low permutations
5607 are supported, and FALSE otherwise. */
5609 bool
5610 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5612 machine_mode mode = TYPE_MODE (vectype);
5614 /* vect_permute_store_chain requires the group size to be equal to 3 or
5615 be a power of two. */
5616 if (count != 3 && exact_log2 (count) == -1)
5618 if (dump_enabled_p ())
5619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5620 "the size of the group of accesses"
5621 " is not a power of 2 or not eqaul to 3\n");
5622 return false;
5625 /* Check that the permutation is supported. */
5626 if (VECTOR_MODE_P (mode))
5628 unsigned int i;
5629 if (count == 3)
5631 unsigned int j0 = 0, j1 = 0, j2 = 0;
5632 unsigned int i, j;
5634 unsigned int nelt;
5635 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5637 if (dump_enabled_p ())
5638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5639 "cannot handle groups of 3 stores for"
5640 " variable-length vectors\n");
5641 return false;
5644 vec_perm_builder sel (nelt, nelt, 1);
5645 sel.quick_grow (nelt);
5646 vec_perm_indices indices;
5647 for (j = 0; j < 3; j++)
5649 int nelt0 = ((3 - j) * nelt) % 3;
5650 int nelt1 = ((3 - j) * nelt + 1) % 3;
5651 int nelt2 = ((3 - j) * nelt + 2) % 3;
5652 for (i = 0; i < nelt; i++)
5654 if (3 * i + nelt0 < nelt)
5655 sel[3 * i + nelt0] = j0++;
5656 if (3 * i + nelt1 < nelt)
5657 sel[3 * i + nelt1] = nelt + j1++;
5658 if (3 * i + nelt2 < nelt)
5659 sel[3 * i + nelt2] = 0;
5661 indices.new_vector (sel, 2, nelt);
5662 if (!can_vec_perm_const_p (mode, mode, indices))
5664 if (dump_enabled_p ())
5665 dump_printf (MSG_MISSED_OPTIMIZATION,
5666 "permutation op not supported by target.\n");
5667 return false;
5670 for (i = 0; i < nelt; i++)
5672 if (3 * i + nelt0 < nelt)
5673 sel[3 * i + nelt0] = 3 * i + nelt0;
5674 if (3 * i + nelt1 < nelt)
5675 sel[3 * i + nelt1] = 3 * i + nelt1;
5676 if (3 * i + nelt2 < nelt)
5677 sel[3 * i + nelt2] = nelt + j2++;
5679 indices.new_vector (sel, 2, nelt);
5680 if (!can_vec_perm_const_p (mode, mode, indices))
5682 if (dump_enabled_p ())
5683 dump_printf (MSG_MISSED_OPTIMIZATION,
5684 "permutation op not supported by target.\n");
5685 return false;
5688 return true;
5690 else
5692 /* If length is not equal to 3 then only power of 2 is supported. */
5693 gcc_assert (pow2p_hwi (count));
5694 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5696 /* The encoding has 2 interleaved stepped patterns. */
5697 if(!multiple_p (nelt, 2))
5698 return false;
5699 vec_perm_builder sel (nelt, 2, 3);
5700 sel.quick_grow (6);
5701 for (i = 0; i < 3; i++)
5703 sel[i * 2] = i;
5704 sel[i * 2 + 1] = i + nelt;
5706 vec_perm_indices indices (sel, 2, nelt);
5707 if (can_vec_perm_const_p (mode, mode, indices))
5709 for (i = 0; i < 6; i++)
5710 sel[i] += exact_div (nelt, 2);
5711 indices.new_vector (sel, 2, nelt);
5712 if (can_vec_perm_const_p (mode, mode, indices))
5713 return true;
5718 if (dump_enabled_p ())
5719 dump_printf (MSG_MISSED_OPTIMIZATION,
5720 "permutation op not supported by target.\n");
5721 return false;
5724 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5725 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5727 internal_fn
5728 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5729 bool masked_p)
5731 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5732 vec_mask_len_store_lanes_optab, vectype,
5733 count))
5734 return IFN_MASK_LEN_STORE_LANES;
5735 else if (masked_p)
5737 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5738 vec_mask_store_lanes_optab, vectype,
5739 count))
5740 return IFN_MASK_STORE_LANES;
5742 else
5744 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5745 vec_store_lanes_optab, vectype, count))
5746 return IFN_STORE_LANES;
5748 return IFN_LAST;
5752 /* Function vect_permute_store_chain.
5754 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5755 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5756 the data correctly for the stores. Return the final references for stores
5757 in RESULT_CHAIN.
5759 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5760 The input is 4 vectors each containing 8 elements. We assign a number to
5761 each element, the input sequence is:
5763 1st vec: 0 1 2 3 4 5 6 7
5764 2nd vec: 8 9 10 11 12 13 14 15
5765 3rd vec: 16 17 18 19 20 21 22 23
5766 4th vec: 24 25 26 27 28 29 30 31
5768 The output sequence should be:
5770 1st vec: 0 8 16 24 1 9 17 25
5771 2nd vec: 2 10 18 26 3 11 19 27
5772 3rd vec: 4 12 20 28 5 13 21 30
5773 4th vec: 6 14 22 30 7 15 23 31
5775 i.e., we interleave the contents of the four vectors in their order.
5777 We use interleave_high/low instructions to create such output. The input of
5778 each interleave_high/low operation is two vectors:
5779 1st vec 2nd vec
5780 0 1 2 3 4 5 6 7
5781 the even elements of the result vector are obtained left-to-right from the
5782 high/low elements of the first vector. The odd elements of the result are
5783 obtained left-to-right from the high/low elements of the second vector.
5784 The output of interleave_high will be: 0 4 1 5
5785 and of interleave_low: 2 6 3 7
5788 The permutation is done in log LENGTH stages. In each stage interleave_high
5789 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5790 where the first argument is taken from the first half of DR_CHAIN and the
5791 second argument from it's second half.
5792 In our example,
5794 I1: interleave_high (1st vec, 3rd vec)
5795 I2: interleave_low (1st vec, 3rd vec)
5796 I3: interleave_high (2nd vec, 4th vec)
5797 I4: interleave_low (2nd vec, 4th vec)
5799 The output for the first stage is:
5801 I1: 0 16 1 17 2 18 3 19
5802 I2: 4 20 5 21 6 22 7 23
5803 I3: 8 24 9 25 10 26 11 27
5804 I4: 12 28 13 29 14 30 15 31
5806 The output of the second stage, i.e. the final result is:
5808 I1: 0 8 16 24 1 9 17 25
5809 I2: 2 10 18 26 3 11 19 27
5810 I3: 4 12 20 28 5 13 21 30
5811 I4: 6 14 22 30 7 15 23 31. */
5813 void
5814 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5815 unsigned int length,
5816 stmt_vec_info stmt_info,
5817 gimple_stmt_iterator *gsi,
5818 vec<tree> *result_chain)
5820 tree vect1, vect2, high, low;
5821 gimple *perm_stmt;
5822 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5823 tree perm_mask_low, perm_mask_high;
5824 tree data_ref;
5825 tree perm3_mask_low, perm3_mask_high;
5826 unsigned int i, j, n, log_length = exact_log2 (length);
5828 result_chain->quick_grow (length);
5829 memcpy (result_chain->address (), dr_chain.address (),
5830 length * sizeof (tree));
5832 if (length == 3)
5834 /* vect_grouped_store_supported ensures that this is constant. */
5835 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5836 unsigned int j0 = 0, j1 = 0, j2 = 0;
5838 vec_perm_builder sel (nelt, nelt, 1);
5839 sel.quick_grow (nelt);
5840 vec_perm_indices indices;
5841 for (j = 0; j < 3; j++)
5843 int nelt0 = ((3 - j) * nelt) % 3;
5844 int nelt1 = ((3 - j) * nelt + 1) % 3;
5845 int nelt2 = ((3 - j) * nelt + 2) % 3;
5847 for (i = 0; i < nelt; i++)
5849 if (3 * i + nelt0 < nelt)
5850 sel[3 * i + nelt0] = j0++;
5851 if (3 * i + nelt1 < nelt)
5852 sel[3 * i + nelt1] = nelt + j1++;
5853 if (3 * i + nelt2 < nelt)
5854 sel[3 * i + nelt2] = 0;
5856 indices.new_vector (sel, 2, nelt);
5857 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5859 for (i = 0; i < nelt; i++)
5861 if (3 * i + nelt0 < nelt)
5862 sel[3 * i + nelt0] = 3 * i + nelt0;
5863 if (3 * i + nelt1 < nelt)
5864 sel[3 * i + nelt1] = 3 * i + nelt1;
5865 if (3 * i + nelt2 < nelt)
5866 sel[3 * i + nelt2] = nelt + j2++;
5868 indices.new_vector (sel, 2, nelt);
5869 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5871 vect1 = dr_chain[0];
5872 vect2 = dr_chain[1];
5874 /* Create interleaving stmt:
5875 low = VEC_PERM_EXPR <vect1, vect2,
5876 {j, nelt, *, j + 1, nelt + j + 1, *,
5877 j + 2, nelt + j + 2, *, ...}> */
5878 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5879 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5880 vect2, perm3_mask_low);
5881 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5883 vect1 = data_ref;
5884 vect2 = dr_chain[2];
5885 /* Create interleaving stmt:
5886 low = VEC_PERM_EXPR <vect1, vect2,
5887 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5888 6, 7, nelt + j + 2, ...}> */
5889 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5890 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5891 vect2, perm3_mask_high);
5892 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5893 (*result_chain)[j] = data_ref;
5896 else
5898 /* If length is not equal to 3 then only power of 2 is supported. */
5899 gcc_assert (pow2p_hwi (length));
5901 /* The encoding has 2 interleaved stepped patterns. */
5902 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5903 vec_perm_builder sel (nelt, 2, 3);
5904 sel.quick_grow (6);
5905 for (i = 0; i < 3; i++)
5907 sel[i * 2] = i;
5908 sel[i * 2 + 1] = i + nelt;
5910 vec_perm_indices indices (sel, 2, nelt);
5911 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5913 for (i = 0; i < 6; i++)
5914 sel[i] += exact_div (nelt, 2);
5915 indices.new_vector (sel, 2, nelt);
5916 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5918 for (i = 0, n = log_length; i < n; i++)
5920 for (j = 0; j < length/2; j++)
5922 vect1 = dr_chain[j];
5923 vect2 = dr_chain[j+length/2];
5925 /* Create interleaving stmt:
5926 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5927 ...}> */
5928 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5929 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5930 vect2, perm_mask_high);
5931 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5932 (*result_chain)[2*j] = high;
5934 /* Create interleaving stmt:
5935 low = VEC_PERM_EXPR <vect1, vect2,
5936 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5937 ...}> */
5938 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5939 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5940 vect2, perm_mask_low);
5941 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5942 (*result_chain)[2*j+1] = low;
5944 memcpy (dr_chain.address (), result_chain->address (),
5945 length * sizeof (tree));
5950 /* Function vect_setup_realignment
5952 This function is called when vectorizing an unaligned load using
5953 the dr_explicit_realign[_optimized] scheme.
5954 This function generates the following code at the loop prolog:
5956 p = initial_addr;
5957 x msq_init = *(floor(p)); # prolog load
5958 realignment_token = call target_builtin;
5959 loop:
5960 x msq = phi (msq_init, ---)
5962 The stmts marked with x are generated only for the case of
5963 dr_explicit_realign_optimized.
5965 The code above sets up a new (vector) pointer, pointing to the first
5966 location accessed by STMT_INFO, and a "floor-aligned" load using that
5967 pointer. It also generates code to compute the "realignment-token"
5968 (if the relevant target hook was defined), and creates a phi-node at the
5969 loop-header bb whose arguments are the result of the prolog-load (created
5970 by this function) and the result of a load that takes place in the loop
5971 (to be created by the caller to this function).
5973 For the case of dr_explicit_realign_optimized:
5974 The caller to this function uses the phi-result (msq) to create the
5975 realignment code inside the loop, and sets up the missing phi argument,
5976 as follows:
5977 loop:
5978 msq = phi (msq_init, lsq)
5979 lsq = *(floor(p')); # load in loop
5980 result = realign_load (msq, lsq, realignment_token);
5982 For the case of dr_explicit_realign:
5983 loop:
5984 msq = *(floor(p)); # load in loop
5985 p' = p + (VS-1);
5986 lsq = *(floor(p')); # load in loop
5987 result = realign_load (msq, lsq, realignment_token);
5989 Input:
5990 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5991 a memory location that may be unaligned.
5992 BSI - place where new code is to be inserted.
5993 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5994 is used.
5996 Output:
5997 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5998 target hook, if defined.
5999 Return value - the result of the loop-header phi node. */
6001 tree
6002 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6003 gimple_stmt_iterator *gsi, tree *realignment_token,
6004 enum dr_alignment_support alignment_support_scheme,
6005 tree init_addr,
6006 class loop **at_loop)
6008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6009 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6010 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6011 struct data_reference *dr = dr_info->dr;
6012 class loop *loop = NULL;
6013 edge pe = NULL;
6014 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6015 tree vec_dest;
6016 gimple *inc;
6017 tree ptr;
6018 tree data_ref;
6019 basic_block new_bb;
6020 tree msq_init = NULL_TREE;
6021 tree new_temp;
6022 gphi *phi_stmt;
6023 tree msq = NULL_TREE;
6024 gimple_seq stmts = NULL;
6025 bool compute_in_loop = false;
6026 bool nested_in_vect_loop = false;
6027 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6028 class loop *loop_for_initial_load = NULL;
6030 if (loop_vinfo)
6032 loop = LOOP_VINFO_LOOP (loop_vinfo);
6033 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6036 gcc_assert (alignment_support_scheme == dr_explicit_realign
6037 || alignment_support_scheme == dr_explicit_realign_optimized);
6039 /* We need to generate three things:
6040 1. the misalignment computation
6041 2. the extra vector load (for the optimized realignment scheme).
6042 3. the phi node for the two vectors from which the realignment is
6043 done (for the optimized realignment scheme). */
6045 /* 1. Determine where to generate the misalignment computation.
6047 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6048 calculation will be generated by this function, outside the loop (in the
6049 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6050 caller, inside the loop.
6052 Background: If the misalignment remains fixed throughout the iterations of
6053 the loop, then both realignment schemes are applicable, and also the
6054 misalignment computation can be done outside LOOP. This is because we are
6055 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6056 are a multiple of VS (the Vector Size), and therefore the misalignment in
6057 different vectorized LOOP iterations is always the same.
6058 The problem arises only if the memory access is in an inner-loop nested
6059 inside LOOP, which is now being vectorized using outer-loop vectorization.
6060 This is the only case when the misalignment of the memory access may not
6061 remain fixed throughout the iterations of the inner-loop (as explained in
6062 detail in vect_supportable_dr_alignment). In this case, not only is the
6063 optimized realignment scheme not applicable, but also the misalignment
6064 computation (and generation of the realignment token that is passed to
6065 REALIGN_LOAD) have to be done inside the loop.
6067 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6068 or not, which in turn determines if the misalignment is computed inside
6069 the inner-loop, or outside LOOP. */
6071 if (init_addr != NULL_TREE || !loop_vinfo)
6073 compute_in_loop = true;
6074 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6078 /* 2. Determine where to generate the extra vector load.
6080 For the optimized realignment scheme, instead of generating two vector
6081 loads in each iteration, we generate a single extra vector load in the
6082 preheader of the loop, and in each iteration reuse the result of the
6083 vector load from the previous iteration. In case the memory access is in
6084 an inner-loop nested inside LOOP, which is now being vectorized using
6085 outer-loop vectorization, we need to determine whether this initial vector
6086 load should be generated at the preheader of the inner-loop, or can be
6087 generated at the preheader of LOOP. If the memory access has no evolution
6088 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6089 to be generated inside LOOP (in the preheader of the inner-loop). */
6091 if (nested_in_vect_loop)
6093 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6094 bool invariant_in_outerloop =
6095 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6096 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6098 else
6099 loop_for_initial_load = loop;
6100 if (at_loop)
6101 *at_loop = loop_for_initial_load;
6103 tree vuse = NULL_TREE;
6104 if (loop_for_initial_load)
6106 pe = loop_preheader_edge (loop_for_initial_load);
6107 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6108 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6110 if (!vuse)
6111 vuse = gimple_vuse (gsi_stmt (*gsi));
6113 /* 3. For the case of the optimized realignment, create the first vector
6114 load at the loop preheader. */
6116 if (alignment_support_scheme == dr_explicit_realign_optimized)
6118 /* Create msq_init = *(floor(p1)) in the loop preheader */
6119 gassign *new_stmt;
6121 gcc_assert (!compute_in_loop);
6122 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6123 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6124 loop_for_initial_load, NULL_TREE,
6125 &init_addr, NULL, &inc, true);
6126 if (TREE_CODE (ptr) == SSA_NAME)
6127 new_temp = copy_ssa_name (ptr);
6128 else
6129 new_temp = make_ssa_name (TREE_TYPE (ptr));
6130 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6131 tree type = TREE_TYPE (ptr);
6132 new_stmt = gimple_build_assign
6133 (new_temp, BIT_AND_EXPR, ptr,
6134 fold_build2 (MINUS_EXPR, type,
6135 build_int_cst (type, 0),
6136 build_int_cst (type, align)));
6137 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6138 gcc_assert (!new_bb);
6139 data_ref
6140 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6141 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6142 vect_copy_ref_info (data_ref, DR_REF (dr));
6143 new_stmt = gimple_build_assign (vec_dest, data_ref);
6144 new_temp = make_ssa_name (vec_dest, new_stmt);
6145 gimple_assign_set_lhs (new_stmt, new_temp);
6146 gimple_set_vuse (new_stmt, vuse);
6147 if (pe)
6149 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6150 gcc_assert (!new_bb);
6152 else
6153 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6155 msq_init = gimple_assign_lhs (new_stmt);
6158 /* 4. Create realignment token using a target builtin, if available.
6159 It is done either inside the containing loop, or before LOOP (as
6160 determined above). */
6162 if (targetm.vectorize.builtin_mask_for_load)
6164 gcall *new_stmt;
6165 tree builtin_decl;
6167 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
6168 if (!init_addr)
6170 /* Generate the INIT_ADDR computation outside LOOP. */
6171 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6172 stmt_info, &stmts,
6173 NULL_TREE);
6174 if (loop)
6176 pe = loop_preheader_edge (loop);
6177 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6178 gcc_assert (!new_bb);
6180 else
6181 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6184 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6185 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6186 vec_dest =
6187 vect_create_destination_var (scalar_dest,
6188 gimple_call_return_type (new_stmt));
6189 new_temp = make_ssa_name (vec_dest, new_stmt);
6190 gimple_call_set_lhs (new_stmt, new_temp);
6192 if (compute_in_loop)
6193 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6194 else
6196 /* Generate the misalignment computation outside LOOP. */
6197 pe = loop_preheader_edge (loop);
6198 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6199 gcc_assert (!new_bb);
6202 *realignment_token = gimple_call_lhs (new_stmt);
6204 /* The result of the CALL_EXPR to this builtin is determined from
6205 the value of the parameter and no global variables are touched
6206 which makes the builtin a "const" function. Requiring the
6207 builtin to have the "const" attribute makes it unnecessary
6208 to call mark_call_clobbered. */
6209 gcc_assert (TREE_READONLY (builtin_decl));
6212 if (alignment_support_scheme == dr_explicit_realign)
6213 return msq;
6215 gcc_assert (!compute_in_loop);
6216 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6219 /* 5. Create msq = phi <msq_init, lsq> in loop */
6221 pe = loop_preheader_edge (containing_loop);
6222 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6223 msq = make_ssa_name (vec_dest);
6224 phi_stmt = create_phi_node (msq, containing_loop->header);
6225 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6227 return msq;
6231 /* Function vect_grouped_load_supported.
6233 COUNT is the size of the load group (the number of statements plus the
6234 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6235 only one statement, with a gap of COUNT - 1.
6237 Returns true if a suitable permute exists. */
6239 bool
6240 vect_grouped_load_supported (tree vectype, bool single_element_p,
6241 unsigned HOST_WIDE_INT count)
6243 machine_mode mode = TYPE_MODE (vectype);
6245 /* If this is single-element interleaving with an element distance
6246 that leaves unused vector loads around punt - we at least create
6247 very sub-optimal code in that case (and blow up memory,
6248 see PR65518). */
6249 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6251 if (dump_enabled_p ())
6252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253 "single-element interleaving not supported "
6254 "for not adjacent vector loads\n");
6255 return false;
6258 /* vect_permute_load_chain requires the group size to be equal to 3 or
6259 be a power of two. */
6260 if (count != 3 && exact_log2 (count) == -1)
6262 if (dump_enabled_p ())
6263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6264 "the size of the group of accesses"
6265 " is not a power of 2 or not equal to 3\n");
6266 return false;
6269 /* Check that the permutation is supported. */
6270 if (VECTOR_MODE_P (mode))
6272 unsigned int i, j;
6273 if (count == 3)
6275 unsigned int nelt;
6276 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6278 if (dump_enabled_p ())
6279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6280 "cannot handle groups of 3 loads for"
6281 " variable-length vectors\n");
6282 return false;
6285 vec_perm_builder sel (nelt, nelt, 1);
6286 sel.quick_grow (nelt);
6287 vec_perm_indices indices;
6288 unsigned int k;
6289 for (k = 0; k < 3; k++)
6291 for (i = 0; i < nelt; i++)
6292 if (3 * i + k < 2 * nelt)
6293 sel[i] = 3 * i + k;
6294 else
6295 sel[i] = 0;
6296 indices.new_vector (sel, 2, nelt);
6297 if (!can_vec_perm_const_p (mode, mode, indices))
6299 if (dump_enabled_p ())
6300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6301 "shuffle of 3 loads is not supported by"
6302 " target\n");
6303 return false;
6305 for (i = 0, j = 0; i < nelt; i++)
6306 if (3 * i + k < 2 * nelt)
6307 sel[i] = i;
6308 else
6309 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6310 indices.new_vector (sel, 2, nelt);
6311 if (!can_vec_perm_const_p (mode, mode, indices))
6313 if (dump_enabled_p ())
6314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6315 "shuffle of 3 loads is not supported by"
6316 " target\n");
6317 return false;
6320 return true;
6322 else
6324 /* If length is not equal to 3 then only power of 2 is supported. */
6325 gcc_assert (pow2p_hwi (count));
6326 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6328 /* The encoding has a single stepped pattern. */
6329 vec_perm_builder sel (nelt, 1, 3);
6330 sel.quick_grow (3);
6331 for (i = 0; i < 3; i++)
6332 sel[i] = i * 2;
6333 vec_perm_indices indices (sel, 2, nelt);
6334 if (can_vec_perm_const_p (mode, mode, indices))
6336 for (i = 0; i < 3; i++)
6337 sel[i] = i * 2 + 1;
6338 indices.new_vector (sel, 2, nelt);
6339 if (can_vec_perm_const_p (mode, mode, indices))
6340 return true;
6345 if (dump_enabled_p ())
6346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6347 "extract even/odd not supported by target\n");
6348 return false;
6351 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6352 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6354 internal_fn
6355 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6356 bool masked_p)
6358 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6359 vec_mask_len_load_lanes_optab, vectype,
6360 count))
6361 return IFN_MASK_LEN_LOAD_LANES;
6362 else if (masked_p)
6364 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6365 vec_mask_load_lanes_optab, vectype,
6366 count))
6367 return IFN_MASK_LOAD_LANES;
6369 else
6371 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6372 vectype, count))
6373 return IFN_LOAD_LANES;
6375 return IFN_LAST;
6378 /* Function vect_permute_load_chain.
6380 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6381 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6382 the input data correctly. Return the final references for loads in
6383 RESULT_CHAIN.
6385 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6386 The input is 4 vectors each containing 8 elements. We assign a number to each
6387 element, the input sequence is:
6389 1st vec: 0 1 2 3 4 5 6 7
6390 2nd vec: 8 9 10 11 12 13 14 15
6391 3rd vec: 16 17 18 19 20 21 22 23
6392 4th vec: 24 25 26 27 28 29 30 31
6394 The output sequence should be:
6396 1st vec: 0 4 8 12 16 20 24 28
6397 2nd vec: 1 5 9 13 17 21 25 29
6398 3rd vec: 2 6 10 14 18 22 26 30
6399 4th vec: 3 7 11 15 19 23 27 31
6401 i.e., the first output vector should contain the first elements of each
6402 interleaving group, etc.
6404 We use extract_even/odd instructions to create such output. The input of
6405 each extract_even/odd operation is two vectors
6406 1st vec 2nd vec
6407 0 1 2 3 4 5 6 7
6409 and the output is the vector of extracted even/odd elements. The output of
6410 extract_even will be: 0 2 4 6
6411 and of extract_odd: 1 3 5 7
6414 The permutation is done in log LENGTH stages. In each stage extract_even
6415 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6416 their order. In our example,
6418 E1: extract_even (1st vec, 2nd vec)
6419 E2: extract_odd (1st vec, 2nd vec)
6420 E3: extract_even (3rd vec, 4th vec)
6421 E4: extract_odd (3rd vec, 4th vec)
6423 The output for the first stage will be:
6425 E1: 0 2 4 6 8 10 12 14
6426 E2: 1 3 5 7 9 11 13 15
6427 E3: 16 18 20 22 24 26 28 30
6428 E4: 17 19 21 23 25 27 29 31
6430 In order to proceed and create the correct sequence for the next stage (or
6431 for the correct output, if the second stage is the last one, as in our
6432 example), we first put the output of extract_even operation and then the
6433 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6434 The input for the second stage is:
6436 1st vec (E1): 0 2 4 6 8 10 12 14
6437 2nd vec (E3): 16 18 20 22 24 26 28 30
6438 3rd vec (E2): 1 3 5 7 9 11 13 15
6439 4th vec (E4): 17 19 21 23 25 27 29 31
6441 The output of the second stage:
6443 E1: 0 4 8 12 16 20 24 28
6444 E2: 2 6 10 14 18 22 26 30
6445 E3: 1 5 9 13 17 21 25 29
6446 E4: 3 7 11 15 19 23 27 31
6448 And RESULT_CHAIN after reordering:
6450 1st vec (E1): 0 4 8 12 16 20 24 28
6451 2nd vec (E3): 1 5 9 13 17 21 25 29
6452 3rd vec (E2): 2 6 10 14 18 22 26 30
6453 4th vec (E4): 3 7 11 15 19 23 27 31. */
6455 static void
6456 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6457 unsigned int length,
6458 stmt_vec_info stmt_info,
6459 gimple_stmt_iterator *gsi,
6460 vec<tree> *result_chain)
6462 tree data_ref, first_vect, second_vect;
6463 tree perm_mask_even, perm_mask_odd;
6464 tree perm3_mask_low, perm3_mask_high;
6465 gimple *perm_stmt;
6466 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6467 unsigned int i, j, log_length = exact_log2 (length);
6469 result_chain->quick_grow (length);
6470 memcpy (result_chain->address (), dr_chain.address (),
6471 length * sizeof (tree));
6473 if (length == 3)
6475 /* vect_grouped_load_supported ensures that this is constant. */
6476 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6477 unsigned int k;
6479 vec_perm_builder sel (nelt, nelt, 1);
6480 sel.quick_grow (nelt);
6481 vec_perm_indices indices;
6482 for (k = 0; k < 3; k++)
6484 for (i = 0; i < nelt; i++)
6485 if (3 * i + k < 2 * nelt)
6486 sel[i] = 3 * i + k;
6487 else
6488 sel[i] = 0;
6489 indices.new_vector (sel, 2, nelt);
6490 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6492 for (i = 0, j = 0; i < nelt; i++)
6493 if (3 * i + k < 2 * nelt)
6494 sel[i] = i;
6495 else
6496 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6497 indices.new_vector (sel, 2, nelt);
6498 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6500 first_vect = dr_chain[0];
6501 second_vect = dr_chain[1];
6503 /* Create interleaving stmt (low part of):
6504 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6505 ...}> */
6506 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6507 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6508 second_vect, perm3_mask_low);
6509 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6511 /* Create interleaving stmt (high part of):
6512 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6513 ...}> */
6514 first_vect = data_ref;
6515 second_vect = dr_chain[2];
6516 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6517 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6518 second_vect, perm3_mask_high);
6519 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6520 (*result_chain)[k] = data_ref;
6523 else
6525 /* If length is not equal to 3 then only power of 2 is supported. */
6526 gcc_assert (pow2p_hwi (length));
6528 /* The encoding has a single stepped pattern. */
6529 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6530 vec_perm_builder sel (nelt, 1, 3);
6531 sel.quick_grow (3);
6532 for (i = 0; i < 3; ++i)
6533 sel[i] = i * 2;
6534 vec_perm_indices indices (sel, 2, nelt);
6535 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6537 for (i = 0; i < 3; ++i)
6538 sel[i] = i * 2 + 1;
6539 indices.new_vector (sel, 2, nelt);
6540 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6542 for (i = 0; i < log_length; i++)
6544 for (j = 0; j < length; j += 2)
6546 first_vect = dr_chain[j];
6547 second_vect = dr_chain[j+1];
6549 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6550 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6551 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6552 first_vect, second_vect,
6553 perm_mask_even);
6554 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6555 (*result_chain)[j/2] = data_ref;
6557 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6558 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6559 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6560 first_vect, second_vect,
6561 perm_mask_odd);
6562 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6563 (*result_chain)[j/2+length/2] = data_ref;
6565 memcpy (dr_chain.address (), result_chain->address (),
6566 length * sizeof (tree));
6571 /* Function vect_shift_permute_load_chain.
6573 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6574 sequence of stmts to reorder the input data accordingly.
6575 Return the final references for loads in RESULT_CHAIN.
6576 Return true if successed, false otherwise.
6578 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6579 The input is 3 vectors each containing 8 elements. We assign a
6580 number to each element, the input sequence is:
6582 1st vec: 0 1 2 3 4 5 6 7
6583 2nd vec: 8 9 10 11 12 13 14 15
6584 3rd vec: 16 17 18 19 20 21 22 23
6586 The output sequence should be:
6588 1st vec: 0 3 6 9 12 15 18 21
6589 2nd vec: 1 4 7 10 13 16 19 22
6590 3rd vec: 2 5 8 11 14 17 20 23
6592 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6594 First we shuffle all 3 vectors to get correct elements order:
6596 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6597 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6598 3rd vec: (16 19 22) (17 20 23) (18 21)
6600 Next we unite and shift vector 3 times:
6602 1st step:
6603 shift right by 6 the concatenation of:
6604 "1st vec" and "2nd vec"
6605 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6606 "2nd vec" and "3rd vec"
6607 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6608 "3rd vec" and "1st vec"
6609 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6610 | New vectors |
6612 So that now new vectors are:
6614 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6615 2nd vec: (10 13) (16 19 22) (17 20 23)
6616 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6618 2nd step:
6619 shift right by 5 the concatenation of:
6620 "1st vec" and "3rd vec"
6621 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6622 "2nd vec" and "1st vec"
6623 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6624 "3rd vec" and "2nd vec"
6625 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6626 | New vectors |
6628 So that now new vectors are:
6630 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6631 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6632 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6634 3rd step:
6635 shift right by 5 the concatenation of:
6636 "1st vec" and "1st vec"
6637 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6638 shift right by 3 the concatenation of:
6639 "2nd vec" and "2nd vec"
6640 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6641 | New vectors |
6643 So that now all vectors are READY:
6644 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6645 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6646 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6648 This algorithm is faster than one in vect_permute_load_chain if:
6649 1. "shift of a concatination" is faster than general permutation.
6650 This is usually so.
6651 2. The TARGET machine can't execute vector instructions in parallel.
6652 This is because each step of the algorithm depends on previous.
6653 The algorithm in vect_permute_load_chain is much more parallel.
6655 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6658 static bool
6659 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6660 unsigned int length,
6661 stmt_vec_info stmt_info,
6662 gimple_stmt_iterator *gsi,
6663 vec<tree> *result_chain)
6665 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6666 tree perm2_mask1, perm2_mask2, perm3_mask;
6667 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6668 gimple *perm_stmt;
6670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6671 machine_mode vmode = TYPE_MODE (vectype);
6672 unsigned int i;
6673 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6675 unsigned HOST_WIDE_INT nelt, vf;
6676 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6677 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6678 /* Not supported for variable-length vectors. */
6679 return false;
6681 vec_perm_builder sel (nelt, nelt, 1);
6682 sel.quick_grow (nelt);
6684 result_chain->quick_grow (length);
6685 memcpy (result_chain->address (), dr_chain.address (),
6686 length * sizeof (tree));
6688 if (pow2p_hwi (length) && vf > 4)
6690 unsigned int j, log_length = exact_log2 (length);
6691 for (i = 0; i < nelt / 2; ++i)
6692 sel[i] = i * 2;
6693 for (i = 0; i < nelt / 2; ++i)
6694 sel[nelt / 2 + i] = i * 2 + 1;
6695 vec_perm_indices indices (sel, 2, nelt);
6696 if (!can_vec_perm_const_p (vmode, vmode, indices))
6698 if (dump_enabled_p ())
6699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700 "shuffle of 2 fields structure is not \
6701 supported by target\n");
6702 return false;
6704 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6706 for (i = 0; i < nelt / 2; ++i)
6707 sel[i] = i * 2 + 1;
6708 for (i = 0; i < nelt / 2; ++i)
6709 sel[nelt / 2 + i] = i * 2;
6710 indices.new_vector (sel, 2, nelt);
6711 if (!can_vec_perm_const_p (vmode, vmode, indices))
6713 if (dump_enabled_p ())
6714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6715 "shuffle of 2 fields structure is not \
6716 supported by target\n");
6717 return false;
6719 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6721 /* Generating permutation constant to shift all elements.
6722 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6723 for (i = 0; i < nelt; i++)
6724 sel[i] = nelt / 2 + i;
6725 indices.new_vector (sel, 2, nelt);
6726 if (!can_vec_perm_const_p (vmode, vmode, indices))
6728 if (dump_enabled_p ())
6729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730 "shift permutation is not supported by target\n");
6731 return false;
6733 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6735 /* Generating permutation constant to select vector from 2.
6736 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6737 for (i = 0; i < nelt / 2; i++)
6738 sel[i] = i;
6739 for (i = nelt / 2; i < nelt; i++)
6740 sel[i] = nelt + i;
6741 indices.new_vector (sel, 2, nelt);
6742 if (!can_vec_perm_const_p (vmode, vmode, indices))
6744 if (dump_enabled_p ())
6745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6746 "select is not supported by target\n");
6747 return false;
6749 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6751 for (i = 0; i < log_length; i++)
6753 for (j = 0; j < length; j += 2)
6755 first_vect = dr_chain[j];
6756 second_vect = dr_chain[j + 1];
6758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6760 first_vect, first_vect,
6761 perm2_mask1);
6762 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6763 vect[0] = data_ref;
6765 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6766 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6767 second_vect, second_vect,
6768 perm2_mask2);
6769 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6770 vect[1] = data_ref;
6772 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6773 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6774 vect[0], vect[1], shift1_mask);
6775 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6776 (*result_chain)[j/2 + length/2] = data_ref;
6778 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6779 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6780 vect[0], vect[1], select_mask);
6781 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6782 (*result_chain)[j/2] = data_ref;
6784 memcpy (dr_chain.address (), result_chain->address (),
6785 length * sizeof (tree));
6787 return true;
6789 if (length == 3 && vf > 2)
6791 unsigned int k = 0, l = 0;
6793 /* Generating permutation constant to get all elements in rigth order.
6794 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6795 for (i = 0; i < nelt; i++)
6797 if (3 * k + (l % 3) >= nelt)
6799 k = 0;
6800 l += (3 - (nelt % 3));
6802 sel[i] = 3 * k + (l % 3);
6803 k++;
6805 vec_perm_indices indices (sel, 2, nelt);
6806 if (!can_vec_perm_const_p (vmode, vmode, indices))
6808 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 "shuffle of 3 fields structure is not \
6811 supported by target\n");
6812 return false;
6814 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6816 /* Generating permutation constant to shift all elements.
6817 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6818 for (i = 0; i < nelt; i++)
6819 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6820 indices.new_vector (sel, 2, nelt);
6821 if (!can_vec_perm_const_p (vmode, vmode, indices))
6823 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6825 "shift permutation is not supported by target\n");
6826 return false;
6828 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6830 /* Generating permutation constant to shift all elements.
6831 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6832 for (i = 0; i < nelt; i++)
6833 sel[i] = 2 * (nelt / 3) + 1 + i;
6834 indices.new_vector (sel, 2, nelt);
6835 if (!can_vec_perm_const_p (vmode, vmode, indices))
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6839 "shift permutation is not supported by target\n");
6840 return false;
6842 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6844 /* Generating permutation constant to shift all elements.
6845 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6846 for (i = 0; i < nelt; i++)
6847 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6848 indices.new_vector (sel, 2, nelt);
6849 if (!can_vec_perm_const_p (vmode, vmode, indices))
6851 if (dump_enabled_p ())
6852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6853 "shift permutation is not supported by target\n");
6854 return false;
6856 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6858 /* Generating permutation constant to shift all elements.
6859 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6860 for (i = 0; i < nelt; i++)
6861 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6862 indices.new_vector (sel, 2, nelt);
6863 if (!can_vec_perm_const_p (vmode, vmode, indices))
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "shift permutation is not supported by target\n");
6868 return false;
6870 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6872 for (k = 0; k < 3; k++)
6874 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6875 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6876 dr_chain[k], dr_chain[k],
6877 perm3_mask);
6878 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6879 vect[k] = data_ref;
6882 for (k = 0; k < 3; k++)
6884 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6885 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6886 vect[k % 3], vect[(k + 1) % 3],
6887 shift1_mask);
6888 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6889 vect_shift[k] = data_ref;
6892 for (k = 0; k < 3; k++)
6894 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6895 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6896 vect_shift[(4 - k) % 3],
6897 vect_shift[(3 - k) % 3],
6898 shift2_mask);
6899 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6900 vect[k] = data_ref;
6903 (*result_chain)[3 - (nelt % 3)] = vect[2];
6905 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6906 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6907 vect[0], shift3_mask);
6908 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6909 (*result_chain)[nelt % 3] = data_ref;
6911 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6912 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6913 vect[1], shift4_mask);
6914 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6915 (*result_chain)[0] = data_ref;
6916 return true;
6918 return false;
6921 /* Function vect_transform_grouped_load.
6923 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6924 to perform their permutation and ascribe the result vectorized statements to
6925 the scalar statements.
6928 void
6929 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6930 vec<tree> dr_chain,
6931 int size, gimple_stmt_iterator *gsi)
6933 machine_mode mode;
6934 vec<tree> result_chain = vNULL;
6936 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6937 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6938 vectors, that are ready for vector computation. */
6939 result_chain.create (size);
6941 /* If reassociation width for vector type is 2 or greater target machine can
6942 execute 2 or more vector instructions in parallel. Otherwise try to
6943 get chain for loads group using vect_shift_permute_load_chain. */
6944 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6945 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6946 || pow2p_hwi (size)
6947 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6948 gsi, &result_chain))
6949 vect_permute_load_chain (vinfo, dr_chain,
6950 size, stmt_info, gsi, &result_chain);
6951 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6952 result_chain.release ();
6955 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6956 generated as part of the vectorization of STMT_INFO. Assign the statement
6957 for each vector to the associated scalar statement. */
6959 void
6960 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6961 vec<tree> result_chain)
6963 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6964 unsigned int i, gap_count;
6965 tree tmp_data_ref;
6967 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6968 Since we scan the chain starting from it's first node, their order
6969 corresponds the order of data-refs in RESULT_CHAIN. */
6970 stmt_vec_info next_stmt_info = first_stmt_info;
6971 gap_count = 1;
6972 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6974 if (!next_stmt_info)
6975 break;
6977 /* Skip the gaps. Loads created for the gaps will be removed by dead
6978 code elimination pass later. No need to check for the first stmt in
6979 the group, since it always exists.
6980 DR_GROUP_GAP is the number of steps in elements from the previous
6981 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6982 correspond to the gaps. */
6983 if (next_stmt_info != first_stmt_info
6984 && gap_count < DR_GROUP_GAP (next_stmt_info))
6986 gap_count++;
6987 continue;
6990 /* ??? The following needs cleanup after the removal of
6991 DR_GROUP_SAME_DR_STMT. */
6992 if (next_stmt_info)
6994 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6995 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6996 copies, and we put the new vector statement last. */
6997 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6999 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7000 gap_count = 1;
7005 /* Function vect_force_dr_alignment_p.
7007 Returns whether the alignment of a DECL can be forced to be aligned
7008 on ALIGNMENT bit boundary. */
7010 bool
7011 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7013 if (!VAR_P (decl))
7014 return false;
7016 if (decl_in_symtab_p (decl)
7017 && !symtab_node::get (decl)->can_increase_alignment_p ())
7018 return false;
7020 if (TREE_STATIC (decl))
7021 return (known_le (alignment,
7022 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7023 else
7024 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7027 /* Return whether the data reference DR_INFO is supported with respect to its
7028 alignment.
7029 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7030 it is aligned, i.e., check if it is possible to vectorize it with different
7031 alignment. */
7033 enum dr_alignment_support
7034 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7035 tree vectype, int misalignment)
7037 data_reference *dr = dr_info->dr;
7038 stmt_vec_info stmt_info = dr_info->stmt;
7039 machine_mode mode = TYPE_MODE (vectype);
7040 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7041 class loop *vect_loop = NULL;
7042 bool nested_in_vect_loop = false;
7044 if (misalignment == 0)
7045 return dr_aligned;
7047 /* For now assume all conditional loads/stores support unaligned
7048 access without any special code. */
7049 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7050 if (gimple_call_internal_p (stmt)
7051 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7052 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7053 return dr_unaligned_supported;
7055 if (loop_vinfo)
7057 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7058 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7061 /* Possibly unaligned access. */
7063 /* We can choose between using the implicit realignment scheme (generating
7064 a misaligned_move stmt) and the explicit realignment scheme (generating
7065 aligned loads with a REALIGN_LOAD). There are two variants to the
7066 explicit realignment scheme: optimized, and unoptimized.
7067 We can optimize the realignment only if the step between consecutive
7068 vector loads is equal to the vector size. Since the vector memory
7069 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7070 is guaranteed that the misalignment amount remains the same throughout the
7071 execution of the vectorized loop. Therefore, we can create the
7072 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7073 at the loop preheader.
7075 However, in the case of outer-loop vectorization, when vectorizing a
7076 memory access in the inner-loop nested within the LOOP that is now being
7077 vectorized, while it is guaranteed that the misalignment of the
7078 vectorized memory access will remain the same in different outer-loop
7079 iterations, it is *not* guaranteed that is will remain the same throughout
7080 the execution of the inner-loop. This is because the inner-loop advances
7081 with the original scalar step (and not in steps of VS). If the inner-loop
7082 step happens to be a multiple of VS, then the misalignment remains fixed
7083 and we can use the optimized realignment scheme. For example:
7085 for (i=0; i<N; i++)
7086 for (j=0; j<M; j++)
7087 s += a[i+j];
7089 When vectorizing the i-loop in the above example, the step between
7090 consecutive vector loads is 1, and so the misalignment does not remain
7091 fixed across the execution of the inner-loop, and the realignment cannot
7092 be optimized (as illustrated in the following pseudo vectorized loop):
7094 for (i=0; i<N; i+=4)
7095 for (j=0; j<M; j++){
7096 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7097 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7098 // (assuming that we start from an aligned address).
7101 We therefore have to use the unoptimized realignment scheme:
7103 for (i=0; i<N; i+=4)
7104 for (j=k; j<M; j+=4)
7105 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7106 // that the misalignment of the initial address is
7107 // 0).
7109 The loop can then be vectorized as follows:
7111 for (k=0; k<4; k++){
7112 rt = get_realignment_token (&vp[k]);
7113 for (i=0; i<N; i+=4){
7114 v1 = vp[i+k];
7115 for (j=k; j<M; j+=4){
7116 v2 = vp[i+j+VS-1];
7117 va = REALIGN_LOAD <v1,v2,rt>;
7118 vs += va;
7119 v1 = v2;
7122 } */
7124 if (DR_IS_READ (dr))
7126 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7127 && (!targetm.vectorize.builtin_mask_for_load
7128 || targetm.vectorize.builtin_mask_for_load ()))
7130 /* If we are doing SLP then the accesses need not have the
7131 same alignment, instead it depends on the SLP group size. */
7132 if (loop_vinfo
7133 && STMT_SLP_TYPE (stmt_info)
7134 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7135 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7136 * (DR_GROUP_SIZE
7137 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7138 TYPE_VECTOR_SUBPARTS (vectype))))
7140 else if (!loop_vinfo
7141 || (nested_in_vect_loop
7142 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7143 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7144 return dr_explicit_realign;
7145 else
7146 return dr_explicit_realign_optimized;
7150 bool is_packed = false;
7151 tree type = TREE_TYPE (DR_REF (dr));
7152 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7153 is_packed = not_size_aligned (DR_REF (dr));
7154 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7155 is_packed))
7156 return dr_unaligned_supported;
7158 /* Unsupported. */
7159 return dr_unaligned_unsupported;