gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_MEMORY
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "predict.h"
32 #include "memmodel.h"
33 #include "tm_p.h"
34 #include "ssa.h"
35 #include "optabs-tree.h"
36 #include "cgraph.h"
37 #include "dumpfile.h"
38 #include "alias.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
41 #include "tree-eh.h"
42 #include "gimplify.h"
43 #include "gimple-iterator.h"
44 #include "gimplify-me.h"
45 #include "tree-ssa-loop-ivopts.h"
46 #include "tree-ssa-loop-manip.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "expr.h"
52 #include "builtins.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 #include "gimple-fold.h"
59 /* Return true if load- or store-lanes optab OPTAB is implemented for
60 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
62 static bool
63 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
64 tree vectype, unsigned HOST_WIDE_INT count)
66 machine_mode mode, array_mode;
67 bool limit_p;
69 mode = TYPE_MODE (vectype);
70 if (!targetm.array_mode (mode, count).exists (&array_mode))
72 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
73 limit_p = !targetm.array_mode_supported_p (mode, count);
74 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
76 if (dump_enabled_p ())
77 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78 "no array mode for %s[%wu]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
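/* Illustrative example (a minimal sketch, not part of this file): a loop
   with interleaved accesses of the kind the load/store-lanes optabs are
   queried for above, e.g. implemented with LD2/ST2 on AArch64 when the
   target provides vec_load_lanes / vec_store_lanes:

     void
     deinterleave (float *restrict re, float *restrict im,
                   const float *restrict c, int n)
     {
       for (int i = 0; i < n; i++)
         {
           re[i] = c[2 * i];
           im[i] = c[2 * i + 1];
         }
     }

   Whether such a lowering is available is exactly what the checks above
   verify.  */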
101 /* Helper function to identify a simd clone call. If this is a call to a
102 function with simd clones then return the corresponding cgraph_node,
103 otherwise return NULL. */
105 static cgraph_node*
106 simd_clone_call_p (gimple *stmt)
108 gcall *call = dyn_cast <gcall *> (stmt);
109 if (!call)
110 return NULL;
112 tree fndecl = NULL_TREE;
113 if (gimple_call_internal_p (call, IFN_MASK_CALL))
114 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
115 else
116 fndecl = gimple_call_fndecl (stmt);
118 if (fndecl == NULL_TREE)
119 return NULL;
121 cgraph_node *node = cgraph_node::get (fndecl);
122 if (node && node->simd_clones != NULL)
123 return node;
125 return NULL;
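/* Illustrative example (a sketch, not part of this file): simd clones are
   created for functions declared with '#pragma omp declare simd', e.g.

     #pragma omp declare simd
     float f (float x);

     void
     apply (float *a, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = f (a[i]);
     }

   The call to f inside the loop is the kind of statement the helper above
   recognizes; when the call is conditional it may instead appear wrapped
   in an IFN_MASK_CALL internal call.  */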
130 /* Return the smallest scalar part of STMT_INFO.
131 This is used to determine the vectype of the stmt. We generally set the
132 vectype according to the type of the result (lhs). For stmts whose
133 result-type is different than the type of the arguments (e.g., demotion,
134 promotion), vectype will be reset appropriately (later). Note that we have
135 to visit the smallest datatype in this function, because that determines the
136 VF. If the smallest datatype in the loop is present only as the rhs of a
137 promotion operation - we'd miss it.
138 Such a case, where a variable of this datatype does not appear in the lhs
139 anywhere in the loop, can only occur if it's an invariant: e.g.:
140 'int_x = (int) short_inv', which we'd expect to have been optimized away by
141 invariant motion. However, we cannot rely on invariant motion to always
142 take invariants out of the loop, and so in the case of promotion we also
143 have to check the rhs.
144 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
145 types. */
147 tree
148 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
150 HOST_WIDE_INT lhs, rhs;
152 /* During the analysis phase, this function is called on arbitrary
153 statements that might not have scalar results. */
154 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
155 return scalar_type;
157 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
159 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
160 if (assign)
162 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
163 if (gimple_assign_cast_p (assign)
164 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
166 || gimple_assign_rhs_code (assign) == SAD_EXPR
167 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
168 || gimple_assign_rhs_code (assign) == WIDEN_MULT_PLUS_EXPR
169 || gimple_assign_rhs_code (assign) == WIDEN_MULT_MINUS_EXPR
170 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
171 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
173 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
175 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
176 if (rhs < lhs)
177 scalar_type = rhs_type;
180 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
182 auto clone = node->simd_clones->simdclone;
183 for (unsigned int i = 0; i < clone->nargs; ++i)
185 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
187 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
188 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
189 if (rhs < lhs)
191 scalar_type = arg_scalar_type;
192 lhs = rhs;
197 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
199 unsigned int i = 0;
200 if (gimple_call_internal_p (call))
202 internal_fn ifn = gimple_call_internal_fn (call);
203 if (internal_load_fn_p (ifn))
204 /* For loads the LHS type does the trick. */
205 i = ~0U;
206 else if (internal_store_fn_p (ifn))
208 /* For stores use the type of the stored value. */
209 i = internal_fn_stored_value_index (ifn);
210 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
211 i = ~0U;
213 else if (internal_fn_mask_index (ifn) == 0)
214 i = 1;
216 if (i < gimple_call_num_args (call))
218 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
219 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
221 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
222 if (rhs < lhs)
223 scalar_type = rhs_type;
228 return scalar_type;
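/* Illustrative example (a sketch, not part of this file) of the promotion
   case described in the comment above: if invariant motion did not hoist
   the conversion, the loop body of

     void
     f (int *restrict out, short s, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = (int) s;
     }

   still contains 'int_x = (int) s', where the short type occurs only on
   the rhs, so the rhs has to be inspected to find the smallest scalar
   type.  */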
232 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
233 tested at run-time. Return TRUE if DDR was successfully inserted.
234 Return false if versioning is not supported. */
236 static opt_result
237 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
239 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
241 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
242 return opt_result::failure_at (vect_location,
243 "will not create alias checks, as"
244 " --param vect-max-version-for-alias-checks"
245 " == 0\n");
247 opt_result res
248 = runtime_alias_check_p (ddr, loop,
249 optimize_loop_nest_for_speed_p (loop));
250 if (!res)
251 return res;
253 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
254 return opt_result::success ();
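/* Illustrative example (a sketch, not part of this file): without
   'restrict' the two accesses below may overlap, so vectorizing requires
   versioning the loop with the run-time overlap check recorded above:

     void
     add (float *a, const float *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] += b[i];
     }

   Compiling with '--param vect-max-version-for-alias-checks=0' disables
   the versioning and makes this analysis fail instead.  */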
257 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
259 static void
260 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
262 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
263 for (unsigned int i = 0; i < checks.length(); ++i)
264 if (checks[i] == value)
265 return;
267 if (dump_enabled_p ())
268 dump_printf_loc (MSG_NOTE, vect_location,
269 "need run-time check that %T is nonzero\n",
270 value);
271 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
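/* Illustrative example (a sketch, not part of this file): in

     void
     scale (double *a, long stride, int n)
     {
       for (int i = 0; i < n; i++)
         a[i * stride] = a[i * stride] * 2.0;
     }

   the access has a zero step when STRIDE == 0, in which case every
   iteration touches the same element; the vectorizer then records STRIDE
   here so the versioned loop can check at run time that it is nonzero.  */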
274 /* Return true if we know that the order of vectorized DR_INFO_A and
275 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
276 DR_INFO_B. At least one of the accesses is a write. */
278 static bool
279 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
281 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
282 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
284 /* Single statements are always kept in their original order. */
285 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
286 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
287 return true;
289 /* If there is a loop invariant read involved we might vectorize it in
290 the prologue, breaking scalar order with respect to the in-loop store. */
291 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
292 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
293 return false;
295 /* STMT_A and STMT_B belong to overlapping groups. All loads are
296 emitted at the position of the first scalar load.
297 Stores in a group are emitted at the position of the last scalar store.
298 Compute that position and check whether the resulting order matches
299 the current one. */
300 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
301 if (il_a)
303 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 il_a = get_later_stmt (il_a, s);
307 else /* DR_IS_READ */
308 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
309 s = DR_GROUP_NEXT_ELEMENT (s))
310 if (get_later_stmt (il_a, s) == il_a)
311 il_a = s;
313 else
314 il_a = stmtinfo_a;
315 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
316 if (il_b)
318 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 il_b = get_later_stmt (il_b, s);
322 else /* DR_IS_READ */
323 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
324 s = DR_GROUP_NEXT_ELEMENT (s))
325 if (get_later_stmt (il_b, s) == il_b)
326 il_b = s;
328 else
329 il_b = stmtinfo_b;
330 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
331 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
334 /* A subroutine of vect_analyze_data_ref_dependence. Handle
335 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
336 distances. These distances are conservatively correct but they don't
337 reflect a guaranteed dependence.
339 Return true if this function does all the work necessary to avoid
340 an alias or false if the caller should use the dependence distances
341 to limit the vectorization factor in the usual way. LOOP_DEPTH is
342 the depth of the loop described by LOOP_VINFO and the other arguments
343 are as for vect_analyze_data_ref_dependence. */
345 static bool
346 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
347 loop_vec_info loop_vinfo,
348 int loop_depth, unsigned int *max_vf)
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
353 int dist = dist_v[loop_depth];
354 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
356 /* If the user asserted safelen >= DIST consecutive iterations
357 can be executed concurrently, assume independence.
359 ??? An alternative would be to add the alias check even
360 in this case, and vectorize the fallback loop with the
361 maximum VF set to safelen. However, if the user has
362 explicitly given a length, it's less likely that that
363 would be a win. */
364 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 continue;
372 /* For dependence distances of 2 or more, we have the option
373 of limiting VF or checking for an alias at runtime.
374 Prefer to check at runtime if we can, to avoid limiting
375 the VF unnecessarily when the bases are in fact independent.
377 Note that the alias checks will be removed if the VF ends up
378 being small enough. */
379 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
380 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
381 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
382 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
383 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
386 return true;
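/* Illustrative example (a sketch, not part of this file): loop->safelen,
   used above and by the apply_safelen lambda below, is typically set from
   an OpenMP simd pragma:

     void
     shift (int *a, int n)
     {
     #pragma omp simd safelen(8)
       for (int i = 0; i < n; i++)
         a[i] = a[i + 8] + 1;
     }

   Here the user asserts that up to 8 consecutive iterations may execute
   concurrently, so a dependence distance of 8 does not force the
   vectorization factor below that bound.  */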
390 /* Function vect_analyze_data_ref_dependence.
392 FIXME: I needed to change the sense of the returned flag.
394 Return FALSE if there (might) exist a dependence between a memory-reference
395 DRA and a memory-reference DRB. When versioning for alias can check the
396 dependence at run-time, return TRUE. Adjust *MAX_VF according to
397 the data dependence. */
399 static opt_result
400 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
401 loop_vec_info loop_vinfo,
402 unsigned int *max_vf)
404 unsigned int i;
405 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
406 struct data_reference *dra = DDR_A (ddr);
407 struct data_reference *drb = DDR_B (ddr);
408 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
409 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
410 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
411 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
412 lambda_vector dist_v;
413 unsigned int loop_depth;
415 /* If user asserted safelen consecutive iterations can be
416 executed concurrently, assume independence. */
417 auto apply_safelen = [&]()
419 if (loop->safelen >= 2)
421 if ((unsigned int) loop->safelen < *max_vf)
422 *max_vf = loop->safelen;
423 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
424 return true;
426 return false;
429 /* In loop analysis all data references should be vectorizable. */
430 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
431 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
432 gcc_unreachable ();
434 /* Independent data accesses. */
435 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
436 return opt_result::success ();
438 if (dra == drb
439 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
440 return opt_result::success ();
442 /* We do not have to consider dependences between accesses that belong
443 to the same group, unless the stride could be smaller than the
444 group size. */
445 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
446 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
447 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
448 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
449 return opt_result::success ();
451 /* Even if we have an anti-dependence then, as the vectorized loop covers at
452 least two scalar iterations, there is always also a true dependence.
453 As the vectorizer does not re-order loads and stores we can ignore
454 the anti-dependence if TBAA can disambiguate both DRs similar to the
455 case with known negative distance anti-dependences (positive
456 distance anti-dependences would violate TBAA constraints). */
457 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
458 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
459 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
460 get_alias_set (DR_REF (drb))))
461 return opt_result::success ();
463 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
464 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
466 if (apply_safelen ())
467 return opt_result::success ();
469 return opt_result::failure_at
470 (stmtinfo_a->stmt,
471 "possible alias involving gather/scatter between %T and %T\n",
472 DR_REF (dra), DR_REF (drb));
475 /* Unknown data dependence. */
476 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
478 if (apply_safelen ())
479 return opt_result::success ();
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
483 "versioning for alias required: "
484 "can't determine dependence between %T and %T\n",
485 DR_REF (dra), DR_REF (drb));
487 /* Add to list of ddrs that need to be tested at run-time. */
488 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
491 /* Known data dependence. */
492 if (DDR_NUM_DIST_VECTS (ddr) == 0)
494 if (apply_safelen ())
495 return opt_result::success ();
497 if (dump_enabled_p ())
498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
499 "versioning for alias required: "
500 "bad dist vector for %T and %T\n",
501 DR_REF (dra), DR_REF (drb));
502 /* Add to list of ddrs that need to be tested at run-time. */
503 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
506 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
508 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
509 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
510 loop_depth, max_vf))
511 return opt_result::success ();
513 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
515 int dist = dist_v[loop_depth];
517 if (dump_enabled_p ())
518 dump_printf_loc (MSG_NOTE, vect_location,
519 "dependence distance = %d.\n", dist);
521 if (dist == 0)
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "dependence distance == 0 between %T and %T\n",
526 DR_REF (dra), DR_REF (drb));
528 /* When we perform grouped accesses and perform implicit CSE
529 by detecting equal accesses and doing disambiguation with
530 runtime alias tests like for
531 .. = a[i];
532 .. = a[i+1];
533 a[i] = ..;
534 a[i+1] = ..;
535 *p = ..;
536 .. = a[i];
537 .. = a[i+1];
538 where we will end up loading { a[i], a[i+1] } once, make
539 sure that inserting group loads before the first load and
540 stores after the last store will do the right thing.
541 Similar for groups like
542 a[i] = ...;
543 ... = a[i];
544 a[i+1] = ...;
545 where loads from the group interleave with the store. */
546 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
547 return opt_result::failure_at (stmtinfo_a->stmt,
548 "READ_WRITE dependence"
549 " in interleaving.\n");
551 if (loop->safelen < 2)
553 tree indicator = dr_zero_step_indicator (dra);
554 if (!indicator || integer_zerop (indicator))
555 return opt_result::failure_at (stmtinfo_a->stmt,
556 "access also has a zero step\n");
557 else if (TREE_CODE (indicator) != INTEGER_CST)
558 vect_check_nonzero_value (loop_vinfo, indicator);
560 continue;
563 if (dist > 0 && DDR_REVERSED_P (ddr))
565 /* If DDR_REVERSED_P the order of the data-refs in DDR was
566 reversed (to make distance vector positive), and the actual
567 distance is negative. */
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "dependence distance negative.\n");
571 /* When doing outer loop vectorization, we need to check if there is
572 a backward dependence at the inner loop level if the dependence
573 at the outer loop is reversed. See PR81740. */
574 if (nested_in_vect_loop_p (loop, stmtinfo_a)
575 || nested_in_vect_loop_p (loop, stmtinfo_b))
577 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
578 DDR_LOOP_NEST (ddr));
579 if (dist_v[inner_depth] < 0)
580 return opt_result::failure_at (stmtinfo_a->stmt,
581 "not vectorized, dependence "
582 "between data-refs %T and %T\n",
583 DR_REF (dra), DR_REF (drb));
585 /* Record a negative dependence distance to later limit the
586 amount of stmt copying / unrolling we can perform.
587 Only need to handle read-after-write dependence. */
588 if (DR_IS_READ (drb)
589 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
590 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
591 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
592 continue;
595 unsigned int abs_dist = abs (dist);
596 if (abs_dist >= 2 && abs_dist < *max_vf)
598 /* The dependence distance requires reduction of the maximal
599 vectorization factor. */
600 *max_vf = abs_dist;
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "adjusting maximal vectorization factor to %i\n",
604 *max_vf);
607 if (abs_dist >= *max_vf)
609 /* Dependence distance does not create dependence, as far as
610 vectorization is concerned, in this case. */
611 if (dump_enabled_p ())
612 dump_printf_loc (MSG_NOTE, vect_location,
613 "dependence distance >= VF.\n");
614 continue;
617 return opt_result::failure_at (stmtinfo_a->stmt,
618 "not vectorized, possible dependence "
619 "between data-refs %T and %T\n",
620 DR_REF (dra), DR_REF (drb));
623 return opt_result::success ();
626 /* Function vect_analyze_early_break_dependences.
628 Examine all the data references in the loop and make sure that, if we have
629 multiple exits, we are able to safely move stores such that they become
630 safe for vectorization. The function also calculates the place where to move
631 the instructions to and computes what the new vUSE chain should be.
633 This works in tandem with the CFG that will be produced by
634 slpeel_tree_duplicate_loop_to_edge_cfg later on.
636 This function tries to validate whether an early break vectorization
637 is possible for the current instruction sequence. Returns True if
638 possible, otherwise False.
640 Requirements:
641 - Any memory access must be to a fixed size buffer.
642 - There must not be any loads and stores to the same object.
643 - Multiple loads are allowed as long as they don't alias.
645 NOTE:
646 This implementation is very conservative. Any overlapping loads/stores
647 that take place before the early break statement get rejected, aside from
648 WAR dependencies.
650 i.e.:
652 a[i] = 8
653 c = a[i]
654 if (b[i])
657 is not allowed, but
659 c = a[i]
660 a[i] = 8
661 if (b[i])
664 is, which is the common case. */
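/* Illustrative example (a sketch, not part of this file) of an early-break
   loop this analysis is meant to accept: the only in-loop memory accesses
   are reads from a fixed-size buffer and they happen before the break, so
   no store has to be moved past an aliasing load.

     int a[1024];

     int
     find (int needle, int n)
     {
       for (int i = 0; i < n; i++)
         if (a[i] == needle)
           return i;
       return -1;
     }

   Whether each read stays within the bound of the object is verified with
   ref_within_array_bound below.  */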
666 static opt_result
667 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
669 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
671 /* List of all load data references found during traversal. */
672 auto_vec<data_reference *> bases;
673 basic_block dest_bb = NULL;
675 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
676 class loop *loop_nest = loop_outer (loop);
678 if (dump_enabled_p ())
679 dump_printf_loc (MSG_NOTE, vect_location,
680 "loop contains multiple exits, analyzing"
681 " statement dependencies.\n");
683 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
684 if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "alternate exit has been chosen as main exit.\n");
688 /* Since we don't support general control flow, the location we'll move the
689 side-effects to is always the latch connected exit. When we support
690 general control flow we can do better but for now this is fine. Move
691 side-effects to the in-loop destination of the last early exit. For the
692 PEELED case we move the side-effects to the latch block as this is
693 guaranteed to be the last block to be executed when a vector iteration
694 has finished. */
695 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
696 dest_bb = loop->latch;
697 else
698 dest_bb = single_pred (loop->latch);
700 /* We start looking from dest_bb, for the non-PEELED case we don't want to
701 move any stores already present, but we do want to read and validate the
702 loads. */
703 basic_block bb = dest_bb;
705 /* We move stores across all loads to the beginning of dest_bb, so
706 the first block processed below doesn't need dependence checking. */
707 bool check_deps = false;
711 gimple_stmt_iterator gsi = gsi_last_bb (bb);
713 /* Now analyze all the remaining statements and try to determine which
714 instructions are allowed/needed to be moved. */
715 while (!gsi_end_p (gsi))
717 gimple *stmt = gsi_stmt (gsi);
718 gsi_prev (&gsi);
719 if (is_gimple_debug (stmt))
720 continue;
722 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
723 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
724 if (!dr_ref)
725 continue;
727 /* We know everything below dest_bb is safe since we know we
728 had a full vector iteration when reaching it. Either by
729 the loop entry / IV exit test being last or because this
730 is the loop latch itself. */
731 if (!check_deps)
732 continue;
734 /* Check if vector accesses to the object will be within bounds.
735 The object size must be a constant, or we assume the loop will be versioned or niters
736 bounded by VF so accesses are within range. We only need to check
737 the reads since writes are moved to a safe place where if we get
738 there we know they are safe to perform. */
739 if (DR_IS_READ (dr_ref)
740 && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "early breaks not supported: vectorization "
745 "would %s beyond size of obj.\n",
746 DR_IS_READ (dr_ref) ? "read" : "write");
747 return opt_result::failure_at (stmt,
748 "can't safely apply code motion to "
749 "dependencies of %G to vectorize "
750 "the early exit.\n", stmt);
753 if (DR_IS_READ (dr_ref))
754 bases.safe_push (dr_ref);
755 else if (DR_IS_WRITE (dr_ref))
757 /* We are moving writes down in the CFG. To be sure that this
758 is valid after vectorization we have to check all the loads
759 we are sinking the stores past to see if any of them may
760 alias or are the same object.
762 Same objects will not be an issue because unless the store
763 is marked volatile the value can be forwarded. If the
764 store is marked volatile we don't vectorize the loop
765 anyway.
767 That leaves the check for aliasing. We don't really need
768 to care about the stores aliasing with each other since the
769 stores are moved in order so the effects are still observed
770 correctly. This leaves the check for WAR dependencies
771 which we would be introducing here if the DR can alias.
772 The check is quadratic in loads/stores but I have not found
773 a better API to do this. I believe all loads and stores
774 must be checked. We also must check them when we
775 encounter the store, since we don't care about loads past
776 the store. */
778 for (auto dr_read : bases)
779 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
781 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
783 vect_location,
784 "early breaks not supported: "
785 "overlapping loads and stores "
786 "found before the break "
787 "statement.\n");
789 return opt_result::failure_at (stmt,
790 "can't safely apply code motion to dependencies"
791 " to vectorize the early exit. %G may alias with"
792 " %G\n", stmt, dr_read->stmt);
796 if (gimple_vdef (stmt))
798 if (dump_enabled_p ())
799 dump_printf_loc (MSG_NOTE, vect_location,
800 "==> recording stmt %G", stmt);
802 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
804 else if (gimple_vuse (stmt))
806 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
807 if (dump_enabled_p ())
808 dump_printf_loc (MSG_NOTE, vect_location,
809 "marked statement for vUSE update: %G", stmt);
813 if (!single_pred_p (bb))
815 gcc_assert (bb == loop->header);
816 break;
819 /* If we possibly sink through a virtual PHI make sure to elide that. */
820 if (gphi *vphi = get_virtual_phi (bb))
821 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
823 /* All earlier blocks need dependence checking. */
824 check_deps = true;
825 bb = single_pred (bb);
827 while (1);
829 /* We don't allow outer -> inner loop transitions which should have been
830 trapped already during loop form analysis. */
831 gcc_assert (dest_bb->loop_father == loop);
833 /* Check that the destination block we picked has only one pred. To relax this we
834 have to take special care when moving the statements. We don't currently support
835 such control flow; however, this check is there to simplify how we handle
836 labels that may be present anywhere in the IL. This check is to ensure that the
837 labels aren't significant for the CFG. */
838 if (!single_pred (dest_bb))
839 return opt_result::failure_at (vect_location,
840 "chosen loop exit block (BB %d) does not have a "
841 "single predecessor which is currently not "
842 "supported for early break vectorization.\n",
843 dest_bb->index);
845 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
847 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
849 /* All uses shall be updated to that of the first load. Entries are
850 stored in reverse order. */
851 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
852 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
854 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE, vect_location,
856 "will update use: %T, mem_ref: %G", vuse, g);
860 if (dump_enabled_p ())
861 dump_printf_loc (MSG_NOTE, vect_location,
862 "recorded statements to be moved to BB %d\n",
863 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
865 return opt_result::success ();
868 /* Function vect_analyze_data_ref_dependences.
870 Examine all the data references in the loop, and make sure there do not
871 exist any data dependences between them. Set *MAX_VF according to
872 the maximum vectorization factor the data dependences allow. */
874 opt_result
875 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
876 unsigned int *max_vf)
878 unsigned int i;
879 struct data_dependence_relation *ddr;
881 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
883 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
885 LOOP_VINFO_DDRS (loop_vinfo)
886 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
887 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
888 /* We do not need read-read dependences. */
889 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
890 &LOOP_VINFO_DDRS (loop_vinfo),
891 LOOP_VINFO_LOOP_NEST (loop_vinfo),
892 false);
893 gcc_assert (res);
896 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
898 /* For epilogues we either have no aliases or alias versioning
899 was applied to the original loop. Therefore we may just get max_vf
900 using the VF of the original loop. */
901 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
902 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
903 else
904 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
906 opt_result res
907 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
908 if (!res)
909 return res;
912 /* If we have early break statements in the loop, check to see if they
913 are of a form we can vectorize. */
914 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
915 return vect_analyze_early_break_dependences (loop_vinfo);
917 return opt_result::success ();
921 /* Function vect_slp_analyze_data_ref_dependence.
923 Return TRUE if there (might) exist a dependence between a memory-reference
924 DRA and a memory-reference DRB for VINFO. When versioning for alias
925 can check the dependence at run-time, return FALSE. */
928 static bool
929 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
930 struct data_dependence_relation *ddr)
932 struct data_reference *dra = DDR_A (ddr);
933 struct data_reference *drb = DDR_B (ddr);
934 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
935 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
937 /* We need to check dependences of statements marked as unvectorizable
938 as well; they can still prohibit vectorization. */
940 /* Independent data accesses. */
941 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
942 return false;
944 if (dra == drb)
945 return false;
947 /* Read-read is OK. */
948 if (DR_IS_READ (dra) && DR_IS_READ (drb))
949 return false;
951 /* If dra and drb are part of the same interleaving chain consider
952 them independent. */
953 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
954 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
955 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
956 return false;
958 /* Unknown data dependence. */
959 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
961 if (dump_enabled_p ())
962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
963 "can't determine dependence between %T and %T\n",
964 DR_REF (dra), DR_REF (drb));
966 else if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location,
968 "determined dependence between %T and %T\n",
969 DR_REF (dra), DR_REF (drb));
971 return true;
975 /* Analyze dependences involved in the transform of a store SLP NODE. */
977 static bool
978 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
980 /* This walks over all stmts involved in the SLP store done
981 in NODE verifying we can sink them up to the last stmt in the
982 group. */
983 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
984 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
986 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
988 stmt_vec_info access_info
989 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
990 if (access_info == last_access_info)
991 continue;
992 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
993 ao_ref ref;
994 bool ref_initialized_p = false;
995 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
996 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
998 gimple *stmt = gsi_stmt (gsi);
999 if (! gimple_vuse (stmt))
1000 continue;
1002 /* If we couldn't record a (single) data reference for this
1003 stmt we have to resort to the alias oracle. */
1004 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1005 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1006 if (!dr_b)
1008 /* We are moving a store - this means
1009 we cannot use TBAA for disambiguation. */
1010 if (!ref_initialized_p)
1011 ao_ref_init (&ref, DR_REF (dr_a));
1012 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
1013 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
1014 return false;
1015 continue;
1018 gcc_assert (!gimple_visited_p (stmt));
1020 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1021 dr_b, vNULL);
1022 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1023 free_dependence_relation (ddr);
1024 if (dependent)
1025 return false;
1028 return true;
1031 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1032 contain the vector of scalar stores of this instance if we are
1033 disambiguating the loads. */
1035 static bool
1036 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1037 vec<stmt_vec_info> stores,
1038 stmt_vec_info last_store_info)
1040 /* This walks over all stmts involved in the SLP load done
1041 in NODE verifying we can hoist them up to the first stmt in the
1042 group. */
1043 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1044 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1046 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1048 if (! SLP_TREE_SCALAR_STMTS (node)[k])
1049 continue;
1050 stmt_vec_info access_info
1051 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1052 if (access_info == first_access_info)
1053 continue;
1054 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1055 ao_ref ref;
1056 bool ref_initialized_p = false;
1057 hash_set<stmt_vec_info> grp_visited;
1058 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1059 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1061 gimple *stmt = gsi_stmt (gsi);
1062 if (! gimple_vdef (stmt))
1063 continue;
1065 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1067 /* If we run into a store of this same instance (we've just
1068 marked those) then delay dependence checking until we run
1069 into the last store because this is where it will have
1070 been sunk to (and we verified that we can do that already). */
1071 if (gimple_visited_p (stmt))
1073 if (stmt_info != last_store_info)
1074 continue;
1076 for (stmt_vec_info &store_info : stores)
1078 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1079 ddr_p ddr = initialize_data_dependence_relation
1080 (dr_a, store_dr, vNULL);
1081 bool dependent
1082 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1083 free_dependence_relation (ddr);
1084 if (dependent)
1085 return false;
1087 continue;
1090 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1092 /* We are hoisting a load - this means we can use TBAA for
1093 disambiguation. */
1094 if (!ref_initialized_p)
1095 ao_ref_init (&ref, DR_REF (dr_a));
1096 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1098 /* If we couldn't record a (single) data reference for this
1099 stmt we have to give up now. */
1100 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1101 if (!dr_b)
1102 return false;
1103 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1104 dr_b, vNULL);
1105 bool dependent
1106 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1107 free_dependence_relation (ddr);
1108 if (dependent)
1109 return false;
1111 /* No dependence. */
1112 return true;
1114 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1116 /* When we run into a store group we have to honor
1117 that earlier stores might be moved here. We don't
1118 know exactly which and where to since we lack a
1119 back-mapping from DR to SLP node, so assume all
1120 earlier stores are sunk here. It's enough to
1121 consider the last stmt of a group for this.
1122 ??? Both this and the fact that we disregard that
1123 the conflicting instance might be removed later
1124 is overly conservative. */
1125 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1126 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1127 store_info != NULL;
1128 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1129 if ((store_info == stmt_info
1130 || get_later_stmt (store_info, stmt_info) == stmt_info)
1131 && !check_hoist (store_info))
1132 return false;
1134 else
1136 if (!check_hoist (stmt_info))
1137 return false;
1141 return true;
1145 /* Function vect_slp_analyze_instance_dependence.
1147 Examine all the data references in SLP instance INSTANCE and make sure
1148 there do not exist any data dependences between them that would prevent
1149 the transform. Return true on success. */
1151 bool
1152 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1154 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1156 /* The stores of this instance are at the root of the SLP tree. */
1157 slp_tree store = NULL;
1158 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1159 store = SLP_INSTANCE_TREE (instance);
1161 /* Verify we can sink stores to the vectorized stmt insert location. */
1162 stmt_vec_info last_store_info = NULL;
1163 if (store)
1165 if (! vect_slp_analyze_store_dependences (vinfo, store))
1166 return false;
1168 /* Mark stores in this instance and remember the last one. */
1169 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1170 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1171 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1174 bool res = true;
1176 /* Verify we can sink loads to the vectorized stmt insert location,
1177 special-casing stores of this instance. */
1178 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1179 if (! vect_slp_analyze_load_dependences (vinfo, load,
1180 store
1181 ? SLP_TREE_SCALAR_STMTS (store)
1182 : vNULL, last_store_info))
1184 res = false;
1185 break;
1188 /* Unset the visited flag. */
1189 if (store)
1190 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1191 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1193 return res;
1196 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1197 applied. */
1200 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1202 HOST_WIDE_INT diff = 0;
1203 /* Alignment is only analyzed for the first element of a DR group,
1204 use that but adjust misalignment by the offset of the access. */
1205 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1207 dr_vec_info *first_dr
1208 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1209 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1210 INTEGER_CSTs and the first element in the group has the lowest
1211 address. */
1212 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1213 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1214 gcc_assert (diff >= 0);
1215 dr_info = first_dr;
1218 int misalign = dr_info->misalignment;
1219 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1220 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1221 return misalign;
1223 /* If the access is only aligned for a vector type with smaller alignment
1224 requirement the access has unknown misalignment. */
1225 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1226 targetm.vectorize.preferred_vector_alignment (vectype)))
1227 return DR_MISALIGNMENT_UNKNOWN;
1229 /* Apply the offset from the DR group start and the externally supplied
1230 offset which can for example result from a negative stride access. */
1231 poly_int64 misalignment = misalign + diff + offset;
1233 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1234 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1235 unsigned HOST_WIDE_INT target_alignment_c
1236 = dr_info->target_alignment.to_constant ();
1237 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1238 return DR_MISALIGNMENT_UNKNOWN;
1239 return misalign;
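/* Worked example (illustrative, not part of this file): with a constant
   target alignment of 16 bytes, a group whose first element has DR_INIT 0
   and a recorded misalignment of 8, and a group member with DR_INIT 4
   queried with OFFSET 0, the code above computes diff = 4 - 0 = 4 and
   returns (8 + 4 + 0) mod 16 = 12.  */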
1242 /* Record the base alignment guarantee given by DRB, which occurs
1243 in STMT_INFO. */
1245 static void
1246 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1247 innermost_loop_behavior *drb)
1249 bool existed;
1250 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1251 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1252 if (!existed || entry.second->base_alignment < drb->base_alignment)
1254 entry = std::make_pair (stmt_info, drb);
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_NOTE, vect_location,
1257 "recording new base alignment for %T\n"
1258 " alignment: %d\n"
1259 " misalignment: %d\n"
1260 " based on: %G",
1261 drb->base_address,
1262 drb->base_alignment,
1263 drb->base_misalignment,
1264 stmt_info->stmt);
1268 /* If the region we're going to vectorize is reached, all unconditional
1269 data references occur at least once. We can therefore pool the base
1270 alignment guarantees from each unconditional reference. Do this by
1271 going through all the data references in VINFO and checking whether
1272 the containing statement makes the reference unconditionally. If so,
1273 record the alignment of the base address in VINFO so that it can be
1274 used for all other references with the same base. */
1276 void
1277 vect_record_base_alignments (vec_info *vinfo)
1279 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1280 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1281 for (data_reference *dr : vinfo->shared->datarefs)
1283 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1284 stmt_vec_info stmt_info = dr_info->stmt;
1285 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1286 && STMT_VINFO_VECTORIZABLE (stmt_info)
1287 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1289 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1291 /* If DR is nested in the loop that is being vectorized, we can also
1292 record the alignment of the base wrt the outer loop. */
1293 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1294 vect_record_base_alignment
1295 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1300 /* Function vect_compute_data_ref_alignment
1302 Compute the misalignment of the data reference DR_INFO when vectorizing
1303 with VECTYPE.
1305 Output:
1306 1. initialized misalignment info for DR_INFO
1308 FOR NOW: No analysis is actually performed. Misalignment is calculated
1309 only for trivial cases. TODO. */
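/* Illustrative example (a sketch, not part of this file): for

     static float buf[1024];

     void
     inc (void)
     {
       for (int i = 0; i < 1024; i++)
         buf[i] += 1.0f;
     }

   the base decl BUF can be forced to the preferred vector alignment (see
   the base_misaligned handling below), so the misalignment of the access
   is known to be zero at compile time.  */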
1311 static void
1312 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1313 tree vectype)
1315 stmt_vec_info stmt_info = dr_info->stmt;
1316 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1317 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1318 class loop *loop = NULL;
1319 tree ref = DR_REF (dr_info->dr);
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_NOTE, vect_location,
1323 "vect_compute_data_ref_alignment:\n");
1325 if (loop_vinfo)
1326 loop = LOOP_VINFO_LOOP (loop_vinfo);
1328 /* Initialize misalignment to unknown. */
1329 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1331 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1332 return;
1334 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1335 bool step_preserves_misalignment_p;
1337 poly_uint64 vector_alignment
1338 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1339 BITS_PER_UNIT);
1340 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1342 /* If the main loop has peeled for alignment we have no way of knowing
1343 whether the data accesses in the epilogues are aligned. We can't at
1344 compile time answer the question whether we have entered the main loop or
1345 not. Fixes PR 92351. */
1346 if (loop_vinfo)
1348 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1349 if (orig_loop_vinfo
1350 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1351 return;
1354 unsigned HOST_WIDE_INT vect_align_c;
1355 if (!vector_alignment.is_constant (&vect_align_c))
1356 return;
1358 /* No step for BB vectorization. */
1359 if (!loop)
1361 gcc_assert (integer_zerop (drb->step));
1362 step_preserves_misalignment_p = true;
1365 else
1367 /* We can only use base and misalignment information relative to
1368 an innermost loop if the misalignment stays the same throughout the
1369 execution of the loop. As above, this is the case if the stride of
1370 the dataref is evenly divisible by the alignment. */
1371 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1372 step_preserves_misalignment_p
1373 = multiple_p (drb->step_alignment * vf, vect_align_c);
1375 if (!step_preserves_misalignment_p && dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "step doesn't divide the vector alignment.\n");
1379 /* In case the dataref is in an inner-loop of the loop that is being
1380 vectorized (LOOP), we use the base and misalignment information
1381 relative to the outer-loop (LOOP). This is ok only if the
1382 misalignment stays the same throughout the execution of the
1383 inner-loop, which is why we have to check that the stride of the
1384 dataref in the inner-loop is evenly divisible by the vector alignment. */
1385 if (step_preserves_misalignment_p
1386 && nested_in_vect_loop_p (loop, stmt_info))
1388 step_preserves_misalignment_p
1389 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1391 if (dump_enabled_p ())
1393 if (step_preserves_misalignment_p)
1394 dump_printf_loc (MSG_NOTE, vect_location,
1395 "inner step divides the vector alignment.\n");
1396 else
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "inner step doesn't divide the vector"
1399 " alignment.\n");
1404 unsigned int base_alignment = drb->base_alignment;
1405 unsigned int base_misalignment = drb->base_misalignment;
1407 /* Calculate the maximum of the pooled base address alignment and the
1408 alignment that we can compute for DR itself. */
1409 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1410 = base_alignments->get (drb->base_address);
1411 if (entry
1412 && base_alignment < (*entry).second->base_alignment
1413 && (loop_vinfo
1414 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1415 gimple_bb (entry->first->stmt))
1416 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1417 || (entry->first->dr_aux.group <= dr_info->group)))))
1419 base_alignment = entry->second->base_alignment;
1420 base_misalignment = entry->second->base_misalignment;
1423 if (drb->offset_alignment < vect_align_c
1424 || !step_preserves_misalignment_p
1425 /* We need to know whether the step wrt the vectorized loop is
1426 negative when computing the starting misalignment below. */
1427 || TREE_CODE (drb->step) != INTEGER_CST)
1429 if (dump_enabled_p ())
1430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1431 "Unknown alignment for access: %T\n", ref);
1432 return;
1435 if (base_alignment < vect_align_c)
1437 unsigned int max_alignment;
1438 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1439 if (max_alignment < vect_align_c
1440 || !vect_can_force_dr_alignment_p (base,
1441 vect_align_c * BITS_PER_UNIT))
1443 if (dump_enabled_p ())
1444 dump_printf_loc (MSG_NOTE, vect_location,
1445 "can't force alignment of ref: %T\n", ref);
1446 return;
1449 /* Force the alignment of the decl.
1450 NOTE: This is the only change to the code we make during
1451 the analysis phase, before deciding to vectorize the loop. */
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_NOTE, vect_location,
1454 "force alignment of %T\n", ref);
1456 dr_info->base_decl = base;
1457 dr_info->base_misaligned = true;
1458 base_misalignment = 0;
1460 poly_int64 misalignment
1461 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1463 unsigned int const_misalignment;
1464 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1466 if (dump_enabled_p ())
1467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1468 "Non-constant misalignment for access: %T\n", ref);
1469 return;
1472 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1474 if (dump_enabled_p ())
1475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1476 "misalign = %d bytes of ref %T\n",
1477 const_misalignment, ref);
1479 return;
1482 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1483 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1484 is made aligned via peeling. */
1486 static bool
1487 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1488 dr_vec_info *dr_peel_info)
1490 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1491 DR_TARGET_ALIGNMENT (dr_info)))
1493 poly_offset_int diff
1494 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1495 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1496 if (known_eq (diff, 0)
1497 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1498 return true;
1500 return false;
1503 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1504 aligned via peeling. */
1506 static bool
1507 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1508 dr_vec_info *dr_peel_info)
1510 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1511 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1512 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1513 DR_OFFSET (dr_peel_info->dr), 0)
1514 || !operand_equal_p (DR_STEP (dr_info->dr),
1515 DR_STEP (dr_peel_info->dr), 0))
1516 return false;
1518 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1521 /* Compute the value for dr_info->misalign so that the access appears
1522 aligned. This is used by peeling to compensate for dr_misalignment
1523 applying the offset for negative step. */
1526 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1528 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1529 return 0;
1531 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1532 poly_int64 misalignment
1533 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1534 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1536 unsigned HOST_WIDE_INT target_alignment_c;
1537 int misalign;
1538 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1539 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1540 return DR_MISALIGNMENT_UNKNOWN;
1541 return misalign;
1544 /* Function vect_update_misalignment_for_peel.
1545 Sets DR_INFO's misalignment
1546 - to 0 if it has the same alignment as DR_PEEL_INFO,
1547 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1548 - to -1 (unknown) otherwise.
1550 DR_INFO - the data reference whose misalignment is to be adjusted.
1551 DR_PEEL_INFO - the data reference whose misalignment is being made
1552 zero in the vector loop by the peel.
1553 NPEEL - the number of iterations in the peel loop if the misalignment
1554 of DR_PEEL_INFO is known at compile time. */
1556 static void
1557 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1558 dr_vec_info *dr_peel_info, int npeel)
1560 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1561 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1563 SET_DR_MISALIGNMENT (dr_info,
1564 vect_dr_misalign_for_aligned_access (dr_peel_info));
1565 return;
1568 unsigned HOST_WIDE_INT alignment;
1569 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1570 && known_alignment_for_access_p (dr_info,
1571 STMT_VINFO_VECTYPE (dr_info->stmt))
1572 && known_alignment_for_access_p (dr_peel_info,
1573 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1575 int misal = dr_info->misalignment;
1576 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1577 misal &= alignment - 1;
1578 set_dr_misalignment (dr_info, misal);
1579 return;
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1584 "to unknown (-1).\n");
1585 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
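/* Worked example (illustrative, not part of this file): with a constant
   target alignment of 16 bytes, DR_STEP of 4 bytes, a known misalignment
   of 8 and NPEEL = 2, the computation above gives
   (8 + 2 * 4) & (16 - 1) = 0, i.e. the access becomes aligned after the
   peel.  */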
1588 /* Return true if alignment is relevant for DR_INFO. */
1590 static bool
1591 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1593 stmt_vec_info stmt_info = dr_info->stmt;
1595 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1596 return false;
1598 /* For interleaving, only the alignment of the first access matters. */
1599 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1600 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1601 return false;
1603 /* Scatter-gather and invariant accesses continue to address individual
1604 scalars, so vector-level alignment is irrelevant. */
1605 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1606 || integer_zerop (DR_STEP (dr_info->dr)))
1607 return false;
1609 /* Strided accesses perform only component accesses, alignment is
1610 irrelevant for them. */
1611 if (STMT_VINFO_STRIDED_P (stmt_info)
1612 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1613 return false;
1615 return true;
1618 /* Given a memory reference EXP return whether its alignment is less
1619 than its size. */
1621 static bool
1622 not_size_aligned (tree exp)
1624 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1625 return true;
1627 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1628 > get_object_alignment (exp));
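/* Illustrative example (a sketch, not part of this file): for a 4-byte int
   member of a struct declared with __attribute__((packed)), TYPE_SIZE is
   32 bits while get_object_alignment may only guarantee 8 bits, so the
   reference is considered not size aligned.  */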
1631 /* Function vector_alignment_reachable_p
1633 Return true if vector alignment for DR_INFO is reachable by peeling
1634 a few loop iterations. Return false otherwise. */
1636 static bool
1637 vector_alignment_reachable_p (dr_vec_info *dr_info)
1639 stmt_vec_info stmt_info = dr_info->stmt;
1640 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1642 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1644 /* For interleaved accesses we peel only if the number of iterations in
1645 the prolog loop ({VF - misalignment}) is a multiple of the
1646 number of the interleaved accesses. */
1647 int elem_size, mis_in_elements;
1649 /* FORNOW: handle only known alignment. */
1650 if (!known_alignment_for_access_p (dr_info, vectype))
1651 return false;
1653 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1654 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1655 elem_size = vector_element_size (vector_size, nelements);
1656 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1658 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1659 return false;
1662 /* If the misalignment is known at compile time then allow peeling
1663 only if natural alignment is reachable through peeling. */
1664 if (known_alignment_for_access_p (dr_info, vectype)
1665 && !aligned_access_p (dr_info, vectype))
1667 HOST_WIDE_INT elmsize =
1668 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1669 if (dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE, vect_location,
1672 "data size = %wd. misalignment = %d.\n", elmsize,
1673 dr_misalignment (dr_info, vectype));
1675 if (dr_misalignment (dr_info, vectype) % elmsize)
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "data size does not divide the misalignment.\n");
1680 return false;
1684 if (!known_alignment_for_access_p (dr_info, vectype))
1686 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1687 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690 "Unknown misalignment, %snaturally aligned\n",
1691 is_packed ? "not " : "");
1692 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1695 return true;
1699 /* Calculate the cost of the memory access represented by DR_INFO. */
1701 static void
1702 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1703 dr_alignment_support alignment_support_scheme,
1704 int misalignment,
1705 unsigned int *inside_cost,
1706 unsigned int *outside_cost,
1707 stmt_vector_for_cost *body_cost_vec,
1708 stmt_vector_for_cost *prologue_cost_vec)
1710 stmt_vec_info stmt_info = dr_info->stmt;
1711 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1712 int ncopies;
1714 if (PURE_SLP_STMT (stmt_info))
1715 ncopies = 1;
1716 else
1717 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1719 if (DR_IS_READ (dr_info->dr))
1720 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1721 misalignment, true, inside_cost,
1722 outside_cost, prologue_cost_vec, body_cost_vec, false);
1723 else
1724 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1725 misalignment, inside_cost, body_cost_vec);
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "vect_get_data_access_cost: inside_cost = %d, "
1730 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1734 typedef struct _vect_peel_info
1736 dr_vec_info *dr_info;
1737 int npeel;
1738 unsigned int count;
1739 } *vect_peel_info;
1741 typedef struct _vect_peel_extended_info
1743 vec_info *vinfo;
1744 struct _vect_peel_info peel_info;
1745 unsigned int inside_cost;
1746 unsigned int outside_cost;
1747 } *vect_peel_extended_info;
1750 /* Peeling hashtable helpers. */
1752 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1754 static inline hashval_t hash (const _vect_peel_info *);
1755 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1758 inline hashval_t
1759 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1761 return (hashval_t) peel_info->npeel;
1764 inline bool
1765 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1767 return (a->npeel == b->npeel);
1771 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1773 static void
1774 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1775 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1776 int npeel, bool supportable_if_not_aligned)
1778 struct _vect_peel_info elem, *slot;
1779 _vect_peel_info **new_slot;
1781 elem.npeel = npeel;
1782 slot = peeling_htab->find (&elem);
1783 if (slot)
1784 slot->count++;
1785 else
1787 slot = XNEW (struct _vect_peel_info);
1788 slot->npeel = npeel;
1789 slot->dr_info = dr_info;
1790 slot->count = 1;
1791 new_slot = peeling_htab->find_slot (slot, INSERT);
1792 *new_slot = slot;
1795 /* If this DR is not supported with unknown misalignment then bias
1796 this slot when the cost model is disabled. */
1797 if (!supportable_if_not_aligned
1798 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1799 slot->count += VECT_MAX_COST;
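/* Sketch of how the table is used (illustrative, not from the original
   source): inserting npeel = 2 for three different data references
   leaves one slot with count = 3.  If one of those references cannot be
   vectorized when unaligned and the cost model is unlimited, the slot is
   additionally biased by VECT_MAX_COST so that the "most frequent"
   traversal below strongly prefers a peeling amount that aligns it.  */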
1803 /* Traverse the peeling hash table to find the peeling option that aligns
1804 the maximum number of data accesses. */
1807 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1808 _vect_peel_extended_info *max)
1810 vect_peel_info elem = *slot;
1812 if (elem->count > max->peel_info.count
1813 || (elem->count == max->peel_info.count
1814 && max->peel_info.npeel > elem->npeel))
1816 max->peel_info.npeel = elem->npeel;
1817 max->peel_info.count = elem->count;
1818 max->peel_info.dr_info = elem->dr_info;
1821 return 1;
1824 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1825 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1826 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1827 after peeling. */
1829 static void
1830 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1831 dr_vec_info *dr0_info,
1832 unsigned int *inside_cost,
1833 unsigned int *outside_cost,
1834 stmt_vector_for_cost *body_cost_vec,
1835 stmt_vector_for_cost *prologue_cost_vec,
1836 unsigned int npeel)
1838 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1840 bool dr0_alignment_known_p
1841 = (dr0_info
1842 && known_alignment_for_access_p (dr0_info,
1843 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1845 for (data_reference *dr : datarefs)
1847 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1848 if (!vect_relevant_for_alignment_p (dr_info))
1849 continue;
1851 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1852 dr_alignment_support alignment_support_scheme;
1853 int misalignment;
1854 unsigned HOST_WIDE_INT alignment;
1856 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1857 size_zero_node) < 0;
1858 poly_int64 off = 0;
1859 if (negative)
1860 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1861 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1863 if (npeel == 0)
1864 misalignment = dr_misalignment (dr_info, vectype, off);
1865 else if (dr_info == dr0_info
1866 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1867 misalignment = 0;
1868 else if (!dr0_alignment_known_p
1869 || !known_alignment_for_access_p (dr_info, vectype)
1870 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1871 misalignment = DR_MISALIGNMENT_UNKNOWN;
1872 else
1874 misalignment = dr_misalignment (dr_info, vectype, off);
1875 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1876 misalignment &= alignment - 1;
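/* Worked example (illustrative, not from the original source): another
   data reference with DR_STEP = 4, a current misalignment of 4 bytes
   and a 16-byte target alignment ends up with (4 + 3 * 4) & 15 == 0
   after peeling npeel = 3 iterations, i.e. it happens to become aligned
   as well.  */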
1878 alignment_support_scheme
1879 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1880 misalignment);
1882 vect_get_data_access_cost (loop_vinfo, dr_info,
1883 alignment_support_scheme, misalignment,
1884 inside_cost, outside_cost,
1885 body_cost_vec, prologue_cost_vec);
1889 /* Traverse peeling hash table and calculate cost for each peeling option.
1890 Find the one with the lowest cost. */
1893 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1894 _vect_peel_extended_info *min)
1896 vect_peel_info elem = *slot;
1897 int dummy;
1898 unsigned int inside_cost = 0, outside_cost = 0;
1899 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1900 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1901 epilogue_cost_vec;
1903 prologue_cost_vec.create (2);
1904 body_cost_vec.create (2);
1905 epilogue_cost_vec.create (2);
1907 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1908 &outside_cost, &body_cost_vec,
1909 &prologue_cost_vec, elem->npeel);
1911 body_cost_vec.release ();
1913 outside_cost += vect_get_known_peeling_cost
1914 (loop_vinfo, elem->npeel, &dummy,
1915 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1916 &prologue_cost_vec, &epilogue_cost_vec);
1918 /* Prologue and epilogue costs are added to the target model later.
1919 These costs depend only on the scalar iteration cost, the
1920 number of peeling iterations finally chosen, and the number of
1921 misaligned statements. So discard the information found here. */
1922 prologue_cost_vec.release ();
1923 epilogue_cost_vec.release ();
1925 if (inside_cost < min->inside_cost
1926 || (inside_cost == min->inside_cost
1927 && outside_cost < min->outside_cost))
1929 min->inside_cost = inside_cost;
1930 min->outside_cost = outside_cost;
1931 min->peel_info.dr_info = elem->dr_info;
1932 min->peel_info.npeel = elem->npeel;
1933 min->peel_info.count = elem->count;
1936 return 1;
1940 /* Choose best peeling option by traversing peeling hash table and either
1941 choosing an option with the lowest cost (if cost model is enabled) or the
1942 option that aligns as many accesses as possible. */
1944 static struct _vect_peel_extended_info
1945 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1946 loop_vec_info loop_vinfo)
1948 struct _vect_peel_extended_info res;
1950 res.peel_info.dr_info = NULL;
1951 res.vinfo = loop_vinfo;
1953 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1955 res.inside_cost = INT_MAX;
1956 res.outside_cost = INT_MAX;
1957 peeling_htab->traverse <_vect_peel_extended_info *,
1958 vect_peeling_hash_get_lowest_cost> (&res);
1960 else
1962 res.peel_info.count = 0;
1963 peeling_htab->traverse <_vect_peel_extended_info *,
1964 vect_peeling_hash_get_most_frequent> (&res);
1965 res.inside_cost = 0;
1966 res.outside_cost = 0;
1969 return res;
1972 /* Return true if peeling by NPEEL iterations is supported. */
1974 static bool
1975 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1976 unsigned npeel)
1978 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1979 enum dr_alignment_support supportable_dr_alignment;
1981 bool dr0_alignment_known_p
1982 = known_alignment_for_access_p (dr0_info,
1983 STMT_VINFO_VECTYPE (dr0_info->stmt));
1985 /* Ensure that all data refs can be vectorized after the peel. */
1986 for (data_reference *dr : datarefs)
1988 if (dr == dr0_info->dr)
1989 continue;
1991 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1992 if (!vect_relevant_for_alignment_p (dr_info)
1993 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1994 continue;
1996 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1997 int misalignment;
1998 unsigned HOST_WIDE_INT alignment;
1999 if (!dr0_alignment_known_p
2000 || !known_alignment_for_access_p (dr_info, vectype)
2001 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
2002 misalignment = DR_MISALIGNMENT_UNKNOWN;
2003 else
2005 misalignment = dr_misalignment (dr_info, vectype);
2006 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2007 misalignment &= alignment - 1;
2009 supportable_dr_alignment
2010 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2011 misalignment);
2012 if (supportable_dr_alignment == dr_unaligned_unsupported)
2013 return false;
2016 return true;
2019 /* Compare two data-references DRA and DRB to group them into chunks
2020 with related alignment. */
2022 static int
2023 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2025 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2026 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2027 int cmp;
2029 /* Stabilize sort. */
2030 if (dra == drb)
2031 return 0;
2033 /* Ordering of DRs according to base. */
2034 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2035 DR_BASE_ADDRESS (drb));
2036 if (cmp != 0)
2037 return cmp;
2039 /* And according to DR_OFFSET. */
2040 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2041 if (cmp != 0)
2042 return cmp;
2044 /* And after step. */
2045 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2046 if (cmp != 0)
2047 return cmp;
2049 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2050 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2051 if (cmp == 0)
2052 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2053 return cmp;
2056 /* Function vect_enhance_data_refs_alignment
2058 This pass will use loop versioning and loop peeling in order to enhance
2059 the alignment of data references in the loop.
2061 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2062 original loop is to be vectorized. Any other loops that are created by
2063 the transformations performed in this pass are not supposed to be
2064 vectorized. This restriction will be relaxed.
2066 This pass will require a cost model to guide it in deciding whether to apply
2067 peeling or versioning or a combination of the two. For example, the scheme that
2068 Intel uses when given a loop with several memory accesses is as follows:
2069 choose one memory access ('p') whose alignment you want to force by doing
2070 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2071 other accesses are not necessarily aligned, or (2) use loop versioning to
2072 generate one loop in which all accesses are aligned, and another loop in
2073 which only 'p' is necessarily aligned.
2075 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2076 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2077 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2079 Devising a cost model is the most critical aspect of this work. It will
2080 guide us on which access to peel for, whether to use loop versioning, how
2081 many versions to create, etc. The cost model will probably consist of
2082 generic considerations as well as target specific considerations (on
2083 powerpc for example, misaligned stores are more painful than misaligned
2084 loads).
2086 Here are the general steps involved in alignment enhancements:
2088 -- original loop, before alignment analysis:
2089 for (i=0; i<N; i++){
2090 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2091 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2094 -- After vect_compute_data_refs_alignment:
2095 for (i=0; i<N; i++){
2096 x = q[i]; # DR_MISALIGNMENT(q) = 3
2097 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2100 -- Possibility 1: we do loop versioning:
2101 if (p is aligned) {
2102 for (i=0; i<N; i++){ # loop 1A
2103 x = q[i]; # DR_MISALIGNMENT(q) = 3
2104 p[i] = y; # DR_MISALIGNMENT(p) = 0
2107 else {
2108 for (i=0; i<N; i++){ # loop 1B
2109 x = q[i]; # DR_MISALIGNMENT(q) = 3
2110 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2114 -- Possibility 2: we do loop peeling:
2115 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2116 x = q[i];
2117 p[i] = y;
2119 for (i = 3; i < N; i++){ # loop 2A
2120 x = q[i]; # DR_MISALIGNMENT(q) = 0
2121 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2124 -- Possibility 3: combination of loop peeling and versioning:
2125 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2126 x = q[i];
2127 p[i] = y;
2129 if (p is aligned) {
2130 for (i = 3; i<N; i++){ # loop 3A
2131 x = q[i]; # DR_MISALIGNMENT(q) = 0
2132 p[i] = y; # DR_MISALIGNMENT(p) = 0
2135 else {
2136 for (i = 3; i<N; i++){ # loop 3B
2137 x = q[i]; # DR_MISALIGNMENT(q) = 0
2138 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2142 These loops are later passed to loop_transform to be vectorized. The
2143 vectorizer will use the alignment information to guide the transformation
2144 (whether to generate regular loads/stores, or with special handling for
2145 misalignment). */
2147 opt_result
2148 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2150 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2151 dr_vec_info *first_store = NULL;
2152 dr_vec_info *dr0_info = NULL;
2153 struct data_reference *dr;
2154 unsigned int i;
2155 bool do_peeling = false;
2156 bool do_versioning = false;
2157 unsigned int npeel = 0;
2158 bool one_misalignment_known = false;
2159 bool one_misalignment_unknown = false;
2160 bool one_dr_unsupportable = false;
2161 dr_vec_info *unsupportable_dr_info = NULL;
2162 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2163 hash_table<peel_info_hasher> peeling_htab (1);
2165 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2167 /* Reset data so we can safely be called multiple times. */
2168 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2169 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2171 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2172 return opt_result::success ();
2174 /* Sort the vector of datarefs so DRs that have the same or dependent
2175 alignment are next to each other. */
2176 auto_vec<data_reference_p> datarefs
2177 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2178 datarefs.qsort (dr_align_group_sort_cmp);
2180 /* For each dataref, compute the number of DRs that become aligned
2181 when we peel so that this dataref becomes aligned. */
2182 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2183 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2184 unsigned i0;
2185 for (i0 = 0; i0 < datarefs.length (); ++i0)
2186 if (DR_BASE_ADDRESS (datarefs[i0]))
2187 break;
2188 for (i = i0 + 1; i <= datarefs.length (); ++i)
2190 if (i == datarefs.length ()
2191 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2192 DR_BASE_ADDRESS (datarefs[i]), 0)
2193 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2194 DR_OFFSET (datarefs[i]), 0)
2195 || !operand_equal_p (DR_STEP (datarefs[i0]),
2196 DR_STEP (datarefs[i]), 0))
2198 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2199 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2200 will get known misalignment if we align one of the refs
2201 with the largest DR_TARGET_ALIGNMENT. */
2202 for (unsigned j = i0; j < i; ++j)
2204 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2205 for (unsigned k = i0; k < i; ++k)
2207 if (k == j)
2208 continue;
2209 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2210 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2211 dr_infoj))
2212 n_same_align_refs[j]++;
2215 i0 = i;
2219 /* While cost model enhancements are expected in the future, the high level
2220 view of the code at this time is as follows:
2222 A) If there is a misaligned access then see if peeling to align
2223 this access can make all data references satisfy
2224 vect_supportable_dr_alignment. If so, update data structures
2225 as needed and return true.
2227 B) If peeling wasn't possible and there is a data reference with an
2228 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2229 then see if loop versioning checks can be used to make all data
2230 references satisfy vect_supportable_dr_alignment. If so, update
2231 data structures as needed and return true.
2233 C) If neither peeling nor versioning were successful then return false if
2234 any data reference does not satisfy vect_supportable_dr_alignment.
2236 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2238 Note, Possibility 3 above (which is peeling and versioning together) is not
2239 being done at this time. */
2241 /* (1) Peeling to force alignment. */
2243 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2244 Considerations:
2245 + How many accesses will become aligned due to the peeling
2246 - How many accesses will become unaligned due to the peeling,
2247 and the cost of misaligned accesses.
2248 - The cost of peeling (the extra runtime checks, the increase
2249 in code size). */
2251 FOR_EACH_VEC_ELT (datarefs, i, dr)
2253 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2254 if (!vect_relevant_for_alignment_p (dr_info))
2255 continue;
2257 stmt_vec_info stmt_info = dr_info->stmt;
2258 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2259 do_peeling = vector_alignment_reachable_p (dr_info);
2260 if (do_peeling)
2262 if (known_alignment_for_access_p (dr_info, vectype))
2264 unsigned int npeel_tmp = 0;
2265 bool negative = tree_int_cst_compare (DR_STEP (dr),
2266 size_zero_node) < 0;
1268 /* If known_alignment_for_access_p then we have set
1269 DR_MISALIGNMENT, which is only done if we know it at compile
1270 time, so it is safe to assume the target alignment is constant. */
2272 unsigned int target_align =
2273 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2274 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2275 poly_int64 off = 0;
2276 if (negative)
2277 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2278 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2279 mis = negative ? mis : -mis;
2280 if (mis != 0)
2281 npeel_tmp = (mis & (target_align - 1)) / dr_size;
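/* Worked example (illustrative, not from the original source): for a
   forward access with dr_size = 4, target_align = 16 and a misalignment
   of 12 bytes, (-12) & 15 == 4, so npeel_tmp = 4 / 4 = 1; peeling one
   scalar iteration advances the address by 4 bytes and makes it
   16-byte aligned.  */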
2283 /* For multiple types, it is possible that the bigger type access
2284 will have more than one peeling option. E.g., a loop with two
2285 types: one of size (vector size / 4), and the other one of
2286 size (vector size / 8). The vectorization factor will be 8. If both
2287 accesses are misaligned by 3, the first one needs one scalar
2288 iteration to be aligned, and the second one needs 5. But the
2289 first one will also be aligned by peeling 5 scalar
2290 iterations, and in that case both accesses will be aligned.
2291 Hence, in addition to the immediate peeling amount, we also want
2292 to try adding a full vector size, as long as we don't exceed the
2293 vectorization factor.
2294 We do this automatically when the cost model is used, since we
2295 calculate the cost for every peeling option. */
2296 poly_uint64 nscalars = npeel_tmp;
2297 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2299 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2300 unsigned group_size = 1;
2301 if (STMT_SLP_TYPE (stmt_info)
2302 && STMT_VINFO_GROUPED_ACCESS (stmt_info))
2303 group_size = DR_GROUP_SIZE (stmt_info);
2304 nscalars = vf * group_size;
2307 /* Save info about DR in the hash table. Also include peeling
2308 amounts according to the explanation above. Indicate
2309 the alignment status when the ref is not aligned.
2310 ??? Rather than using unknown alignment here we should
2311 prune all entries from the peeling hashtable which cause
2312 DRs to be not supported. */
2313 bool supportable_if_not_aligned
2314 = vect_supportable_dr_alignment
2315 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2316 while (known_le (npeel_tmp, nscalars))
2318 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2319 dr_info, npeel_tmp,
2320 supportable_if_not_aligned);
2321 npeel_tmp += MAX (1, target_align / dr_size);
2324 one_misalignment_known = true;
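/* Illustrative sketch (not from the original source): with an immediate
   peeling amount of 1, target_align / dr_size = 16 / 4 = 4, and an
   unlimited cost model where VF = 8 and group_size = 1, the loop above
   records the candidate peeling amounts 1 and 5 in the hash table.
   With the cost model enabled, nscalars stays equal to npeel_tmp and
   only the single immediate amount is recorded.  */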
2326 else
2328 /* If we don't know any misalignment values, we prefer
2329 peeling for the data-ref that has the maximum number of data-refs
2330 with the same alignment, unless the target prefers to align
2331 stores over loads. */
2332 unsigned same_align_drs = n_same_align_refs[i];
2333 if (!dr0_info
2334 || dr0_same_align_drs < same_align_drs)
2336 dr0_same_align_drs = same_align_drs;
2337 dr0_info = dr_info;
2339 /* For data-refs with the same number of related
2340 accesses prefer the one where the misalign
2341 computation will be invariant in the outermost loop. */
2342 else if (dr0_same_align_drs == same_align_drs)
2344 class loop *ivloop0, *ivloop;
2345 ivloop0 = outermost_invariant_loop_for_expr
2346 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2347 ivloop = outermost_invariant_loop_for_expr
2348 (loop, DR_BASE_ADDRESS (dr));
2349 if ((ivloop && !ivloop0)
2350 || (ivloop && ivloop0
2351 && flow_loop_nested_p (ivloop, ivloop0)))
2352 dr0_info = dr_info;
2355 one_misalignment_unknown = true;
2357 /* Check for data refs with unsupportable alignment that
2358 can be peeled. */
2359 enum dr_alignment_support supportable_dr_alignment
2360 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2361 DR_MISALIGNMENT_UNKNOWN);
2362 if (supportable_dr_alignment == dr_unaligned_unsupported)
2364 one_dr_unsupportable = true;
2365 unsupportable_dr_info = dr_info;
2368 if (!first_store && DR_IS_WRITE (dr))
2370 first_store = dr_info;
2371 first_store_same_align_drs = same_align_drs;
2375 else
2377 if (!aligned_access_p (dr_info, vectype))
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "vector alignment may not be reachable\n");
2382 break;
2387 /* Check if we can possibly peel the loop. */
2388 if (!vect_can_advance_ivs_p (loop_vinfo)
2389 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2390 loop_preheader_edge (loop))
2391 || loop->inner
2392 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2393 do_peeling = false;
2395 struct _vect_peel_extended_info peel_for_known_alignment;
2396 struct _vect_peel_extended_info peel_for_unknown_alignment;
2397 struct _vect_peel_extended_info best_peel;
2399 peel_for_unknown_alignment.inside_cost = INT_MAX;
2400 peel_for_unknown_alignment.outside_cost = INT_MAX;
2401 peel_for_unknown_alignment.peel_info.count = 0;
2403 if (do_peeling
2404 && one_misalignment_unknown)
2406 /* Check if the target wants us to prefer stores over loads, i.e., if
2407 misaligned stores are more expensive than misaligned loads (taking
2408 drs with the same alignment into account). */
2409 unsigned int load_inside_cost = 0;
2410 unsigned int load_outside_cost = 0;
2411 unsigned int store_inside_cost = 0;
2412 unsigned int store_outside_cost = 0;
2413 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2415 stmt_vector_for_cost dummy;
2416 dummy.create (2);
2417 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2418 &load_inside_cost,
2419 &load_outside_cost,
2420 &dummy, &dummy, estimated_npeels);
2421 dummy.release ();
2423 if (first_store)
2425 dummy.create (2);
2426 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2427 &store_inside_cost,
2428 &store_outside_cost,
2429 &dummy, &dummy,
2430 estimated_npeels);
2431 dummy.release ();
2433 else
2435 store_inside_cost = INT_MAX;
2436 store_outside_cost = INT_MAX;
2439 if (load_inside_cost > store_inside_cost
2440 || (load_inside_cost == store_inside_cost
2441 && load_outside_cost > store_outside_cost))
2443 dr0_info = first_store;
2444 dr0_same_align_drs = first_store_same_align_drs;
2445 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2446 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2448 else
2450 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2451 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2454 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2455 prologue_cost_vec.create (2);
2456 epilogue_cost_vec.create (2);
2458 int dummy2;
2459 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2460 (loop_vinfo, estimated_npeels, &dummy2,
2461 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2462 &prologue_cost_vec, &epilogue_cost_vec);
2464 prologue_cost_vec.release ();
2465 epilogue_cost_vec.release ();
2467 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
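/* Illustrative sketch (not from the original source): with VF = 8 the
   costs above are estimated for peeling VF / 2 = 4 iterations.  If
   peeling for the first store is cheaper inside the loop than peeling
   for the reference chosen so far (or equally cheap inside but cheaper
   outside), dr0_info is switched to that store.  */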
2470 peel_for_unknown_alignment.peel_info.npeel = 0;
2471 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2473 best_peel = peel_for_unknown_alignment;
2475 peel_for_known_alignment.inside_cost = INT_MAX;
2476 peel_for_known_alignment.outside_cost = INT_MAX;
2477 peel_for_known_alignment.peel_info.count = 0;
2478 peel_for_known_alignment.peel_info.dr_info = NULL;
2480 if (do_peeling && one_misalignment_known)
2482 /* Peeling is possible, but no data access requires alignment in order
2483 to be supported. So we try to choose the best possible peeling from
2484 the hash table. */
2485 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2486 (&peeling_htab, loop_vinfo);
2489 /* Compare costs of peeling for known and unknown alignment. */
2490 if (peel_for_known_alignment.peel_info.dr_info != NULL
2491 && peel_for_unknown_alignment.inside_cost
2492 >= peel_for_known_alignment.inside_cost)
2494 best_peel = peel_for_known_alignment;
2496 /* If the best peeling for known alignment has NPEEL == 0, perform no
2497 peeling at all except if there is an unsupportable dr that we can
2498 align. */
2499 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2500 do_peeling = false;
2503 /* If there is an unsupportable data ref, prefer this over all choices so far
2504 since we'd have to discard a chosen peeling except when it accidentally
2505 aligned the unsupportable data ref. */
2506 if (one_dr_unsupportable)
2507 dr0_info = unsupportable_dr_info;
2508 else if (do_peeling)
2510 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2511 TODO: Use nopeel_outside_cost or get rid of it? */
2512 unsigned nopeel_inside_cost = 0;
2513 unsigned nopeel_outside_cost = 0;
2515 stmt_vector_for_cost dummy;
2516 dummy.create (2);
2517 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2518 &nopeel_outside_cost, &dummy, &dummy, 0);
2519 dummy.release ();
2521 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2522 costs will be recorded. */
2523 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2524 prologue_cost_vec.create (2);
2525 epilogue_cost_vec.create (2);
2527 int dummy2;
2528 nopeel_outside_cost += vect_get_known_peeling_cost
2529 (loop_vinfo, 0, &dummy2,
2530 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2531 &prologue_cost_vec, &epilogue_cost_vec);
2533 prologue_cost_vec.release ();
2534 epilogue_cost_vec.release ();
2536 npeel = best_peel.peel_info.npeel;
2537 dr0_info = best_peel.peel_info.dr_info;
2539 /* If not peeling is no more expensive than the best peeling we
2540 have found so far, don't perform any peeling. */
2541 if (nopeel_inside_cost <= best_peel.inside_cost)
2542 do_peeling = false;
2545 if (do_peeling)
2547 stmt_vec_info stmt_info = dr0_info->stmt;
2548 if (known_alignment_for_access_p (dr0_info,
2549 STMT_VINFO_VECTYPE (stmt_info)))
2551 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2552 size_zero_node) < 0;
2553 if (!npeel)
2555 /* Since it's known at compile time, compute the number of
2556 iterations in the peeled loop (the peeling factor) for use in
2557 updating DR_MISALIGNMENT values. The peeling factor is the
2558 vectorization factor minus the misalignment as an element
2559 count. */
2560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2561 poly_int64 off = 0;
2562 if (negative)
2563 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2564 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2565 unsigned int mis
2566 = dr_misalignment (dr0_info, vectype, off);
2567 mis = negative ? mis : -mis;
2568 /* If known_alignment_for_access_p then we have set
2569 DR_MISALIGNMENT, which is only done if we know it at compile
2570 time, so it is safe to assume the target alignment is constant. */
2572 unsigned int target_align =
2573 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2574 npeel = ((mis & (target_align - 1))
2575 / vect_get_scalar_dr_size (dr0_info));
2578 /* For interleaved data access every iteration accesses all the
2579 members of the group, therefore we divide the number of iterations
2580 by the group size. */
2581 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2582 npeel /= DR_GROUP_SIZE (stmt_info);
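/* Worked example (illustrative, not from the original source): with a
   16-byte target alignment, a scalar access size of 4 bytes and a known
   misalignment of 8 bytes, (-8) & 15 == 8 gives npeel = 2; if dr0 leads
   an interleaving group of size 2, each scalar iteration covers both
   members, so the final npeel is 1.  */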
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "Try peeling by %d\n", npeel);
2589 /* Ensure that all datarefs can be vectorized after the peel. */
2590 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2591 do_peeling = false;
2593 /* Check if all datarefs are supportable and log. */
2594 if (do_peeling
2595 && npeel == 0
2596 && known_alignment_for_access_p (dr0_info,
2597 STMT_VINFO_VECTYPE (stmt_info)))
2598 return opt_result::success ();
2600 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2601 if (do_peeling)
2603 unsigned max_allowed_peel
2604 = param_vect_max_peeling_for_alignment;
2605 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2606 max_allowed_peel = 0;
2607 if (max_allowed_peel != (unsigned)-1)
2609 unsigned max_peel = npeel;
2610 if (max_peel == 0)
2612 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2613 unsigned HOST_WIDE_INT target_align_c;
2614 if (target_align.is_constant (&target_align_c))
2615 max_peel =
2616 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2617 else
2619 do_peeling = false;
2620 if (dump_enabled_p ())
2621 dump_printf_loc (MSG_NOTE, vect_location,
2622 "Disable peeling, max peels set and vector"
2623 " alignment unknown\n");
2626 if (max_peel > max_allowed_peel)
2628 do_peeling = false;
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_NOTE, vect_location,
2631 "Disable peeling, max peels reached: %d\n", max_peel);
2636 /* Cost model #2 - if peeling may result in a remaining loop not
2637 iterating enough to be vectorized then do not peel. Since this
2638 is a cost heuristic rather than a correctness decision, use the
2639 most likely runtime value for variable vectorization factors. */
2640 if (do_peeling
2641 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2643 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2644 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2645 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2646 < assumed_vf + max_peel)
2647 do_peeling = false;
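/* Worked example (illustrative, not from the original source): with an
   assumed VF of 8 and npeel == 0 (runtime peeling of up to VF - 1 = 7
   iterations), a loop known to run only 12 iterations fails this check
   because 12 < 8 + 7, so peeling is abandoned.  */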
2650 if (do_peeling)
2652 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2653 If the misalignment of DR_i is identical to that of dr0 then set
2654 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2655 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2656 by the peeling factor times the element size of DR_i (MOD the
2657 vectorization factor times the size). Otherwise, the
2658 misalignment of DR_i must be set to unknown. */
2659 FOR_EACH_VEC_ELT (datarefs, i, dr)
2660 if (dr != dr0_info->dr)
2662 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2663 if (!vect_relevant_for_alignment_p (dr_info))
2664 continue;
2666 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2669 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2670 if (npeel)
2671 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2672 else
2673 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2674 SET_DR_MISALIGNMENT (dr0_info,
2675 vect_dr_misalign_for_aligned_access (dr0_info));
2676 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "Alignment of access forced using peeling.\n");
2680 dump_printf_loc (MSG_NOTE, vect_location,
2681 "Peeling for alignment will be applied.\n");
2684 /* The inside-loop cost will be accounted for in vectorizable_load
2685 and vectorizable_store correctly with adjusted alignments.
2686 Drop the body_cost_vec on the floor here. */
2687 return opt_result::success ();
2691 /* (2) Versioning to force alignment. */
2693 /* Try versioning if:
2694 1) the loop is optimized for speed and the cost model is not cheap, and
2695 2) there is at least one unsupported misaligned data ref with an unknown
2696 misalignment, and
2697 3) all misaligned data refs with a known misalignment are supported, and
2698 4) the number of runtime alignment checks is within reason. */
2700 do_versioning
2701 = (optimize_loop_nest_for_speed_p (loop)
2702 && !loop->inner /* FORNOW */
2703 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2705 if (do_versioning)
2707 FOR_EACH_VEC_ELT (datarefs, i, dr)
2709 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2710 if (!vect_relevant_for_alignment_p (dr_info))
2711 continue;
2713 stmt_vec_info stmt_info = dr_info->stmt;
2714 if (STMT_VINFO_STRIDED_P (stmt_info))
2716 do_versioning = false;
2717 break;
2720 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2721 bool negative = tree_int_cst_compare (DR_STEP (dr),
2722 size_zero_node) < 0;
2723 poly_int64 off = 0;
2724 if (negative)
2725 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2726 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2727 int misalignment;
2728 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2729 continue;
2731 enum dr_alignment_support supportable_dr_alignment
2732 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2733 misalignment);
2734 if (supportable_dr_alignment == dr_unaligned_unsupported)
2736 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2737 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2738 >= (unsigned) param_vect_max_version_for_alignment_checks))
2740 do_versioning = false;
2741 break;
2744 /* At present we don't support versioning for alignment
2745 with variable VF, since there's no guarantee that the
2746 VF is a power of two. We could relax this if we added
2747 a way of enforcing a power-of-two size. */
2748 unsigned HOST_WIDE_INT size;
2749 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2751 do_versioning = false;
2752 break;
2755 /* Forcing alignment in the first iteration is no good if
2756 we don't keep it across iterations. For now, just disable
2757 versioning in this case.
2758 ?? We could actually unroll the loop to achieve the required
2759 overall step alignment, and forcing the alignment could be
2760 done by doing some iterations of the non-vectorized loop. */
2761 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2762 * DR_STEP_ALIGNMENT (dr),
2763 DR_TARGET_ALIGNMENT (dr_info)))
2765 do_versioning = false;
2766 break;
2769 /* The rightmost bits of an aligned address must be zeros.
2770 Construct the mask needed for this test. For example,
2771 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2772 mask must be 15 = 0xf. */
2773 int mask = size - 1;
2775 /* FORNOW: use the same mask to test all potentially unaligned
2776 references in the loop. */
2777 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2778 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2780 do_versioning = false;
2781 break;
2784 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2785 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
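/* Illustrative sketch (not from the original source): for 16-byte
   vectors the mask is 0xf, and the versioned loop is guarded by a
   runtime test of roughly the form

     if (((a1 | a2 | ...) & 0xf) == 0)
       ... aligned (vectorized) version ...
     else
       ... original scalar version ...

   where a1, a2, ... stand for the addresses of the data references
   recorded in LOOP_VINFO_MAY_MISALIGN_STMTS.  */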
2789 /* Versioning requires at least one misaligned data reference. */
2790 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2791 do_versioning = false;
2792 else if (!do_versioning)
2793 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2796 if (do_versioning)
2798 const vec<stmt_vec_info> &may_misalign_stmts
2799 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2800 stmt_vec_info stmt_info;
2802 /* It can now be assumed that the data references in the statements
2803 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2804 of the loop being vectorized. */
2805 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2807 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2808 SET_DR_MISALIGNMENT (dr_info,
2809 vect_dr_misalign_for_aligned_access (dr_info));
2810 if (dump_enabled_p ())
2811 dump_printf_loc (MSG_NOTE, vect_location,
2812 "Alignment of access forced using versioning.\n");
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_NOTE, vect_location,
2817 "Versioning for alignment will be applied.\n");
2819 /* Peeling and versioning can't be done together at this time. */
2820 gcc_assert (! (do_peeling && do_versioning));
2822 return opt_result::success ();
2825 /* This point is reached if neither peeling nor versioning is being done. */
2826 gcc_assert (! (do_peeling || do_versioning));
2828 return opt_result::success ();
2832 /* Function vect_analyze_data_refs_alignment
2834 Analyze the alignment of the data-references in the loop.
2835 Return FALSE if a data reference is found that cannot be vectorized. */
2837 opt_result
2838 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2840 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2842 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2843 struct data_reference *dr;
2844 unsigned int i;
2846 vect_record_base_alignments (loop_vinfo);
2847 FOR_EACH_VEC_ELT (datarefs, i, dr)
2849 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2850 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2852 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2853 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2854 continue;
2855 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2856 STMT_VINFO_VECTYPE (dr_info->stmt));
2860 return opt_result::success ();
2864 /* Analyze alignment of DRs of stmts in NODE. */
2866 static bool
2867 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2869 /* Alignment is maintained in the first element of the group. */
2870 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2871 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2872 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2873 tree vectype = SLP_TREE_VECTYPE (node);
2874 poly_uint64 vector_alignment
2875 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2876 BITS_PER_UNIT);
2877 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2878 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2879 /* Re-analyze alignment when we're facing a vectorization with a bigger
2880 alignment requirement. */
2881 else if (known_lt (dr_info->target_alignment, vector_alignment))
2883 poly_uint64 old_target_alignment = dr_info->target_alignment;
2884 int old_misalignment = dr_info->misalignment;
2885 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2886 /* But keep knowledge about a smaller alignment. */
2887 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2888 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2890 dr_info->target_alignment = old_target_alignment;
2891 dr_info->misalignment = old_misalignment;
2894 /* If we ever face unordered target alignments the first one wins for the
2895 analysis and the others will become unknown in dr_misalignment. */
2896 return true;
2899 /* Function vect_slp_analyze_instance_alignment
2901 Analyze the alignment of the data-references in the SLP instance.
2902 Return FALSE if a data reference is found that cannot be vectorized. */
2904 bool
2905 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2906 slp_instance instance)
2908 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2910 slp_tree node;
2911 unsigned i;
2912 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2913 if (! vect_slp_analyze_node_alignment (vinfo, node))
2914 return false;
2916 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2917 && ! vect_slp_analyze_node_alignment
2918 (vinfo, SLP_INSTANCE_TREE (instance)))
2919 return false;
2921 return true;
2925 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2926 accesses of legal size, step, etc. Detect gaps, single element
2927 interleaving, and other special cases. Set grouped access info.
2928 Collect groups of strided stores for further use in SLP analysis.
2929 Worker for vect_analyze_group_access. */
2931 static bool
2932 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2934 data_reference *dr = dr_info->dr;
2935 tree step = DR_STEP (dr);
2936 tree scalar_type = TREE_TYPE (DR_REF (dr));
2937 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2938 stmt_vec_info stmt_info = dr_info->stmt;
2939 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2940 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2941 HOST_WIDE_INT dr_step = -1;
2942 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2943 bool slp_impossible = false;
2945 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2946 size of the interleaving group (including gaps). */
2947 if (tree_fits_shwi_p (step))
2949 dr_step = tree_to_shwi (step);
2950 /* Check that STEP is a multiple of type size. Otherwise there is
2951 a non-element-sized gap at the end of the group which we
2952 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2953 ??? As we can handle non-constant step fine here we should
2954 simply remove uses of DR_GROUP_GAP between the last and first
2955 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2956 simply not include that gap. */
2957 if ((dr_step % type_size) != 0)
2959 if (dump_enabled_p ())
2960 dump_printf_loc (MSG_NOTE, vect_location,
2961 "Step %T is not a multiple of the element size"
2962 " for %T\n",
2963 step, DR_REF (dr));
2964 return false;
2966 groupsize = absu_hwi (dr_step) / type_size;
2968 else
2969 groupsize = 0;
2971 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2972 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2974 /* Check if this DR is part of an interleaving group of which it is the
2975 only element accessed in the loop. */
2977 /* Gaps are supported only for loads. STEP must be a multiple of the type
2978 size. */
2979 if (DR_IS_READ (dr)
2980 && (dr_step % type_size) == 0
2981 && groupsize > 0
2982 /* This could be UINT_MAX but as we are generating code in a very
2983 inefficient way we have to cap earlier.
2984 See PR91403 for example. */
2985 && groupsize <= 4096)
2987 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2988 DR_GROUP_SIZE (stmt_info) = groupsize;
2989 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2990 if (dump_enabled_p ())
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "Detected single element interleaving %T"
2993 " step %T\n",
2994 DR_REF (dr), step);
2996 return true;
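/* Illustrative example (not from the original source):

     for (i = 0; i < n; i++)
       sum += a[4*i];        # int a[]: DR_STEP = 16, type_size = 4

   Only one element out of each group of four is read, so the load
   forms its own group with DR_GROUP_SIZE = 4 and DR_GROUP_GAP = 3.  */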
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "not consecutive access %G", stmt_info->stmt);
3003 if (bb_vinfo)
3005 /* Mark the statement as unvectorizable. */
3006 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3007 return true;
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3012 STMT_VINFO_STRIDED_P (stmt_info) = true;
3013 return true;
3016 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3018 /* First stmt in the interleaving chain. Check the chain. */
3019 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3020 struct data_reference *data_ref = dr;
3021 unsigned int count = 1;
3022 tree prev_init = DR_INIT (data_ref);
3023 HOST_WIDE_INT diff, gaps = 0;
3025 /* By construction, all group members have INTEGER_CST DR_INITs. */
3026 while (next)
3028 /* We never have the same DR multiple times. */
3029 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3030 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3032 data_ref = STMT_VINFO_DATA_REF (next);
3034 /* All group members have the same STEP by construction. */
3035 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3037 /* Check that the distance between two accesses is equal to the type
3038 size. Otherwise, we have gaps. */
3039 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3040 - TREE_INT_CST_LOW (prev_init)) / type_size;
3041 if (diff < 1 || diff > UINT_MAX)
3043 /* For artificial testcases with array accesses with large
3044 constant indices we can run into overflow issues which
3045 can end up fooling the groupsize constraint below, so
3046 check the individual gaps (which are represented as
3047 unsigned int) as well. */
3048 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3050 "interleaved access with gap larger "
3051 "than representable\n");
3052 return false;
3054 if (diff != 1)
3056 /* FORNOW: SLP of accesses with gaps is not supported. */
3057 slp_impossible = true;
3058 if (DR_IS_WRITE (data_ref))
3060 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3062 "interleaved store with gaps\n");
3063 return false;
3066 gaps += diff - 1;
3069 last_accessed_element += diff;
3071 /* Store the gap from the previous member of the group. If there is no
3072 gap in the access, DR_GROUP_GAP is always 1. */
3073 DR_GROUP_GAP (next) = diff;
3075 prev_init = DR_INIT (data_ref);
3076 next = DR_GROUP_NEXT_ELEMENT (next);
3077 /* Count the number of data-refs in the chain. */
3078 count++;
3081 if (groupsize == 0)
3082 groupsize = count + gaps;
3084 /* This could be UINT_MAX but as we are generating code in a very
3085 inefficient way we have to cap earlier. See PR78699 for example. */
3086 if (groupsize > 4096)
3088 if (dump_enabled_p ())
3089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090 "group is too large\n");
3091 return false;
3094 /* Check that the size of the interleaving is equal to count for stores,
3095 i.e., that there are no gaps. */
3096 if (groupsize != count
3097 && !DR_IS_READ (dr))
3099 groupsize = count;
3100 STMT_VINFO_STRIDED_P (stmt_info) = true;
3103 /* If there is a gap after the last load in the group it is the
3104 difference between the groupsize and the last accessed
3105 element.
3106 When there is no gap, this difference should be 0. */
3107 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3109 DR_GROUP_SIZE (stmt_info) = groupsize;
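/* Illustrative example (not from the original source):

     for (i = 0; i < n; i++)
       sum += b[3*i] + b[3*i+1];   # int b[]: DR_STEP = 12, type_size = 4

   Here groupsize = 3 and count = 2; the two loads are adjacent
   (diff = 1), and b[3*i+2] is never read, so the DR_GROUP_GAP of the
   first statement ends up as 3 - 2 = 1.  */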
3110 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "Detected interleaving ");
3114 if (DR_IS_READ (dr))
3115 dump_printf (MSG_NOTE, "load ");
3116 else if (STMT_VINFO_STRIDED_P (stmt_info))
3117 dump_printf (MSG_NOTE, "strided store ");
3118 else
3119 dump_printf (MSG_NOTE, "store ");
3120 dump_printf (MSG_NOTE, "of size %u\n",
3121 (unsigned)groupsize);
3122 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3123 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3124 while (next)
3126 if (DR_GROUP_GAP (next) != 1)
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "\t<gap of %d elements>\n",
3129 DR_GROUP_GAP (next) - 1);
3130 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3131 next = DR_GROUP_NEXT_ELEMENT (next);
3133 if (DR_GROUP_GAP (stmt_info) != 0)
3134 dump_printf_loc (MSG_NOTE, vect_location,
3135 "\t<gap of %d elements>\n",
3136 DR_GROUP_GAP (stmt_info));
3139 /* SLP: create an SLP data structure for every interleaving group of
3140 stores for further analysis in vect_analyze_slp. */
3141 if (DR_IS_WRITE (dr) && !slp_impossible)
3143 if (loop_vinfo)
3144 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3145 if (bb_vinfo)
3146 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3150 return true;
3153 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3154 accesses of legal size, step, etc. Detect gaps, single element
3155 interleaving, and other special cases. Set grouped access info.
3156 Collect groups of strided stores for further use in SLP analysis. */
3158 static bool
3159 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3161 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3163 /* Dissolve the group if present. */
3164 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3165 while (stmt_info)
3167 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3168 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3169 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3170 stmt_info = next;
3172 return false;
3174 return true;
3177 /* Analyze the access pattern of the data-reference DR_INFO.
3178 In case of non-consecutive accesses call vect_analyze_group_access() to
3179 analyze groups of accesses. */
3181 static bool
3182 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3184 data_reference *dr = dr_info->dr;
3185 tree step = DR_STEP (dr);
3186 tree scalar_type = TREE_TYPE (DR_REF (dr));
3187 stmt_vec_info stmt_info = dr_info->stmt;
3188 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3189 class loop *loop = NULL;
3191 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3192 return true;
3194 if (loop_vinfo)
3195 loop = LOOP_VINFO_LOOP (loop_vinfo);
3197 if (loop_vinfo && !step)
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3201 "bad data-ref access in loop\n");
3202 return false;
3205 /* Allow loads with zero step in inner-loop vectorization. */
3206 if (loop_vinfo && integer_zerop (step))
3208 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3209 if (!nested_in_vect_loop_p (loop, stmt_info))
3210 return DR_IS_READ (dr);
3211 /* Allow references with zero step only for outer loops marked
3212 with pragma omp simd, since that guarantees the absence of
3213 loop-carried dependencies between inner loop iterations. */
3214 if (loop->safelen < 2)
3216 if (dump_enabled_p ())
3217 dump_printf_loc (MSG_NOTE, vect_location,
3218 "zero step in inner loop of nest\n");
3219 return false;
3223 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3225 /* Interleaved accesses are not yet supported within outer-loop
3226 vectorization for references in the inner-loop. */
3227 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3229 /* For the rest of the analysis we use the outer-loop step. */
3230 step = STMT_VINFO_DR_STEP (stmt_info);
3231 if (integer_zerop (step))
3233 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_NOTE, vect_location,
3235 "zero step in outer loop.\n");
3236 return DR_IS_READ (dr);
3240 /* Consecutive? */
3241 if (TREE_CODE (step) == INTEGER_CST)
3243 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3244 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3245 || (dr_step < 0
3246 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3248 /* Mark that it is not interleaving. */
3249 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3250 return true;
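/* Illustrative example (not from the original source): an int access
   a[i] has DR_STEP = 4 == TYPE_SIZE_UNIT (int) and is treated as
   consecutive here, as is the reverse access a[n-i] with DR_STEP = -4;
   an access like a[2*i] (DR_STEP = 8) falls through to the group
   analysis below.  */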
3254 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3256 if (dump_enabled_p ())
3257 dump_printf_loc (MSG_NOTE, vect_location,
3258 "grouped access in outer loop.\n");
3259 return false;
3263 /* Assume this is a DR handled by the non-constant strided load case. */
3264 if (TREE_CODE (step) != INTEGER_CST)
3265 return (STMT_VINFO_STRIDED_P (stmt_info)
3266 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3267 || vect_analyze_group_access (vinfo, dr_info)));
3269 /* Non-consecutive access - check if it is part of an interleaving group. */
3270 return vect_analyze_group_access (vinfo, dr_info);
3273 /* Compare two data-references DRA and DRB to sort them into chunks
3274 suitable for forming interleaving groups. */
3276 static int
3277 dr_group_sort_cmp (const void *dra_, const void *drb_)
3279 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3280 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3281 data_reference_p dra = dra_info->dr;
3282 data_reference_p drb = drb_info->dr;
3283 int cmp;
3285 /* Stabilize sort. */
3286 if (dra == drb)
3287 return 0;
3289 /* DRs with different group IDs never belong to the same group. */
3290 if (dra_info->group != drb_info->group)
3291 return dra_info->group < drb_info->group ? -1 : 1;
3293 /* Ordering of DRs according to base. */
3294 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3295 DR_BASE_ADDRESS (drb));
3296 if (cmp != 0)
3297 return cmp;
3299 /* And according to DR_OFFSET. */
3300 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3301 if (cmp != 0)
3302 return cmp;
3304 /* Put reads before writes. */
3305 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3306 return DR_IS_READ (dra) ? -1 : 1;
3308 /* Then sort after access size. */
3309 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3310 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3311 if (cmp != 0)
3312 return cmp;
3314 /* And after step. */
3315 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3316 if (cmp != 0)
3317 return cmp;
3319 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3320 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3321 if (cmp == 0)
3322 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3323 return cmp;
3326 /* If OP is the result of a conversion, return the unconverted value,
3327 otherwise return null. */
3329 static tree
3330 strip_conversion (tree op)
3332 if (TREE_CODE (op) != SSA_NAME)
3333 return NULL_TREE;
3334 gimple *stmt = SSA_NAME_DEF_STMT (op);
3335 if (!is_gimple_assign (stmt)
3336 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3337 return NULL_TREE;
3338 return gimple_assign_rhs1 (stmt);
3341 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3342 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3343 be grouped in SLP mode. */
3345 static bool
3346 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3347 bool allow_slp_p)
3349 if (gimple_assign_single_p (stmt1_info->stmt))
3350 return gimple_assign_single_p (stmt2_info->stmt);
3352 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3353 if (call1 && gimple_call_internal_p (call1))
3355 /* Check for two masked loads or two masked stores. */
3356 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3357 if (!call2 || !gimple_call_internal_p (call2))
3358 return false;
3359 internal_fn ifn = gimple_call_internal_fn (call1);
3360 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3361 return false;
3362 if (ifn != gimple_call_internal_fn (call2))
3363 return false;
3365 /* Check that the masks are the same. Cope with casts of masks,
3366 like those created by build_mask_conversion. */
3367 tree mask1 = gimple_call_arg (call1, 2);
3368 tree mask2 = gimple_call_arg (call2, 2);
3369 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3371 mask1 = strip_conversion (mask1);
3372 if (!mask1)
3373 return false;
3374 mask2 = strip_conversion (mask2);
3375 if (!mask2)
3376 return false;
3377 if (!operand_equal_p (mask1, mask2, 0))
3378 return false;
3380 return true;
3383 return false;
3386 /* Function vect_analyze_data_ref_accesses.
3388 Analyze the access pattern of all the data references in the loop.
3390 FORNOW: the only access pattern that is considered vectorizable is a
3391 simple step 1 (consecutive) access.
3393 FORNOW: handle only arrays and pointer accesses. */
3395 opt_result
3396 vect_analyze_data_ref_accesses (vec_info *vinfo,
3397 vec<int> *dataref_groups)
3399 unsigned int i;
3400 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3402 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3404 if (datarefs.is_empty ())
3405 return opt_result::success ();
3407 /* Sort the array of datarefs to make building the interleaving chains
3408 linear. Don't modify the original vector's order; it is needed for
3409 determining what dependencies are reversed. */
3410 vec<dr_vec_info *> datarefs_copy;
3411 datarefs_copy.create (datarefs.length ());
3412 for (unsigned i = 0; i < datarefs.length (); i++)
3414 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3415 /* If the caller computed DR grouping use that, otherwise group by
3416 basic blocks. */
3417 if (dataref_groups)
3418 dr_info->group = (*dataref_groups)[i];
3419 else
3420 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3421 datarefs_copy.quick_push (dr_info);
3423 datarefs_copy.qsort (dr_group_sort_cmp);
3424 hash_set<stmt_vec_info> to_fixup;
3426 /* Build the interleaving chains. */
3427 for (i = 0; i < datarefs_copy.length () - 1;)
3429 dr_vec_info *dr_info_a = datarefs_copy[i];
3430 data_reference_p dra = dr_info_a->dr;
3431 int dra_group_id = dr_info_a->group;
3432 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3433 stmt_vec_info lastinfo = NULL;
3434 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3435 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3437 ++i;
3438 continue;
3440 for (i = i + 1; i < datarefs_copy.length (); ++i)
3442 dr_vec_info *dr_info_b = datarefs_copy[i];
3443 data_reference_p drb = dr_info_b->dr;
3444 int drb_group_id = dr_info_b->group;
3445 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3446 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3447 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3448 break;
3450 /* ??? Imperfect sorting (non-compatible types, non-modulo
3451 accesses, same accesses) can lead to a group being artificially
3452 split here, as we don't just skip over those. If it really
3453 matters we can push those to a worklist and re-iterate
3454 over them. Then we can just skip ahead to the next DR here. */
3456 /* DRs in a different DR group should not be put into the same
3457 interleaving group. */
3458 if (dra_group_id != drb_group_id)
3459 break;
3461 /* Check that the data-refs have the same first location (except init)
3462 and that they are both either stores or loads (not one load and one
3463 store, and not mismatched masked loads or stores). */
3464 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3465 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3466 DR_BASE_ADDRESS (drb)) != 0
3467 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3468 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3469 break;
3471 /* Check that the data-refs have the same constant size. */
3472 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3473 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3474 if (!tree_fits_uhwi_p (sza)
3475 || !tree_fits_uhwi_p (szb)
3476 || !tree_int_cst_equal (sza, szb))
3477 break;
3479 /* Check that the data-refs have the same step. */
3480 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3481 break;
3483 /* Check the types are compatible.
3484 ??? We don't distinguish this during sorting. */
3485 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3486 TREE_TYPE (DR_REF (drb))))
3487 break;
3489 /* Check that the DR_INITs are compile-time constants. */
3490 if (!tree_fits_shwi_p (DR_INIT (dra))
3491 || !tree_fits_shwi_p (DR_INIT (drb)))
3492 break;
3494 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3495 just hold extra information. */
3496 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3497 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3498 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3499 break;
3501 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3502 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3503 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3504 HOST_WIDE_INT init_prev
3505 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3506 gcc_assert (init_a <= init_b
3507 && init_a <= init_prev
3508 && init_prev <= init_b);
3510 /* Do not place the same access in the interleaving chain twice. */
3511 if (init_b == init_prev)
3513 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3514 < gimple_uid (DR_STMT (drb)));
3515 /* Simply link in duplicates and fix up the chain below. */
3517 else
3519 /* If init_b == init_a + the size of the type * k, we have an
3520 interleaving, and DRA is accessed before DRB. */
3521 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3522 if (type_size_a == 0
3523 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3524 % type_size_a != 0))
3525 break;
3527 /* If we have a store, the accesses are adjacent. This splits
3528 groups into chunks we support (we don't support vectorization
3529 of stores with gaps). */
3530 if (!DR_IS_READ (dra)
3531 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3532 != type_size_a))
3533 break;
3535 /* If the step (when constant and nonzero) is smaller than the
3536 difference between the data-refs' inits, this splits groups into
3537 suitable sizes. */
3538 if (tree_fits_shwi_p (DR_STEP (dra)))
3540 unsigned HOST_WIDE_INT step
3541 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3542 if (step != 0
3543 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3544 break;
3548 if (dump_enabled_p ())
3549 dump_printf_loc (MSG_NOTE, vect_location,
3550 DR_IS_READ (dra)
3551 ? "Detected interleaving load %T and %T\n"
3552 : "Detected interleaving store %T and %T\n",
3553 DR_REF (dra), DR_REF (drb));
3555 /* Link the found element into the group list. */
3556 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3558 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3559 lastinfo = stmtinfo_a;
3561 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3562 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3563 lastinfo = stmtinfo_b;
3565 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3566 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3568 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3569 dump_printf_loc (MSG_NOTE, vect_location,
3570 "Load suitable for SLP vectorization only.\n");
3572 if (init_b == init_prev
3573 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3574 && dump_enabled_p ())
3575 dump_printf_loc (MSG_NOTE, vect_location,
3576 "Queuing group with duplicate access for fixup\n");
3580 /* Fix up groups with duplicate entries by splitting them. */
3581 while (1)
3583 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3584 if (!(it != to_fixup.end ()))
3585 break;
3586 stmt_vec_info grp = *it;
3587 to_fixup.remove (grp);
3589 /* Find the earliest duplicate group member. */
3590 unsigned first_duplicate = -1u;
3591 stmt_vec_info next, g = grp;
3592 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3594 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3595 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3596 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3597 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3598 g = next;
3600 if (first_duplicate == -1U)
3601 continue;
3603 /* Then move all stmts after the first duplicate to a new group.
3604 Note this is a heuristic but one with the property that *it
3605 is fixed up completely. */
3606 g = grp;
3607 stmt_vec_info newgroup = NULL, ng = grp;
3608 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3610 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3612 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3613 if (!newgroup)
3614 newgroup = next;
3615 else
3616 DR_GROUP_NEXT_ELEMENT (ng) = next;
3617 ng = next;
3618 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3620 else
3621 g = DR_GROUP_NEXT_ELEMENT (g);
3623 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3625 /* Fixup the new group which still may contain duplicates. */
3626 to_fixup.add (newgroup);
3629 dr_vec_info *dr_info;
3630 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3632 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3633 && !vect_analyze_data_ref_access (vinfo, dr_info))
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3637 "not vectorized: complicated access pattern.\n");
3639 if (is_a <bb_vec_info> (vinfo))
3641 /* Mark the statement as not vectorizable. */
3642 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3643 continue;
3645 else
3647 datarefs_copy.release ();
3648 return opt_result::failure_at (dr_info->stmt->stmt,
3649 "not vectorized:"
3650 " complicated access pattern.\n");
3655 datarefs_copy.release ();
3656 return opt_result::success ();
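/* A rough example of the group-splitting rules applied above
   (hypothetical loop): in

     for (int i = 0; i < n; i++)
       {
	 a[3*i]     = x;
	 a[3*i + 2] = y;
       }

   the two stores share base, offset and step, but the gap between their
   DR_INITs (8 bytes) is larger than the 4-byte element size, so the
   "stores with gaps" check breaks the chain and each store stays in its
   own group rather than forming one interleaved store group.  */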
3659 /* Function vect_vfa_segment_size.
3661 Input:
3662 DR_INFO: The data reference.
3663 LENGTH_FACTOR: segment length to consider.
3665 Return a value suitable for the dr_with_seg_len::seg_len field.
3666 This is the "distance travelled" by the pointer from the first
3667 iteration in the segment to the last. Note that it does not include
3668 the size of the access; in effect it only describes the first byte. */
3670 static tree
3671 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3673 length_factor = size_binop (MINUS_EXPR,
3674 fold_convert (sizetype, length_factor),
3675 size_one_node);
3676 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3677 length_factor);
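/* Worked example with made-up numbers: for DR_STEP equal to 4 bytes and
   LENGTH_FACTOR equal to a vectorization factor of 8, the segment length
   computed above is (8 - 1) * 4 == 28 bytes, the distance from the first
   byte accessed in the segment's first iteration to the first byte
   accessed in its last iteration; the size of the access itself is
   accounted for separately by vect_vfa_access_size.  */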
3680 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3681 gives the worst-case number of bytes covered by the segment. */
3683 static unsigned HOST_WIDE_INT
3684 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3686 stmt_vec_info stmt_vinfo = dr_info->stmt;
3687 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3688 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3689 unsigned HOST_WIDE_INT access_size = ref_size;
3690 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3692 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3693 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3695 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3696 int misalignment;
3697 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3698 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3699 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3700 == dr_explicit_realign_optimized))
3702 /* We might access a full vector's worth. */
3703 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3705 return access_size;
3708 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3709 describes. */
3711 static unsigned int
3712 vect_vfa_align (dr_vec_info *dr_info)
3714 return dr_alignment (dr_info->dr);
3717 /* Function vect_no_alias_p.
3719 Given data references A and B with equal base and offset, see whether
3720 the alias relation can be decided at compilation time. Return 1 if
3721 it can and the references alias, 0 if it can and the references do
3722 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3723 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3724 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3726 static int
3727 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3728 tree segment_length_a, tree segment_length_b,
3729 unsigned HOST_WIDE_INT access_size_a,
3730 unsigned HOST_WIDE_INT access_size_b)
3732 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3733 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3734 poly_uint64 const_length_a;
3735 poly_uint64 const_length_b;
3737 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3738 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3739 [a, a+12) */
3740 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3742 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3743 offset_a -= const_length_a;
3745 else
3746 const_length_a = tree_to_poly_uint64 (segment_length_a);
3747 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3749 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3750 offset_b -= const_length_b;
3752 else
3753 const_length_b = tree_to_poly_uint64 (segment_length_b);
3755 const_length_a += access_size_a;
3756 const_length_b += access_size_b;
3758 if (ranges_known_overlap_p (offset_a, const_length_a,
3759 offset_b, const_length_b))
3760 return 1;
3762 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3763 offset_b, const_length_b))
3764 return 0;
3766 return -1;
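/* Worked example with made-up values: for refs with equal base and
   offset, DR_INIT (a) == 0, DR_INIT (b) == 32, positive steps, segment
   lengths of 16 and access sizes of 8, ref A covers bytes [0, 24) and
   ref B covers [32, 56).  The ranges provably do not overlap, so the
   function returns 0 and the runtime alias check can be dropped.  */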
3769 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3770 in DDR is >= VF. */
3772 static bool
3773 dependence_distance_ge_vf (data_dependence_relation *ddr,
3774 unsigned int loop_depth, poly_uint64 vf)
3776 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3777 || DDR_NUM_DIST_VECTS (ddr) == 0)
3778 return false;
3780 /* If the dependence is exact, we should have limited the VF instead. */
3781 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3783 unsigned int i;
3784 lambda_vector dist_v;
3785 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3787 HOST_WIDE_INT dist = dist_v[loop_depth];
3788 if (dist != 0
3789 && !(dist > 0 && DDR_REVERSED_P (ddr))
3790 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3791 return false;
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "dependence distance between %T and %T is >= VF\n",
3797 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3799 return true;
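/* Numeric illustration (hypothetical DDR): if dependence analysis,
   working under the assumption that the two bases coincide, recorded a
   single nonzero distance of 8 at this loop depth, then with a
   vectorization factor of 4 the distance is >= VF, the function returns
   true and the caller can skip the runtime alias test for this pair.  */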
3802 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3804 static void
3805 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3807 dump_printf (dump_kind, "%s (%T) >= ",
3808 lower_bound.unsigned_p ? "unsigned" : "abs",
3809 lower_bound.expr);
3810 dump_dec (dump_kind, lower_bound.min_value);
3813 /* Record that the vectorized loop requires the vec_lower_bound described
3814 by EXPR, UNSIGNED_P and MIN_VALUE. */
3816 static void
3817 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3818 poly_uint64 min_value)
3820 vec<vec_lower_bound> &lower_bounds
3821 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3822 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3823 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3825 unsigned_p &= lower_bounds[i].unsigned_p;
3826 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3827 if (lower_bounds[i].unsigned_p != unsigned_p
3828 || maybe_lt (lower_bounds[i].min_value, min_value))
3830 lower_bounds[i].unsigned_p = unsigned_p;
3831 lower_bounds[i].min_value = min_value;
3832 if (dump_enabled_p ())
3834 dump_printf_loc (MSG_NOTE, vect_location,
3835 "updating run-time check to ");
3836 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3837 dump_printf (MSG_NOTE, "\n");
3840 return;
3843 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3844 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3847 dump_lower_bound (MSG_NOTE, lower_bound);
3848 dump_printf (MSG_NOTE, "\n");
3850 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3853 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3854 will span fewer than GAP bytes. */
3856 static bool
3857 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3858 poly_int64 gap)
3860 stmt_vec_info stmt_info = dr_info->stmt;
3861 HOST_WIDE_INT count
3862 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3863 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3864 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3865 return (estimated_poly_value (gap)
3866 <= count * vect_get_scalar_dr_size (dr_info));
3869 /* Return true if we know that there is no alias between DR_INFO_A and
3870 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3871 When returning true, set *LOWER_BOUND_OUT to this N. */
3873 static bool
3874 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3875 poly_uint64 *lower_bound_out)
3877 /* Check that there is a constant gap of known sign between DR_A
3878 and DR_B. */
3879 data_reference *dr_a = dr_info_a->dr;
3880 data_reference *dr_b = dr_info_b->dr;
3881 poly_int64 init_a, init_b;
3882 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3883 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3884 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3885 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3886 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3887 || !ordered_p (init_a, init_b))
3888 return false;
3890 /* Sort DR_A and DR_B by the address they access. */
3891 if (maybe_lt (init_b, init_a))
3893 std::swap (init_a, init_b);
3894 std::swap (dr_info_a, dr_info_b);
3895 std::swap (dr_a, dr_b);
3898 /* If the two accesses could be dependent within a scalar iteration,
3899 make sure that we'd retain their order. */
3900 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3901 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3902 return false;
3904 /* There is no alias if abs (DR_STEP) is greater than or equal to
3905 the bytes spanned by the combination of the two accesses. */
3906 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3907 return true;
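/* Worked example with hypothetical refs: if DR_A and DR_B share base,
   offset and step, DR_INIT (a) == 0, DR_INIT (b) == 4 and each scalar
   access is 4 bytes wide, *LOWER_BOUND_OUT becomes 4 + 4 - 0 == 8;
   whenever abs (DR_STEP) >= 8 the per-iteration accesses of the two refs
   fit inside one step without ever touching the same bytes, so no
   runtime alias check is required.  */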
3910 /* Function vect_prune_runtime_alias_test_list.
3912 Prune a list of ddrs to be tested at run-time by versioning for alias.
3913 Merge several alias checks into one if possible.
3914 Return FALSE if the resulting list of ddrs is longer than allowed by
3915 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3917 opt_result
3918 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3920 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3921 hash_set <tree_pair_hash> compared_objects;
3923 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3924 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3925 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3926 const vec<vec_object_pair> &check_unequal_addrs
3927 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3928 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3929 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3931 ddr_p ddr;
3932 unsigned int i;
3933 tree length_factor;
3935 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3937 /* Step values are irrelevant for aliasing if the number of vector
3938 iterations is equal to the number of scalar iterations (which can
3939 happen for fully-SLP loops). */
3940 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3942 if (!vf_one_p)
3944 /* Convert the checks for nonzero steps into bound tests. */
3945 tree value;
3946 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3947 vect_check_lower_bound (loop_vinfo, value, true, 1);
3950 if (may_alias_ddrs.is_empty ())
3951 return opt_result::success ();
3953 comp_alias_ddrs.create (may_alias_ddrs.length ());
3955 unsigned int loop_depth
3956 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3957 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3959 /* First, we collect all data ref pairs for aliasing checks. */
3960 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3962 poly_uint64 lower_bound;
3963 tree segment_length_a, segment_length_b;
3964 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3965 unsigned int align_a, align_b;
3967 /* Ignore the alias if the VF we chose ended up being no greater
3968 than the dependence distance. */
3969 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3970 continue;
3972 if (DDR_OBJECT_A (ddr))
3974 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3975 if (!compared_objects.add (new_pair))
3977 if (dump_enabled_p ())
3978 dump_printf_loc (MSG_NOTE, vect_location,
3979 "checking that %T and %T"
3980 " have different addresses\n",
3981 new_pair.first, new_pair.second);
3982 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3984 continue;
3987 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3988 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3990 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3991 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3993 bool preserves_scalar_order_p
3994 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3995 bool ignore_step_p
3996 = (vf_one_p
3997 && (preserves_scalar_order_p
3998 || operand_equal_p (DR_STEP (dr_info_a->dr),
3999 DR_STEP (dr_info_b->dr))));
4001 /* Skip the pair if inter-iteration dependencies are irrelevant
4002 and intra-iteration dependencies are guaranteed to be honored. */
4003 if (ignore_step_p
4004 && (preserves_scalar_order_p
4005 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4006 &lower_bound)))
4008 if (dump_enabled_p ())
4009 dump_printf_loc (MSG_NOTE, vect_location,
4010 "no need for alias check between "
4011 "%T and %T when VF is 1\n",
4012 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4013 continue;
4016 /* See whether we can handle the alias using a bounds check on
4017 the step, and whether that's likely to be the best approach.
4018 (It might not be, for example, if the minimum step is much larger
4019 than the number of bytes handled by one vector iteration.) */
4020 if (!ignore_step_p
4021 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
4022 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4023 &lower_bound)
4024 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
4025 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4027 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4028 if (dump_enabled_p ())
4030 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4031 "%T and %T when the step %T is outside ",
4032 DR_REF (dr_info_a->dr),
4033 DR_REF (dr_info_b->dr),
4034 DR_STEP (dr_info_a->dr));
4035 if (unsigned_p)
4036 dump_printf (MSG_NOTE, "[0");
4037 else
4039 dump_printf (MSG_NOTE, "(");
4040 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4042 dump_printf (MSG_NOTE, ", ");
4043 dump_dec (MSG_NOTE, lower_bound);
4044 dump_printf (MSG_NOTE, ")\n");
4046 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4047 unsigned_p, lower_bound);
4048 continue;
4051 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4052 if (dr_group_first_a)
4054 stmt_info_a = dr_group_first_a;
4055 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4058 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4059 if (dr_group_first_b)
4061 stmt_info_b = dr_group_first_b;
4062 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4065 if (ignore_step_p)
4067 segment_length_a = size_zero_node;
4068 segment_length_b = size_zero_node;
4070 else
4072 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4073 DR_STEP (dr_info_b->dr), 0))
4074 length_factor = scalar_loop_iters;
4075 else
4076 length_factor = size_int (vect_factor);
4077 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4078 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4080 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4081 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4082 align_a = vect_vfa_align (dr_info_a);
4083 align_b = vect_vfa_align (dr_info_b);
4085 /* See whether the alias is known at compilation time. */
4086 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4087 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4088 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4089 DR_OFFSET (dr_info_b->dr), 0)
4090 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4091 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4092 && poly_int_tree_p (segment_length_a)
4093 && poly_int_tree_p (segment_length_b))
4095 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4096 segment_length_a,
4097 segment_length_b,
4098 access_size_a,
4099 access_size_b);
4100 if (res >= 0 && dump_enabled_p ())
4102 dump_printf_loc (MSG_NOTE, vect_location,
4103 "can tell at compile time that %T and %T",
4104 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4105 if (res == 0)
4106 dump_printf (MSG_NOTE, " do not alias\n");
4107 else
4108 dump_printf (MSG_NOTE, " alias\n");
4111 if (res == 0)
4112 continue;
4114 if (res == 1)
4115 return opt_result::failure_at (stmt_info_b->stmt,
4116 "not vectorized:"
4117 " compilation time alias: %G%G",
4118 stmt_info_a->stmt,
4119 stmt_info_b->stmt);
4122 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4123 access_size_a, align_a);
4124 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4125 access_size_b, align_b);
4126 /* Canonicalize the order to be the one that's needed for accurate
4127 RAW, WAR and WAW flags, in cases where the data references are
4128 well-ordered. The order doesn't really matter otherwise,
4129 but we might as well be consistent. */
4130 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4131 std::swap (dr_a, dr_b);
4133 dr_with_seg_len_pair_t dr_with_seg_len_pair
4134 (dr_a, dr_b, (preserves_scalar_order_p
4135 ? dr_with_seg_len_pair_t::WELL_ORDERED
4136 : dr_with_seg_len_pair_t::REORDERED));
4138 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4141 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4143 unsigned int count = (comp_alias_ddrs.length ()
4144 + check_unequal_addrs.length ());
4146 if (count
4147 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4148 == VECT_COST_MODEL_VERY_CHEAP))
4149 return opt_result::failure_at
4150 (vect_location, "would need a runtime alias check\n");
4152 if (dump_enabled_p ())
4153 dump_printf_loc (MSG_NOTE, vect_location,
4154 "improved number of alias checks from %d to %d\n",
4155 may_alias_ddrs.length (), count);
4156 unsigned limit = param_vect_max_version_for_alias_checks;
4157 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4158 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4159 if (count > limit)
4160 return opt_result::failure_at
4161 (vect_location,
4162 "number of versioning for alias run-time tests exceeds %d "
4163 "(--param vect-max-version-for-alias-checks)\n", limit);
4165 return opt_result::success ();
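/* Sketch of what a surviving check amounts to at runtime (simplified;
   the actual condition is emitted later by the alias-check machinery):
   for data-refs DR_A and DR_B with start addresses ADDR_A/ADDR_B,
   segment lengths SEG_A/SEG_B and access sizes SIZE_A/SIZE_B the
   vectorized loop is guarded by roughly

     bool ok = (ADDR_A + SEG_A + SIZE_A <= ADDR_B
		|| ADDR_B + SEG_B + SIZE_B <= ADDR_A);

   so the vector code only runs when the two segments cannot overlap.  */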
4168 /* Check whether we can use an internal function for a gather load
4169 or scatter store. READ_P is true for loads and false for stores.
4170 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4171 the type of the memory elements being loaded or stored. OFFSET_TYPE
4172 is the type of the offset that is being applied to the invariant
4173 base address. SCALE is the amount by which the offset should
4174 be multiplied *after* it has been converted to address width.
4176 Return true if the function is supported, storing the function id in
4177 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
4179 bool
4180 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4181 tree vectype, tree memory_type, tree offset_type,
4182 int scale, internal_fn *ifn_out,
4183 tree *offset_vectype_out)
4185 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4186 unsigned int element_bits = vector_element_bits (vectype);
4187 if (element_bits != memory_bits)
4188 /* For now the vector elements must be the same width as the
4189 memory elements. */
4190 return false;
4192 /* Work out which function we need. */
4193 internal_fn ifn, alt_ifn, alt_ifn2;
4194 if (read_p)
4196 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4197 alt_ifn = IFN_MASK_GATHER_LOAD;
4198 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4199 use MASK_LEN_GATHER_LOAD regardless of whether len and
4200 mask are valid or not. */
4201 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4203 else
4205 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4206 alt_ifn = IFN_MASK_SCATTER_STORE;
4207 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4208 use MASK_LEN_SCATTER_STORE regardless of whether len and
4209 mask are valid or not. */
4210 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4213 for (;;)
4215 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4216 if (!offset_vectype)
4217 return false;
4219 /* Test whether the target supports this combination. */
4220 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4221 offset_vectype, scale))
4223 *ifn_out = ifn;
4224 *offset_vectype_out = offset_vectype;
4225 return true;
4227 else if (!masked_p
4228 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4229 memory_type,
4230 offset_vectype,
4231 scale))
4233 *ifn_out = alt_ifn;
4234 *offset_vectype_out = offset_vectype;
4235 return true;
4237 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4238 memory_type,
4239 offset_vectype, scale))
4241 *ifn_out = alt_ifn2;
4242 *offset_vectype_out = offset_vectype;
4243 return true;
4246 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4247 && TYPE_PRECISION (offset_type) >= element_bits)
4248 return false;
4250 offset_type = build_nonstandard_integer_type
4251 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
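/* Illustrative walk through the retry loop above, assuming a 64-bit
   target whose gather instructions only accept 64-bit offsets: starting
   from a 32-bit OFFSET_TYPE the first probe fails, the type is widened
   to a 64-bit integer type of the same signedness, the second probe
   succeeds, and *OFFSET_VECTYPE_OUT ends up describing a vector of
   64-bit offsets.  If even the pointer-sized offset type is not
   supported, the loop gives up and the function returns false.  */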
4255 /* STMT_INFO is a call to an internal gather load or scatter store function.
4256 Describe the operation in INFO. */
4258 static void
4259 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4260 gather_scatter_info *info)
4262 gcall *call = as_a <gcall *> (stmt_info->stmt);
4263 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4264 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4266 info->ifn = gimple_call_internal_fn (call);
4267 info->decl = NULL_TREE;
4268 info->base = gimple_call_arg (call, 0);
4269 info->offset = gimple_call_arg (call, 1);
4270 info->offset_dt = vect_unknown_def_type;
4271 info->offset_vectype = NULL_TREE;
4272 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4273 info->element_type = TREE_TYPE (vectype);
4274 info->memory_type = TREE_TYPE (DR_REF (dr));
4277 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4278 gather load or scatter store. Describe the operation in *INFO if so. */
4280 bool
4281 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4282 gather_scatter_info *info)
4284 HOST_WIDE_INT scale = 1;
4285 poly_int64 pbitpos, pbitsize;
4286 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4287 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4288 tree offtype = NULL_TREE;
4289 tree decl = NULL_TREE, base, off;
4290 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4291 tree memory_type = TREE_TYPE (DR_REF (dr));
4292 machine_mode pmode;
4293 int punsignedp, reversep, pvolatilep = 0;
4294 internal_fn ifn;
4295 tree offset_vectype;
4296 bool masked_p = false;
4298 /* See whether this is already a call to a gather/scatter internal function.
4299 If not, see whether it's a masked load or store. */
4300 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4301 if (call && gimple_call_internal_p (call))
4303 ifn = gimple_call_internal_fn (call);
4304 if (internal_gather_scatter_fn_p (ifn))
4306 vect_describe_gather_scatter_call (stmt_info, info);
4307 return true;
4309 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4312 /* True if we should aim to use internal functions rather than
4313 built-in functions. */
4314 bool use_ifn_p = (DR_IS_READ (dr)
4315 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4316 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4318 base = DR_REF (dr);
4319 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4320 see if we can use the def stmt of the address. */
4321 if (masked_p
4322 && TREE_CODE (base) == MEM_REF
4323 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4324 && integer_zerop (TREE_OPERAND (base, 1))
4325 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4327 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4328 if (is_gimple_assign (def_stmt)
4329 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4330 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4333 /* The gather and scatter builtins need address of the form
4334 loop_invariant + vector * {1, 2, 4, 8}
4335 or
4336 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4337 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4338 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4339 multiplications and additions in it. To get a vector, we need
4340 a single SSA_NAME that will be defined in the loop and will
4341 contain everything that is not loop invariant and that can be
4342 vectorized. The following code attempts to find such a preexisting
4343 SSA_NAME OFF and put the loop invariants into a tree BASE
4344 that can be gimplified before the loop. */
4345 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4346 &punsignedp, &reversep, &pvolatilep);
4347 if (reversep)
4348 return false;
4350 /* PR 107346. Packed structs can have fields at offsets that are not
4351 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4352 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4353 return false;
4355 /* We need to be able to form an address to the base which for example
4356 isn't possible for hard registers. */
4357 if (may_be_nonaddressable_p (base))
4358 return false;
4360 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4362 if (TREE_CODE (base) == MEM_REF)
4364 if (!integer_zerop (TREE_OPERAND (base, 1)))
4366 if (off == NULL_TREE)
4367 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4368 else
4369 off = size_binop (PLUS_EXPR, off,
4370 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4372 base = TREE_OPERAND (base, 0);
4374 else
4375 base = build_fold_addr_expr (base);
4377 if (off == NULL_TREE)
4378 off = size_zero_node;
4380 /* If base is not loop invariant, then if off is 0 we start with just
4381 the constant offset in the loop invariant BASE and continue with base
4382 as OFF, otherwise give up.
4383 We could handle that case by gimplifying the addition of base + off
4384 into some SSA_NAME and using that as off, but for now punt. */
4385 if (!expr_invariant_in_loop_p (loop, base))
4387 if (!integer_zerop (off))
4388 return false;
4389 off = base;
4390 base = size_int (pbytepos);
4392 /* Otherwise put base + constant offset into the loop invariant BASE
4393 and continue with OFF. */
4394 else
4396 base = fold_convert (sizetype, base);
4397 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4400 /* OFF at this point may be either a SSA_NAME or some tree expression
4401 from get_inner_reference. Try to peel off loop invariants from it
4402 into BASE as long as possible. */
4403 STRIP_NOPS (off);
4404 while (offtype == NULL_TREE)
4406 enum tree_code code;
4407 tree op0, op1, add = NULL_TREE;
4409 if (TREE_CODE (off) == SSA_NAME)
4411 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4413 if (expr_invariant_in_loop_p (loop, off))
4414 return false;
4416 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4417 break;
4419 op0 = gimple_assign_rhs1 (def_stmt);
4420 code = gimple_assign_rhs_code (def_stmt);
4421 op1 = gimple_assign_rhs2 (def_stmt);
4423 else
4425 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4426 return false;
4427 code = TREE_CODE (off);
4428 extract_ops_from_tree (off, &code, &op0, &op1);
4430 switch (code)
4432 case POINTER_PLUS_EXPR:
4433 case PLUS_EXPR:
4434 if (expr_invariant_in_loop_p (loop, op0))
4436 add = op0;
4437 off = op1;
4438 do_add:
4439 add = fold_convert (sizetype, add);
4440 if (scale != 1)
4441 add = size_binop (MULT_EXPR, add, size_int (scale));
4442 base = size_binop (PLUS_EXPR, base, add);
4443 continue;
4445 if (expr_invariant_in_loop_p (loop, op1))
4447 add = op1;
4448 off = op0;
4449 goto do_add;
4451 break;
4452 case MINUS_EXPR:
4453 if (expr_invariant_in_loop_p (loop, op1))
4455 add = fold_convert (sizetype, op1);
4456 add = size_binop (MINUS_EXPR, size_zero_node, add);
4457 off = op0;
4458 goto do_add;
4460 break;
4461 case MULT_EXPR:
4462 if (scale == 1 && tree_fits_shwi_p (op1))
4464 int new_scale = tree_to_shwi (op1);
4465 /* Only treat this as a scaling operation if the target
4466 supports it for at least some offset type. */
4467 if (use_ifn_p
4468 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4469 masked_p, vectype, memory_type,
4470 signed_char_type_node,
4471 new_scale, &ifn,
4472 &offset_vectype)
4473 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4474 masked_p, vectype, memory_type,
4475 unsigned_char_type_node,
4476 new_scale, &ifn,
4477 &offset_vectype))
4478 break;
4479 scale = new_scale;
4480 off = op0;
4481 continue;
4483 break;
4484 case SSA_NAME:
4485 off = op0;
4486 continue;
4487 CASE_CONVERT:
4488 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4489 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4490 break;
4492 /* Don't include the conversion if the target is happy with
4493 the current offset type. */
4494 if (use_ifn_p
4495 && TREE_CODE (off) == SSA_NAME
4496 && !POINTER_TYPE_P (TREE_TYPE (off))
4497 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4498 masked_p, vectype, memory_type,
4499 TREE_TYPE (off), scale, &ifn,
4500 &offset_vectype))
4501 break;
4503 if (TYPE_PRECISION (TREE_TYPE (op0))
4504 == TYPE_PRECISION (TREE_TYPE (off)))
4506 off = op0;
4507 continue;
4510 /* Include the conversion if it is widening and we're using
4511 the IFN path, or the target can handle the converted-from
4512 offset, or the current size is not already the same as the
4513 data vector element size. */
4514 if ((TYPE_PRECISION (TREE_TYPE (op0))
4515 < TYPE_PRECISION (TREE_TYPE (off)))
4516 && (use_ifn_p
4517 || (DR_IS_READ (dr)
4518 ? (targetm.vectorize.builtin_gather
4519 && targetm.vectorize.builtin_gather (vectype,
4520 TREE_TYPE (op0),
4521 scale))
4522 : (targetm.vectorize.builtin_scatter
4523 && targetm.vectorize.builtin_scatter (vectype,
4524 TREE_TYPE (op0),
4525 scale)))
4526 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4527 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4529 off = op0;
4530 offtype = TREE_TYPE (off);
4531 STRIP_NOPS (off);
4532 continue;
4534 break;
4535 default:
4536 break;
4538 break;
4541 /* If at the end OFF still isn't a SSA_NAME or isn't
4542 defined in the loop, punt. */
4543 if (TREE_CODE (off) != SSA_NAME
4544 || expr_invariant_in_loop_p (loop, off))
4545 return false;
4547 if (offtype == NULL_TREE)
4548 offtype = TREE_TYPE (off);
4550 if (use_ifn_p)
4552 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4553 vectype, memory_type, offtype, scale,
4554 &ifn, &offset_vectype))
4555 ifn = IFN_LAST;
4556 decl = NULL_TREE;
4558 else
4560 if (DR_IS_READ (dr))
4562 if (targetm.vectorize.builtin_gather)
4563 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4565 else
4567 if (targetm.vectorize.builtin_scatter)
4568 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4570 ifn = IFN_LAST;
4571 /* The offset vector type will be read from DECL when needed. */
4572 offset_vectype = NULL_TREE;
4575 info->ifn = ifn;
4576 info->decl = decl;
4577 info->base = base;
4578 info->offset = off;
4579 info->offset_dt = vect_unknown_def_type;
4580 info->offset_vectype = offset_vectype;
4581 info->scale = scale;
4582 info->element_type = TREE_TYPE (vectype);
4583 info->memory_type = memory_type;
4584 return true;
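/* Rough example of the decomposition performed above, for a hypothetical
   gather-style load in a vectorizable loop:

     for (int i = 0; i < n; i++)
       sum += data[idx[i]];

   the non-affine reference data[idx[i]] is split into a loop-invariant
   BASE (&data), an in-loop SSA_NAME OFF (the value loaded from idx[i])
   and SCALE == sizeof (data[0]), matching the
   loop_invariant + vector * {1, 2, 4, 8} form required by the gather
   and scatter builtins and internal functions.  */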
4587 /* Find the data references in STMT, analyze them with respect to LOOP and
4588 append them to DATAREFS. Return false if datarefs in this stmt cannot
4589 be handled. */
4591 opt_result
4592 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4593 vec<data_reference_p> *datarefs,
4594 vec<int> *dataref_groups, int group_id)
4596 /* We can ignore clobbers for dataref analysis - they are removed during
4597 loop vectorization and BB vectorization checks dependences with a
4598 stmt walk. */
4599 if (gimple_clobber_p (stmt))
4600 return opt_result::success ();
4602 if (gimple_has_volatile_ops (stmt))
4603 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4604 stmt);
4606 if (stmt_can_throw_internal (cfun, stmt))
4607 return opt_result::failure_at (stmt,
4608 "not vectorized:"
4609 " statement can throw an exception: %G",
4610 stmt);
4612 auto_vec<data_reference_p, 2> refs;
4613 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4614 if (!res)
4615 return res;
4617 if (refs.is_empty ())
4618 return opt_result::success ();
4620 if (refs.length () > 1)
4622 while (!refs.is_empty ())
4623 free_data_ref (refs.pop ());
4624 return opt_result::failure_at (stmt,
4625 "not vectorized: more than one "
4626 "data ref in stmt: %G", stmt);
4629 data_reference_p dr = refs.pop ();
4630 if (gcall *call = dyn_cast <gcall *> (stmt))
4631 if (!gimple_call_internal_p (call)
4632 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4633 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4635 free_data_ref (dr);
4636 return opt_result::failure_at (stmt,
4637 "not vectorized: dr in a call %G", stmt);
4640 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4641 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4643 free_data_ref (dr);
4644 return opt_result::failure_at (stmt,
4645 "not vectorized:"
4646 " statement is an unsupported"
4647 " bitfield access %G", stmt);
4650 if (DR_BASE_ADDRESS (dr)
4651 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4653 free_data_ref (dr);
4654 return opt_result::failure_at (stmt,
4655 "not vectorized:"
4656 " base addr of dr is a constant\n");
4659 /* Check whether this may be a SIMD lane access and adjust the
4660 DR to make it easier for us to handle it. */
4661 if (loop
4662 && loop->simduid
4663 && (!DR_BASE_ADDRESS (dr)
4664 || !DR_OFFSET (dr)
4665 || !DR_INIT (dr)
4666 || !DR_STEP (dr)))
4668 struct data_reference *newdr
4669 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4670 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4671 if (DR_BASE_ADDRESS (newdr)
4672 && DR_OFFSET (newdr)
4673 && DR_INIT (newdr)
4674 && DR_STEP (newdr)
4675 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4676 && integer_zerop (DR_STEP (newdr)))
4678 tree base_address = DR_BASE_ADDRESS (newdr);
4679 tree off = DR_OFFSET (newdr);
4680 tree step = ssize_int (1);
4681 if (integer_zerop (off)
4682 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4684 off = TREE_OPERAND (base_address, 1);
4685 base_address = TREE_OPERAND (base_address, 0);
4687 STRIP_NOPS (off);
4688 if (TREE_CODE (off) == MULT_EXPR
4689 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4691 step = TREE_OPERAND (off, 1);
4692 off = TREE_OPERAND (off, 0);
4693 STRIP_NOPS (off);
4695 if (CONVERT_EXPR_P (off)
4696 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4697 < TYPE_PRECISION (TREE_TYPE (off))))
4698 off = TREE_OPERAND (off, 0);
4699 if (TREE_CODE (off) == SSA_NAME)
4701 gimple *def = SSA_NAME_DEF_STMT (off);
4702 /* Look through widening conversion. */
4703 if (is_gimple_assign (def)
4704 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4706 tree rhs1 = gimple_assign_rhs1 (def);
4707 if (TREE_CODE (rhs1) == SSA_NAME
4708 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4709 && (TYPE_PRECISION (TREE_TYPE (off))
4710 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4711 def = SSA_NAME_DEF_STMT (rhs1);
4713 if (is_gimple_call (def)
4714 && gimple_call_internal_p (def)
4715 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4717 tree arg = gimple_call_arg (def, 0);
4718 tree reft = TREE_TYPE (DR_REF (newdr));
4719 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4720 arg = SSA_NAME_VAR (arg);
4721 if (arg == loop->simduid
4722 /* For now. */
4723 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4725 DR_BASE_ADDRESS (newdr) = base_address;
4726 DR_OFFSET (newdr) = ssize_int (0);
4727 DR_STEP (newdr) = step;
4728 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4729 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4730 /* Mark as simd-lane access. */
4731 tree arg2 = gimple_call_arg (def, 1);
4732 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4733 free_data_ref (dr);
4734 datarefs->safe_push (newdr);
4735 if (dataref_groups)
4736 dataref_groups->safe_push (group_id);
4737 return opt_result::success ();
4742 free_data_ref (newdr);
4745 datarefs->safe_push (dr);
4746 if (dataref_groups)
4747 dataref_groups->safe_push (group_id);
4748 return opt_result::success ();
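/* Roughly, the SIMD lane handling above targets references such as the
   privatized variables that OpenMP SIMD lowering places into per-lane
   arrays indexed by the result of .GOMP_SIMD_LANE (simduid), e.g. a ref
   of the form D.simdarray[_lane] (hypothetical names).  When the pattern
   is recognized, the data-ref is rewritten to use the array base, a zero
   offset and a step equal to the element size, and is marked as a SIMD
   lane access so later analysis treats it like an ordinary access.  */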
4751 /* Function vect_analyze_data_refs.
4753 Find all the data references in the loop or basic block.
4755 The general structure of the analysis of data refs in the vectorizer is as
4756 follows:
4757 1- vect_analyze_data_refs(loop/bb): call
4758 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4759 in the loop/bb and their dependences.
4760 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4761 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4762 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4766 opt_result
4767 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4769 class loop *loop = NULL;
4770 unsigned int i;
4771 struct data_reference *dr;
4772 tree scalar_type;
4774 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4776 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4777 loop = LOOP_VINFO_LOOP (loop_vinfo);
4779 /* Go through the data-refs, check that the analysis succeeded. Update
4780 pointer from stmt_vec_info struct to DR and vectype. */
4782 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4783 FOR_EACH_VEC_ELT (datarefs, i, dr)
4785 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4786 poly_uint64 vf;
4788 gcc_assert (DR_REF (dr));
4789 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4790 gcc_assert (!stmt_info->dr_aux.dr);
4791 stmt_info->dr_aux.dr = dr;
4792 stmt_info->dr_aux.stmt = stmt_info;
4794 /* Check that analysis of the data-ref succeeded. */
4795 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4796 || !DR_STEP (dr))
4798 bool maybe_gather
4799 = DR_IS_READ (dr)
4800 && !TREE_THIS_VOLATILE (DR_REF (dr));
4801 bool maybe_scatter
4802 = DR_IS_WRITE (dr)
4803 && !TREE_THIS_VOLATILE (DR_REF (dr));
4805 /* If the target supports vector gather loads or scatter stores,
4806 see if they can be used. */
4807 if (is_a <loop_vec_info> (vinfo)
4808 && !nested_in_vect_loop_p (loop, stmt_info))
4810 if (maybe_gather || maybe_scatter)
4812 if (maybe_gather)
4813 gatherscatter = GATHER;
4814 else
4815 gatherscatter = SCATTER;
4819 if (gatherscatter == SG_NONE)
4821 if (dump_enabled_p ())
4822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4823 "not vectorized: data ref analysis "
4824 "failed %G", stmt_info->stmt);
4825 if (is_a <bb_vec_info> (vinfo))
4827 /* In BB vectorization the ref can still participate
4828 in dependence analysis, we just can't vectorize it. */
4829 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4830 continue;
4832 return opt_result::failure_at (stmt_info->stmt,
4833 "not vectorized:"
4834 " data ref analysis failed: %G",
4835 stmt_info->stmt);
4839 /* See if this was detected as SIMD lane access. */
4840 if (dr->aux == (void *)-1
4841 || dr->aux == (void *)-2
4842 || dr->aux == (void *)-3
4843 || dr->aux == (void *)-4)
4845 if (nested_in_vect_loop_p (loop, stmt_info))
4846 return opt_result::failure_at (stmt_info->stmt,
4847 "not vectorized:"
4848 " data ref analysis failed: %G",
4849 stmt_info->stmt);
4850 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4851 = -(uintptr_t) dr->aux;
4854 tree base = get_base_address (DR_REF (dr));
4855 if (base && VAR_P (base) && DECL_NONALIASED (base))
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4859 "not vectorized: base object not addressable "
4860 "for stmt: %G", stmt_info->stmt);
4861 if (is_a <bb_vec_info> (vinfo))
4863 /* In BB vectorization the ref can still participate
4864 in dependence analysis, we just can't vectorize it. */
4865 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4866 continue;
4868 return opt_result::failure_at (stmt_info->stmt,
4869 "not vectorized: base object not"
4870 " addressable for stmt: %G",
4871 stmt_info->stmt);
4874 if (is_a <loop_vec_info> (vinfo)
4875 && DR_STEP (dr)
4876 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4878 if (nested_in_vect_loop_p (loop, stmt_info))
4879 return opt_result::failure_at (stmt_info->stmt,
4880 "not vectorized: "
4881 "not suitable for strided load %G",
4882 stmt_info->stmt);
4883 STMT_VINFO_STRIDED_P (stmt_info) = true;
4886 /* Update DR field in stmt_vec_info struct. */
4888 /* If the dataref is in an inner-loop of the loop that is considered
4889 for vectorization, we also want to analyze the access relative to
4890 the outer-loop (DR contains information only relative to the
4891 inner-most enclosing loop). We do that by building a reference to the
4892 first location accessed by the inner-loop, and analyze it relative to
4893 the outer-loop. */
4894 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4896 /* Build a reference to the first location accessed by the
4897 inner loop: *(BASE + INIT + OFFSET). By construction,
4898 this address must be invariant in the inner loop, so we
4899 can consider it as being used in the outer loop. */
4900 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4901 tree offset = unshare_expr (DR_OFFSET (dr));
4902 tree init = unshare_expr (DR_INIT (dr));
4903 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4904 init, offset);
4905 tree init_addr = fold_build_pointer_plus (base, init_offset);
4906 tree init_ref = build_fold_indirect_ref (init_addr);
4908 if (dump_enabled_p ())
4909 dump_printf_loc (MSG_NOTE, vect_location,
4910 "analyze in outer loop: %T\n", init_ref);
4912 opt_result res
4913 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4914 init_ref, loop, stmt_info->stmt);
4915 if (!res)
4916 /* dr_analyze_innermost already explained the failure. */
4917 return res;
4919 if (dump_enabled_p ())
4920 dump_printf_loc (MSG_NOTE, vect_location,
4921 "\touter base_address: %T\n"
4922 "\touter offset from base address: %T\n"
4923 "\touter constant offset from base address: %T\n"
4924 "\touter step: %T\n"
4925 "\touter base alignment: %d\n\n"
4926 "\touter base misalignment: %d\n"
4927 "\touter offset alignment: %d\n"
4928 "\touter step alignment: %d\n",
4929 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4930 STMT_VINFO_DR_OFFSET (stmt_info),
4931 STMT_VINFO_DR_INIT (stmt_info),
4932 STMT_VINFO_DR_STEP (stmt_info),
4933 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4934 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4935 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4936 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4939 /* Set vectype for STMT. */
4940 scalar_type = TREE_TYPE (DR_REF (dr));
4941 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4942 if (!vectype)
4944 if (dump_enabled_p ())
4946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4947 "not vectorized: no vectype for stmt: %G",
4948 stmt_info->stmt);
4949 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4951 scalar_type);
4952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4955 if (is_a <bb_vec_info> (vinfo))
4957 /* No vector type is fine, the ref can still participate
4958 in dependence analysis, we just can't vectorize it. */
4959 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4960 continue;
4962 if (fatal)
4963 *fatal = false;
4964 return opt_result::failure_at (stmt_info->stmt,
4965 "not vectorized:"
4966 " no vectype for stmt: %G"
4967 " scalar_type: %T\n",
4968 stmt_info->stmt, scalar_type);
4970 else
4972 if (dump_enabled_p ())
4973 dump_printf_loc (MSG_NOTE, vect_location,
4974 "got vectype for stmt: %G%T\n",
4975 stmt_info->stmt, vectype);
4978 /* Adjust the minimal vectorization factor according to the
4979 vector type. */
4980 vf = TYPE_VECTOR_SUBPARTS (vectype);
4981 *min_vf = upper_bound (*min_vf, vf);
4983 /* Leave the BB vectorizer to pick the vector type later, based on
4984 the final dataref group size and SLP node size. */
4985 if (is_a <loop_vec_info> (vinfo))
4986 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4988 if (gatherscatter != SG_NONE)
4990 gather_scatter_info gs_info;
4991 if (!vect_check_gather_scatter (stmt_info,
4992 as_a <loop_vec_info> (vinfo),
4993 &gs_info)
4994 || !get_vectype_for_scalar_type (vinfo,
4995 TREE_TYPE (gs_info.offset)))
4997 if (fatal)
4998 *fatal = false;
4999 return opt_result::failure_at
5000 (stmt_info->stmt,
5001 (gatherscatter == GATHER)
5002 ? "not vectorized: not suitable for gather load %G"
5003 : "not vectorized: not suitable for scatter store %G",
5004 stmt_info->stmt);
5006 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5010 /* We used to stop processing and prune the list here. Verify we no
5011 longer need to. */
5012 gcc_assert (i == datarefs.length ());
5014 return opt_result::success ();
5018 /* Function vect_get_new_vect_var.
5020 Returns a name for a new variable. The current naming scheme uses a
5021 prefix such as "vect_" or "vectp_" (depending on the value of VAR_KIND)
5022 for vectorizer-generated variables, and appends NAME to that prefix if
5023 provided. */
5025 tree
5026 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5028 const char *prefix;
5029 tree new_vect_var;
5031 switch (var_kind)
5033 case vect_simple_var:
5034 prefix = "vect";
5035 break;
5036 case vect_scalar_var:
5037 prefix = "stmp";
5038 break;
5039 case vect_mask_var:
5040 prefix = "mask";
5041 break;
5042 case vect_pointer_var:
5043 prefix = "vectp";
5044 break;
5045 default:
5046 gcc_unreachable ();
5049 if (name)
5051 char* tmp = concat (prefix, "_", name, NULL);
5052 new_vect_var = create_tmp_reg (type, tmp);
5053 free (tmp);
5055 else
5056 new_vect_var = create_tmp_reg (type, prefix);
5058 return new_vect_var;
5061 /* Like vect_get_new_vect_var but return an SSA name. */
5063 tree
5064 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5066 const char *prefix;
5067 tree new_vect_var;
5069 switch (var_kind)
5071 case vect_simple_var:
5072 prefix = "vect";
5073 break;
5074 case vect_scalar_var:
5075 prefix = "stmp";
5076 break;
5077 case vect_pointer_var:
5078 prefix = "vectp";
5079 break;
5080 default:
5081 gcc_unreachable ();
5084 if (name)
5086 char* tmp = concat (prefix, "_", name, NULL);
5087 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5088 free (tmp);
5090 else
5091 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5093 return new_vect_var;
5096 /* Duplicate points-to info on NAME from DR_INFO. */
5098 static void
5099 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5101 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5102 /* DR_PTR_INFO is for a base SSA name, not including constant or
5103 variable offsets in the ref so its alignment info does not apply. */
5104 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5107 /* Function vect_create_addr_base_for_vector_ref.
5109 Create an expression that computes the address of the first memory location
5110 that will be accessed for a data reference.
5112 Input:
5113 STMT_INFO: The statement containing the data reference.
5114 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5115 OFFSET: Optional. If supplied, it is added to the initial address.
5116 LOOP: Specify relative to which loop-nest should the address be computed.
5117 For example, when the dataref is in an inner-loop nested in an
5118 outer-loop that is now being vectorized, LOOP can be either the
5119 outer-loop, or the inner-loop. The first memory location accessed
5120 by the following dataref ('in' points to short):
5122 for (i=0; i<N; i++)
5123 for (j=0; j<M; j++)
5124 s += in[i+j]
5126 is as follows:
5127 if LOOP=i_loop: &in (relative to i_loop)
5128 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5130 Output:
5131 1. Return an SSA_NAME whose value is the address of the memory location of
5132 the first vector of the data reference.
5133 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5134 these statement(s) which define the returned SSA_NAME.
5136 FORNOW: We are only handling array accesses with step 1. */
5138 tree
5139 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5140 gimple_seq *new_stmt_list,
5141 tree offset)
5143 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5144 struct data_reference *dr = dr_info->dr;
5145 const char *base_name;
5146 tree addr_base;
5147 tree dest;
5148 gimple_seq seq = NULL;
5149 tree vect_ptr_type;
5150 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5151 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5153 tree data_ref_base = unshare_expr (drb->base_address);
5154 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5155 tree init = unshare_expr (drb->init);
5157 if (loop_vinfo)
5158 base_name = get_name (data_ref_base);
5159 else
5161 base_offset = ssize_int (0);
5162 init = ssize_int (0);
5163 base_name = get_name (DR_REF (dr));
5166 /* Create base_offset */
5167 base_offset = size_binop (PLUS_EXPR,
5168 fold_convert (sizetype, base_offset),
5169 fold_convert (sizetype, init));
5171 if (offset)
5173 offset = fold_convert (sizetype, offset);
5174 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5175 base_offset, offset);
5178 /* base + base_offset */
5179 if (loop_vinfo)
5180 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5181 else
5182 addr_base = build1 (ADDR_EXPR,
5183 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5184 /* Strip zero offset components since we don't need
5185 them and they can confuse late diagnostics if
5186 we CSE them wrongly. See PR106904 for example. */
5187 unshare_expr (strip_zero_offset_components
5188 (DR_REF (dr))));
5190 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5191 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5192 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5193 gimple_seq_add_seq (new_stmt_list, seq);
5195 if (DR_PTR_INFO (dr)
5196 && TREE_CODE (addr_base) == SSA_NAME
5197 /* We should only duplicate pointer info to newly created SSA names. */
5198 && SSA_NAME_VAR (addr_base) == dest)
5200 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5201 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5204 if (dump_enabled_p ())
5205 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5207 return addr_base;
5211 /* Function vect_create_data_ref_ptr.
5213 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5214 location accessed in the loop by STMT_INFO, along with the def-use update
5215 chain to appropriately advance the pointer through the loop iterations.
5216 Also set aliasing information for the pointer. This pointer is used by
5217 the callers to this function to create a memory reference expression for
5218 vector load/store access.
5220 Input:
5221 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5222 GIMPLE_ASSIGN <name, data-ref> or
5223 GIMPLE_ASSIGN <data-ref, name>.
5224 2. AGGR_TYPE: the type of the reference, which should be either a vector
5225 or an array.
5226 3. AT_LOOP: the loop where the vector memref is to be created.
5227 4. OFFSET (optional): a byte offset to be added to the initial address
5228 accessed by the data-ref in STMT_INFO.
5229 5. BSI: location where the new stmts are to be placed if there is no loop
5230 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5231 pointing to the initial address.
5232 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5233 to the IV during each iteration of the loop. NULL says to move
5234 by one copy of AGGR_TYPE up or down, depending on the step of the
5235 data reference.
5237 Output:
5238 1. Declare a new ptr to vector_type, and have it point to the base of the
5239 data reference (the initial address accessed by the data reference).
5240 For example, for vector of type V8HI, the following code is generated:
5242 v8hi *ap;
5243 ap = (v8hi *)initial_address;
5245 if OFFSET is not supplied:
5246 initial_address = &a[init];
5247 if OFFSET is supplied:
5248 initial_address = &a[init] + OFFSET;
5252 Return the initial_address in INITIAL_ADDRESS.
5254 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5255 update the pointer in each iteration of the loop.
5257 Return the increment stmt that updates the pointer in PTR_INCR.
5259 3. Return the pointer. */
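/* A sketch of a typical call (hypothetical caller code for illustration;
   the real callers are the load/store vectorization routines in
   tree-vect-stmts.cc):

     tree dataref_ptr
       = vect_create_data_ref_ptr (vinfo, stmt_info, vectype, at_loop,
                                   NULL_TREE, &init_addr, gsi, &ptr_incr,
                                   false, NULL_TREE);

   The caller then wraps DATAREF_PTR in a MEM_REF for the vector access and
   advances it between vector copies with bump_vector_ptr.  */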
5261 tree
5262 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5263 tree aggr_type, class loop *at_loop, tree offset,
5264 tree *initial_address, gimple_stmt_iterator *gsi,
5265 gimple **ptr_incr, bool only_init,
5266 tree iv_step)
5268 const char *base_name;
5269 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5270 class loop *loop = NULL;
5271 bool nested_in_vect_loop = false;
5272 class loop *containing_loop = NULL;
5273 tree aggr_ptr_type;
5274 tree aggr_ptr;
5275 tree new_temp;
5276 gimple_seq new_stmt_list = NULL;
5277 edge pe = NULL;
5278 basic_block new_bb;
5279 tree aggr_ptr_init;
5280 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5281 struct data_reference *dr = dr_info->dr;
5282 tree aptr;
5283 gimple_stmt_iterator incr_gsi;
5284 bool insert_after;
5285 tree indx_before_incr, indx_after_incr;
5286 gimple *incr;
5287 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5289 gcc_assert (iv_step != NULL_TREE
5290 || TREE_CODE (aggr_type) == ARRAY_TYPE
5291 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5293 if (loop_vinfo)
5295 loop = LOOP_VINFO_LOOP (loop_vinfo);
5296 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5297 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5298 pe = loop_preheader_edge (loop);
5300 else
5302 gcc_assert (bb_vinfo);
5303 only_init = true;
5304 *ptr_incr = NULL;
5307 /* Create an expression for the first address accessed by this load
5308 in LOOP. */
5309 base_name = get_name (DR_BASE_ADDRESS (dr));
5311 if (dump_enabled_p ())
5313 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5314 dump_printf_loc (MSG_NOTE, vect_location,
5315 "create %s-pointer variable to type: %T",
5316 get_tree_code_name (TREE_CODE (aggr_type)),
5317 aggr_type);
5318 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5319 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5320 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5321 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5322 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5323 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5324 else
5325 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5326 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5329 /* (1) Create the new aggregate-pointer variable.
5330 Vector and array types inherit the alias set of their component
5331 type by default so we need to use a ref-all pointer if the data
5332 reference does not conflict with the created aggregated data
5333 reference because it is not addressable. */
5334 bool need_ref_all = false;
5335 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5336 get_alias_set (DR_REF (dr))))
5337 need_ref_all = true;
5338 /* Likewise for any of the data references in the stmt group. */
5339 else if (DR_GROUP_SIZE (stmt_info) > 1)
5341 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5344 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5345 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5346 get_alias_set (DR_REF (sdr))))
5348 need_ref_all = true;
5349 break;
5351 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5353 while (sinfo);
5355 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5356 need_ref_all);
5357 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5360 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5361 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5362 def-use update cycles for the pointer: one relative to the outer-loop
5363 (LOOP), which is what steps (2) and (3) below do. The other is relative
5364 to the inner-loop (which is the inner-most loop containing the dataref),
5365 and this is done by step (4) below.
5367 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5368 inner-most loop, and so steps (2),(3) work the same, and step (4) is
5369 redundant. Steps (2),(3) create the following:
5371 vp0 = &base_addr;
5372 LOOP: vp1 = phi(vp0,vp2)
5375 vp2 = vp1 + step
5376 goto LOOP
5378 If there is an inner-loop nested in loop, then step (4) will also be
5379 applied, and an additional update in the inner-loop will be created:
5381 vp0 = &base_addr;
5382 LOOP: vp1 = phi(vp0,vp2)
5384 inner: vp3 = phi(vp1,vp4)
5385 vp4 = vp3 + inner_step
5386 if () goto inner
5388 vp2 = vp1 + step
5389 if () goto LOOP */
5391 /* (2) Calculate the initial address of the aggregate-pointer, and set
5392 the aggregate-pointer to point to it before the loop. */
5394 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5396 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5397 stmt_info, &new_stmt_list,
5398 offset);
5399 if (new_stmt_list)
5401 if (pe)
5403 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5404 gcc_assert (!new_bb);
5406 else
5407 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5410 *initial_address = new_temp;
5411 aggr_ptr_init = new_temp;
5413 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5414 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5415 inner-loop nested in LOOP (during outer-loop vectorization). */
5417 /* No update in loop is required. */
5418 if (only_init && (!loop_vinfo || at_loop == loop))
5419 aptr = aggr_ptr_init;
5420 else
5422 /* Accesses to invariant addresses should be handled specially
5423 by the caller. */
5424 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5425 gcc_assert (!integer_zerop (step));
5427 if (iv_step == NULL_TREE)
5429 /* The step of the aggregate pointer is the type size,
5430 negated for downward accesses. */
5431 iv_step = TYPE_SIZE_UNIT (aggr_type);
5432 if (tree_int_cst_sgn (step) == -1)
5433 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5436 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5438 create_iv (aggr_ptr_init, PLUS_EXPR,
5439 fold_convert (aggr_ptr_type, iv_step),
5440 aggr_ptr, loop, &incr_gsi, insert_after,
5441 &indx_before_incr, &indx_after_incr);
5442 incr = gsi_stmt (incr_gsi);
5444 /* Copy the points-to information if it exists. */
5445 if (DR_PTR_INFO (dr))
5447 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5448 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5450 if (ptr_incr)
5451 *ptr_incr = incr;
5453 aptr = indx_before_incr;
5456 if (!nested_in_vect_loop || only_init)
5457 return aptr;
5460 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5461 nested in LOOP, if it exists. */
5463 gcc_assert (nested_in_vect_loop);
5464 if (!only_init)
5466 standard_iv_increment_position (containing_loop, &incr_gsi,
5467 &insert_after);
5468 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5469 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5470 &indx_before_incr, &indx_after_incr);
5471 incr = gsi_stmt (incr_gsi);
5473 /* Copy the points-to information if it exists. */
5474 if (DR_PTR_INFO (dr))
5476 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5477 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5479 if (ptr_incr)
5480 *ptr_incr = incr;
5482 return indx_before_incr;
5484 else
5485 gcc_unreachable ();
5489 /* Function bump_vector_ptr
5491 Increment a pointer (to a vector type) by vector-size. If requested,
5492 i.e. if PTR_INCR is given, then also connect the new increment stmt
5493 to the existing def-use update-chain of the pointer, by modifying
5494 the PTR_INCR as illustrated below:
5496 The pointer def-use update-chain before this function:
5497 DATAREF_PTR = phi (p_0, p_2)
5498 ....
5499 PTR_INCR: p_2 = DATAREF_PTR + step
5501 The pointer def-use update-chain after this function:
5502 DATAREF_PTR = phi (p_0, p_2)
5503 ....
5504 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5505 ....
5506 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5508 Input:
5509 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5510 in the loop.
5511 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5512 the loop. The increment amount across iterations is expected
5513 to be vector_size.
5514 BSI - location where the new update stmt is to be placed.
5515 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5516 BUMP - optional. The offset by which to bump the pointer. If not given,
5517 the offset is assumed to be vector_size.
5519 Output: Return NEW_DATAREF_PTR as illustrated above.
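/* For example (illustrative values only): for a 16-byte vector type and no
   explicit BUMP the update amount is TYPE_SIZE_UNIT (vectype) == 16, so the
   statement emitted at GSI is simply

     NEW_DATAREF_PTR = DATAREF_PTR + 16;

   with the data reference's points-to information copied to the new name
   (and its alignment info dropped).  */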
5523 tree
5524 bump_vector_ptr (vec_info *vinfo,
5525 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5526 stmt_vec_info stmt_info, tree bump)
5528 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5529 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5530 tree update = TYPE_SIZE_UNIT (vectype);
5531 gimple *incr_stmt;
5532 ssa_op_iter iter;
5533 use_operand_p use_p;
5534 tree new_dataref_ptr;
5536 if (bump)
5537 update = bump;
5539 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5540 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5541 else if (is_gimple_min_invariant (dataref_ptr))
5542 /* When possible avoid emitting a separate increment stmt that would
5543 force the addressed object to become addressable. */
5544 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5545 fold_build2 (MEM_REF,
5546 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5547 dataref_ptr,
5548 fold_convert (ptr_type_node, update)));
5549 else
5550 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5551 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5552 dataref_ptr, update);
5553 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5554 /* Fold the increment, avoiding excessive use-def chains that would
5555 lead to compile-time issues for passes run before the next
5556 forwprop pass, which would otherwise do this folding as well. */
5557 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5558 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5560 incr_stmt = gsi_stmt (fold_gsi);
5561 update_stmt (incr_stmt);
5564 /* Copy the points-to information if it exists. */
5565 if (DR_PTR_INFO (dr))
5567 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5568 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5571 if (!ptr_incr)
5572 return new_dataref_ptr;
5574 /* Update the vector-pointer's cross-iteration increment. */
5575 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5577 tree use = USE_FROM_PTR (use_p);
5579 if (use == dataref_ptr)
5580 SET_USE (use_p, new_dataref_ptr);
5581 else
5582 gcc_assert (operand_equal_p (use, update, 0));
5585 return new_dataref_ptr;
5589 /* Copy memory reference info such as base/clique from the SRC reference
5590 to the DEST MEM_REF. */
5592 void
5593 vect_copy_ref_info (tree dest, tree src)
5595 if (TREE_CODE (dest) != MEM_REF)
5596 return;
5598 tree src_base = src;
5599 while (handled_component_p (src_base))
5600 src_base = TREE_OPERAND (src_base, 0);
5601 if (TREE_CODE (src_base) != MEM_REF
5602 && TREE_CODE (src_base) != TARGET_MEM_REF)
5603 return;
5605 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5606 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5610 /* Function vect_create_destination_var.
5612 Create a new temporary of type VECTYPE. */
5614 tree
5615 vect_create_destination_var (tree scalar_dest, tree vectype)
5617 tree vec_dest;
5618 const char *name;
5619 char *new_name;
5620 tree type;
5621 enum vect_var_kind kind;
5623 kind = vectype
5624 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5625 ? vect_mask_var
5626 : vect_simple_var
5627 : vect_scalar_var;
5628 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5630 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5632 name = get_name (scalar_dest);
5633 if (name)
5634 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5635 else
5636 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5637 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5638 free (new_name);
5640 return vec_dest;
5643 /* Function vect_grouped_store_supported.
5645 Returns TRUE if interleave high and interleave low permutations
5646 are supported, and FALSE otherwise. */
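/* For example, for the power-of-two case with an 8-element vector mode the
   two permutations checked below correspond to the masks
     interleave high: { 0 8 1 9 2 10 3 11 }
     interleave low:  { 4 12 5 13 6 14 7 15 }
   (shown fully expanded; the code builds them in the compressed two-pattern
   encoding so that variable-length vectors are handled as well).  */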
5648 bool
5649 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5651 machine_mode mode = TYPE_MODE (vectype);
5653 /* vect_permute_store_chain requires the group size to be equal to 3 or
5654 be a power of two. */
5655 if (count != 3 && exact_log2 (count) == -1)
5657 if (dump_enabled_p ())
5658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5659 "the size of the group of accesses"
5660 " is not a power of 2 or not eqaul to 3\n");
5661 return false;
5664 /* Check that the permutation is supported. */
5665 if (VECTOR_MODE_P (mode))
5667 unsigned int i;
5668 if (count == 3)
5670 unsigned int j0 = 0, j1 = 0, j2 = 0;
5671 unsigned int i, j;
5673 unsigned int nelt;
5674 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5676 if (dump_enabled_p ())
5677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5678 "cannot handle groups of 3 stores for"
5679 " variable-length vectors\n");
5680 return false;
5683 vec_perm_builder sel (nelt, nelt, 1);
5684 sel.quick_grow (nelt);
5685 vec_perm_indices indices;
5686 for (j = 0; j < 3; j++)
5688 int nelt0 = ((3 - j) * nelt) % 3;
5689 int nelt1 = ((3 - j) * nelt + 1) % 3;
5690 int nelt2 = ((3 - j) * nelt + 2) % 3;
5691 for (i = 0; i < nelt; i++)
5693 if (3 * i + nelt0 < nelt)
5694 sel[3 * i + nelt0] = j0++;
5695 if (3 * i + nelt1 < nelt)
5696 sel[3 * i + nelt1] = nelt + j1++;
5697 if (3 * i + nelt2 < nelt)
5698 sel[3 * i + nelt2] = 0;
5700 indices.new_vector (sel, 2, nelt);
5701 if (!can_vec_perm_const_p (mode, mode, indices))
5703 if (dump_enabled_p ())
5704 dump_printf (MSG_MISSED_OPTIMIZATION,
5705 "permutation op not supported by target.\n");
5706 return false;
5709 for (i = 0; i < nelt; i++)
5711 if (3 * i + nelt0 < nelt)
5712 sel[3 * i + nelt0] = 3 * i + nelt0;
5713 if (3 * i + nelt1 < nelt)
5714 sel[3 * i + nelt1] = 3 * i + nelt1;
5715 if (3 * i + nelt2 < nelt)
5716 sel[3 * i + nelt2] = nelt + j2++;
5718 indices.new_vector (sel, 2, nelt);
5719 if (!can_vec_perm_const_p (mode, mode, indices))
5721 if (dump_enabled_p ())
5722 dump_printf (MSG_MISSED_OPTIMIZATION,
5723 "permutation op not supported by target.\n");
5724 return false;
5727 return true;
5729 else
5731 /* If length is not equal to 3 then only power of 2 is supported. */
5732 gcc_assert (pow2p_hwi (count));
5733 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5735 /* The encoding has 2 interleaved stepped patterns. */
5736 if (!multiple_p (nelt, 2))
5737 return false;
5738 vec_perm_builder sel (nelt, 2, 3);
5739 sel.quick_grow (6);
5740 for (i = 0; i < 3; i++)
5742 sel[i * 2] = i;
5743 sel[i * 2 + 1] = i + nelt;
5745 vec_perm_indices indices (sel, 2, nelt);
5746 if (can_vec_perm_const_p (mode, mode, indices))
5748 for (i = 0; i < 6; i++)
5749 sel[i] += exact_div (nelt, 2);
5750 indices.new_vector (sel, 2, nelt);
5751 if (can_vec_perm_const_p (mode, mode, indices))
5752 return true;
5757 if (dump_enabled_p ())
5758 dump_printf (MSG_MISSED_OPTIMIZATION,
5759 "permutation op not supported by target.\n");
5760 return false;
5763 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5764 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5766 internal_fn
5767 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5768 bool masked_p)
5770 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5771 vec_mask_len_store_lanes_optab, vectype,
5772 count))
5773 return IFN_MASK_LEN_STORE_LANES;
5774 else if (masked_p)
5776 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5777 vec_mask_store_lanes_optab, vectype,
5778 count))
5779 return IFN_MASK_STORE_LANES;
5781 else
5783 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5784 vec_store_lanes_optab, vectype, count))
5785 return IFN_STORE_LANES;
5787 return IFN_LAST;
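/* A sketch of typical use (hypothetical caller code, for illustration):

     internal_fn lanes_ifn
       = vect_store_lanes_supported (vectype, group_size, masked_p);
     if (lanes_ifn != IFN_LAST)
       ... emit an IFN_{,MASK_,MASK_LEN_}STORE_LANES call ...
     else
       ... fall back to vect_permute_store_chain and contiguous stores ...  */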
5791 /* Function vect_permute_store_chain.
5793 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5794 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5795 the data correctly for the stores. Return the final references for stores
5796 in RESULT_CHAIN.
5798 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5799 The input is 4 vectors each containing 8 elements. We assign a number to
5800 each element, the input sequence is:
5802 1st vec: 0 1 2 3 4 5 6 7
5803 2nd vec: 8 9 10 11 12 13 14 15
5804 3rd vec: 16 17 18 19 20 21 22 23
5805 4th vec: 24 25 26 27 28 29 30 31
5807 The output sequence should be:
5809 1st vec: 0 8 16 24 1 9 17 25
5810 2nd vec: 2 10 18 26 3 11 19 27
5811 3rd vec: 4 12 20 28 5 13 21 29
5812 4th vec: 6 14 22 30 7 15 23 31
5814 i.e., we interleave the contents of the four vectors in their order.
5816 We use interleave_high/low instructions to create such output. The input of
5817 each interleave_high/low operation is two vectors:
5818 1st vec 2nd vec
5819 0 1 2 3 4 5 6 7
5820 the even elements of the result vector are obtained left-to-right from the
5821 high/low elements of the first vector. The odd elements of the result are
5822 obtained left-to-right from the high/low elements of the second vector.
5823 The output of interleave_high will be: 0 4 1 5
5824 and of interleave_low: 2 6 3 7
5827 The permutation is done in log LENGTH stages. In each stage interleave_high
5828 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5829 where the first argument is taken from the first half of DR_CHAIN and the
5830 second argument from its second half.
5831 In our example,
5833 I1: interleave_high (1st vec, 3rd vec)
5834 I2: interleave_low (1st vec, 3rd vec)
5835 I3: interleave_high (2nd vec, 4th vec)
5836 I4: interleave_low (2nd vec, 4th vec)
5838 The output for the first stage is:
5840 I1: 0 16 1 17 2 18 3 19
5841 I2: 4 20 5 21 6 22 7 23
5842 I3: 8 24 9 25 10 26 11 27
5843 I4: 12 28 13 29 14 30 15 31
5845 The output of the second stage, i.e. the final result is:
5847 I1: 0 8 16 24 1 9 17 25
5848 I2: 2 10 18 26 3 11 19 27
5849 I3: 4 12 20 28 5 13 21 29
5850 I4: 6 14 22 30 7 15 23 31. */
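/* Continuing the example above (for illustration only): with V8HI vectors
   the power-of-two path uses the masks
     perm_mask_high = { 0 8 1 9 2 10 3 11 }
     perm_mask_low  = { 4 12 5 13 6 14 7 15 }
   and runs log2 (LENGTH) == 2 stages of VEC_PERM_EXPRs, producing the
   I1..I4 vectors shown for each stage above.  */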
5852 void
5853 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5854 unsigned int length,
5855 stmt_vec_info stmt_info,
5856 gimple_stmt_iterator *gsi,
5857 vec<tree> *result_chain)
5859 tree vect1, vect2, high, low;
5860 gimple *perm_stmt;
5861 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5862 tree perm_mask_low, perm_mask_high;
5863 tree data_ref;
5864 tree perm3_mask_low, perm3_mask_high;
5865 unsigned int i, j, n, log_length = exact_log2 (length);
5867 result_chain->quick_grow (length);
5868 memcpy (result_chain->address (), dr_chain.address (),
5869 length * sizeof (tree));
5871 if (length == 3)
5873 /* vect_grouped_store_supported ensures that this is constant. */
5874 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5875 unsigned int j0 = 0, j1 = 0, j2 = 0;
5877 vec_perm_builder sel (nelt, nelt, 1);
5878 sel.quick_grow (nelt);
5879 vec_perm_indices indices;
5880 for (j = 0; j < 3; j++)
5882 int nelt0 = ((3 - j) * nelt) % 3;
5883 int nelt1 = ((3 - j) * nelt + 1) % 3;
5884 int nelt2 = ((3 - j) * nelt + 2) % 3;
5886 for (i = 0; i < nelt; i++)
5888 if (3 * i + nelt0 < nelt)
5889 sel[3 * i + nelt0] = j0++;
5890 if (3 * i + nelt1 < nelt)
5891 sel[3 * i + nelt1] = nelt + j1++;
5892 if (3 * i + nelt2 < nelt)
5893 sel[3 * i + nelt2] = 0;
5895 indices.new_vector (sel, 2, nelt);
5896 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5898 for (i = 0; i < nelt; i++)
5900 if (3 * i + nelt0 < nelt)
5901 sel[3 * i + nelt0] = 3 * i + nelt0;
5902 if (3 * i + nelt1 < nelt)
5903 sel[3 * i + nelt1] = 3 * i + nelt1;
5904 if (3 * i + nelt2 < nelt)
5905 sel[3 * i + nelt2] = nelt + j2++;
5907 indices.new_vector (sel, 2, nelt);
5908 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5910 vect1 = dr_chain[0];
5911 vect2 = dr_chain[1];
5913 /* Create interleaving stmt:
5914 low = VEC_PERM_EXPR <vect1, vect2,
5915 {j, nelt, *, j + 1, nelt + j + 1, *,
5916 j + 2, nelt + j + 2, *, ...}> */
5917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5919 vect2, perm3_mask_low);
5920 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5922 vect1 = data_ref;
5923 vect2 = dr_chain[2];
5924 /* Create interleaving stmt:
5925 low = VEC_PERM_EXPR <vect1, vect2,
5926 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5927 6, 7, nelt + j + 2, ...}> */
5928 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5929 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5930 vect2, perm3_mask_high);
5931 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5932 (*result_chain)[j] = data_ref;
5935 else
5937 /* If length is not equal to 3 then only power of 2 is supported. */
5938 gcc_assert (pow2p_hwi (length));
5940 /* The encoding has 2 interleaved stepped patterns. */
5941 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5942 vec_perm_builder sel (nelt, 2, 3);
5943 sel.quick_grow (6);
5944 for (i = 0; i < 3; i++)
5946 sel[i * 2] = i;
5947 sel[i * 2 + 1] = i + nelt;
5949 vec_perm_indices indices (sel, 2, nelt);
5950 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5952 for (i = 0; i < 6; i++)
5953 sel[i] += exact_div (nelt, 2);
5954 indices.new_vector (sel, 2, nelt);
5955 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5957 for (i = 0, n = log_length; i < n; i++)
5959 for (j = 0; j < length/2; j++)
5961 vect1 = dr_chain[j];
5962 vect2 = dr_chain[j+length/2];
5964 /* Create interleaving stmt:
5965 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5966 ...}> */
5967 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5968 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5969 vect2, perm_mask_high);
5970 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5971 (*result_chain)[2*j] = high;
5973 /* Create interleaving stmt:
5974 low = VEC_PERM_EXPR <vect1, vect2,
5975 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5976 ...}> */
5977 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5978 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5979 vect2, perm_mask_low);
5980 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5981 (*result_chain)[2*j+1] = low;
5983 memcpy (dr_chain.address (), result_chain->address (),
5984 length * sizeof (tree));
5989 /* Function vect_setup_realignment
5991 This function is called when vectorizing an unaligned load using
5992 the dr_explicit_realign[_optimized] scheme.
5993 This function generates the following code at the loop prolog:
5995 p = initial_addr;
5996 x msq_init = *(floor(p)); # prolog load
5997 realignment_token = call target_builtin;
5998 loop:
5999 x msq = phi (msq_init, ---)
6001 The stmts marked with x are generated only for the case of
6002 dr_explicit_realign_optimized.
6004 The code above sets up a new (vector) pointer, pointing to the first
6005 location accessed by STMT_INFO, and a "floor-aligned" load using that
6006 pointer. It also generates code to compute the "realignment-token"
6007 (if the relevant target hook was defined), and creates a phi-node at the
6008 loop-header bb whose arguments are the result of the prolog-load (created
6009 by this function) and the result of a load that takes place in the loop
6010 (to be created by the caller to this function).
6012 For the case of dr_explicit_realign_optimized:
6013 The caller to this function uses the phi-result (msq) to create the
6014 realignment code inside the loop, and sets up the missing phi argument,
6015 as follows:
6016 loop:
6017 msq = phi (msq_init, lsq)
6018 lsq = *(floor(p')); # load in loop
6019 result = realign_load (msq, lsq, realignment_token);
6021 For the case of dr_explicit_realign:
6022 loop:
6023 msq = *(floor(p)); # load in loop
6024 p' = p + (VS-1);
6025 lsq = *(floor(p')); # load in loop
6026 result = realign_load (msq, lsq, realignment_token);
6028 Input:
6029 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6030 a memory location that may be unaligned.
6031 BSI - place where new code is to be inserted.
6032 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6033 is used.
6035 Output:
6036 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6037 target hook, if defined.
6038 Return value - the result of the loop-header phi node. */
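/* For illustration only: with a target vector alignment of 16 bytes the
   "floor (p)" loads above are implemented by masking the low address bits,
   i.e. the prolog emits roughly

     new_temp = ptr & (0 - 16);   # clear the low four address bits
     msq_init = *new_temp;        # prolog load from the aligned address

   and the phi created in step 5 below merges msq_init with the in-loop
   load lsq.  */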
6040 tree
6041 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6042 gimple_stmt_iterator *gsi, tree *realignment_token,
6043 enum dr_alignment_support alignment_support_scheme,
6044 tree init_addr,
6045 class loop **at_loop)
6047 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6048 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6049 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6050 struct data_reference *dr = dr_info->dr;
6051 class loop *loop = NULL;
6052 edge pe = NULL;
6053 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6054 tree vec_dest;
6055 gimple *inc;
6056 tree ptr;
6057 tree data_ref;
6058 basic_block new_bb;
6059 tree msq_init = NULL_TREE;
6060 tree new_temp;
6061 gphi *phi_stmt;
6062 tree msq = NULL_TREE;
6063 gimple_seq stmts = NULL;
6064 bool compute_in_loop = false;
6065 bool nested_in_vect_loop = false;
6066 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6067 class loop *loop_for_initial_load = NULL;
6069 if (loop_vinfo)
6071 loop = LOOP_VINFO_LOOP (loop_vinfo);
6072 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6075 gcc_assert (alignment_support_scheme == dr_explicit_realign
6076 || alignment_support_scheme == dr_explicit_realign_optimized);
6078 /* We need to generate three things:
6079 1. the misalignment computation
6080 2. the extra vector load (for the optimized realignment scheme).
6081 3. the phi node for the two vectors from which the realignment is
6082 done (for the optimized realignment scheme). */
6084 /* 1. Determine where to generate the misalignment computation.
6086 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6087 calculation will be generated by this function, outside the loop (in the
6088 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6089 caller, inside the loop.
6091 Background: If the misalignment remains fixed throughout the iterations of
6092 the loop, then both realignment schemes are applicable, and also the
6093 misalignment computation can be done outside LOOP. This is because we are
6094 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6095 are a multiple of VS (the Vector Size), and therefore the misalignment in
6096 different vectorized LOOP iterations is always the same.
6097 The problem arises only if the memory access is in an inner-loop nested
6098 inside LOOP, which is now being vectorized using outer-loop vectorization.
6099 This is the only case when the misalignment of the memory access may not
6100 remain fixed throughout the iterations of the inner-loop (as explained in
6101 detail in vect_supportable_dr_alignment). In this case, not only is the
6102 optimized realignment scheme not applicable, but also the misalignment
6103 computation (and generation of the realignment token that is passed to
6104 REALIGN_LOAD) have to be done inside the loop.
6106 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6107 or not, which in turn determines if the misalignment is computed inside
6108 the inner-loop, or outside LOOP. */
6110 if (init_addr != NULL_TREE || !loop_vinfo)
6112 compute_in_loop = true;
6113 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6117 /* 2. Determine where to generate the extra vector load.
6119 For the optimized realignment scheme, instead of generating two vector
6120 loads in each iteration, we generate a single extra vector load in the
6121 preheader of the loop, and in each iteration reuse the result of the
6122 vector load from the previous iteration. In case the memory access is in
6123 an inner-loop nested inside LOOP, which is now being vectorized using
6124 outer-loop vectorization, we need to determine whether this initial vector
6125 load should be generated at the preheader of the inner-loop, or can be
6126 generated at the preheader of LOOP. If the memory access has no evolution
6127 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6128 to be generated inside LOOP (in the preheader of the inner-loop). */
6130 if (nested_in_vect_loop)
6132 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6133 bool invariant_in_outerloop =
6134 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6135 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6137 else
6138 loop_for_initial_load = loop;
6139 if (at_loop)
6140 *at_loop = loop_for_initial_load;
6142 tree vuse = NULL_TREE;
6143 if (loop_for_initial_load)
6145 pe = loop_preheader_edge (loop_for_initial_load);
6146 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6147 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6149 if (!vuse)
6150 vuse = gimple_vuse (gsi_stmt (*gsi));
6152 /* 3. For the case of the optimized realignment, create the first vector
6153 load at the loop preheader. */
6155 if (alignment_support_scheme == dr_explicit_realign_optimized)
6157 /* Create msq_init = *(floor(p1)) in the loop preheader */
6158 gassign *new_stmt;
6160 gcc_assert (!compute_in_loop);
6161 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6162 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6163 loop_for_initial_load, NULL_TREE,
6164 &init_addr, NULL, &inc, true);
6165 if (TREE_CODE (ptr) == SSA_NAME)
6166 new_temp = copy_ssa_name (ptr);
6167 else
6168 new_temp = make_ssa_name (TREE_TYPE (ptr));
6169 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6170 tree type = TREE_TYPE (ptr);
6171 new_stmt = gimple_build_assign
6172 (new_temp, BIT_AND_EXPR, ptr,
6173 fold_build2 (MINUS_EXPR, type,
6174 build_int_cst (type, 0),
6175 build_int_cst (type, align)));
6176 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6177 gcc_assert (!new_bb);
6178 data_ref
6179 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6180 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6181 vect_copy_ref_info (data_ref, DR_REF (dr));
6182 new_stmt = gimple_build_assign (vec_dest, data_ref);
6183 new_temp = make_ssa_name (vec_dest, new_stmt);
6184 gimple_assign_set_lhs (new_stmt, new_temp);
6185 gimple_set_vuse (new_stmt, vuse);
6186 if (pe)
6188 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6189 gcc_assert (!new_bb);
6191 else
6192 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6194 msq_init = gimple_assign_lhs (new_stmt);
6197 /* 4. Create realignment token using a target builtin, if available.
6198 It is done either inside the containing loop, or before LOOP (as
6199 determined above). */
6201 if (targetm.vectorize.builtin_mask_for_load)
6203 gcall *new_stmt;
6204 tree builtin_decl;
6206 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
6207 if (!init_addr)
6209 /* Generate the INIT_ADDR computation outside LOOP. */
6210 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6211 stmt_info, &stmts,
6212 NULL_TREE);
6213 if (loop)
6215 pe = loop_preheader_edge (loop);
6216 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6217 gcc_assert (!new_bb);
6219 else
6220 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6223 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6224 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6225 vec_dest =
6226 vect_create_destination_var (scalar_dest,
6227 gimple_call_return_type (new_stmt));
6228 new_temp = make_ssa_name (vec_dest, new_stmt);
6229 gimple_call_set_lhs (new_stmt, new_temp);
6231 if (compute_in_loop)
6232 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6233 else
6235 /* Generate the misalignment computation outside LOOP. */
6236 pe = loop_preheader_edge (loop);
6237 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6238 gcc_assert (!new_bb);
6241 *realignment_token = gimple_call_lhs (new_stmt);
6243 /* The result of the CALL_EXPR to this builtin is determined from
6244 the value of the parameter and no global variables are touched
6245 which makes the builtin a "const" function. Requiring the
6246 builtin to have the "const" attribute makes it unnecessary
6247 to call mark_call_clobbered. */
6248 gcc_assert (TREE_READONLY (builtin_decl));
6251 if (alignment_support_scheme == dr_explicit_realign)
6252 return msq;
6254 gcc_assert (!compute_in_loop);
6255 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6258 /* 5. Create msq = phi <msq_init, lsq> in loop */
6260 pe = loop_preheader_edge (containing_loop);
6261 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6262 msq = make_ssa_name (vec_dest);
6263 phi_stmt = create_phi_node (msq, containing_loop->header);
6264 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6266 return msq;
6270 /* Function vect_grouped_load_supported.
6272 COUNT is the size of the load group (the number of statements plus the
6273 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6274 only one statement, with a gap of COUNT - 1.
6276 Returns true if a suitable permute exists. */
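/* A sketch of typical use (hypothetical caller code, for illustration):

     if (!vect_grouped_load_supported (vectype, single_element_p,
                                       DR_GROUP_SIZE (first_stmt_info)))
       ... fall back to elementwise or strided accesses ...  */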
6278 bool
6279 vect_grouped_load_supported (tree vectype, bool single_element_p,
6280 unsigned HOST_WIDE_INT count)
6282 machine_mode mode = TYPE_MODE (vectype);
6284 /* If this is single-element interleaving with an element distance
6285 that leaves unused vector loads around, punt - we would at least create
6286 very sub-optimal code in that case (and blow up memory,
6287 see PR65518). */
6288 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6292 "single-element interleaving not supported "
6293 "for not adjacent vector loads\n");
6294 return false;
6297 /* vect_permute_load_chain requires the group size to be equal to 3 or
6298 be a power of two. */
6299 if (count != 3 && exact_log2 (count) == -1)
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303 "the size of the group of accesses"
6304 " is not a power of 2 or not equal to 3\n");
6305 return false;
6308 /* Check that the permutation is supported. */
6309 if (VECTOR_MODE_P (mode))
6311 unsigned int i, j;
6312 if (count == 3)
6314 unsigned int nelt;
6315 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6317 if (dump_enabled_p ())
6318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6319 "cannot handle groups of 3 loads for"
6320 " variable-length vectors\n");
6321 return false;
6324 vec_perm_builder sel (nelt, nelt, 1);
6325 sel.quick_grow (nelt);
6326 vec_perm_indices indices;
6327 unsigned int k;
6328 for (k = 0; k < 3; k++)
6330 for (i = 0; i < nelt; i++)
6331 if (3 * i + k < 2 * nelt)
6332 sel[i] = 3 * i + k;
6333 else
6334 sel[i] = 0;
6335 indices.new_vector (sel, 2, nelt);
6336 if (!can_vec_perm_const_p (mode, mode, indices))
6338 if (dump_enabled_p ())
6339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6340 "shuffle of 3 loads is not supported by"
6341 " target\n");
6342 return false;
6344 for (i = 0, j = 0; i < nelt; i++)
6345 if (3 * i + k < 2 * nelt)
6346 sel[i] = i;
6347 else
6348 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6349 indices.new_vector (sel, 2, nelt);
6350 if (!can_vec_perm_const_p (mode, mode, indices))
6352 if (dump_enabled_p ())
6353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6354 "shuffle of 3 loads is not supported by"
6355 " target\n");
6356 return false;
6359 return true;
6361 else
6363 /* If length is not equal to 3 then only power of 2 is supported. */
6364 gcc_assert (pow2p_hwi (count));
6365 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6367 /* The encoding has a single stepped pattern. */
6368 vec_perm_builder sel (nelt, 1, 3);
6369 sel.quick_grow (3);
6370 for (i = 0; i < 3; i++)
6371 sel[i] = i * 2;
6372 vec_perm_indices indices (sel, 2, nelt);
6373 if (can_vec_perm_const_p (mode, mode, indices))
6375 for (i = 0; i < 3; i++)
6376 sel[i] = i * 2 + 1;
6377 indices.new_vector (sel, 2, nelt);
6378 if (can_vec_perm_const_p (mode, mode, indices))
6379 return true;
6384 if (dump_enabled_p ())
6385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6386 "extract even/odd not supported by target\n");
6387 return false;
6390 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6391 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6393 internal_fn
6394 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6395 bool masked_p)
6397 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6398 vec_mask_len_load_lanes_optab, vectype,
6399 count))
6400 return IFN_MASK_LEN_LOAD_LANES;
6401 else if (masked_p)
6403 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6404 vec_mask_load_lanes_optab, vectype,
6405 count))
6406 return IFN_MASK_LOAD_LANES;
6408 else
6410 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6411 vectype, count))
6412 return IFN_LOAD_LANES;
6414 return IFN_LAST;
6417 /* Function vect_permute_load_chain.
6419 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6420 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6421 the input data correctly. Return the final references for loads in
6422 RESULT_CHAIN.
6424 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6425 The input is 4 vectors each containing 8 elements. We assign a number to each
6426 element, the input sequence is:
6428 1st vec: 0 1 2 3 4 5 6 7
6429 2nd vec: 8 9 10 11 12 13 14 15
6430 3rd vec: 16 17 18 19 20 21 22 23
6431 4th vec: 24 25 26 27 28 29 30 31
6433 The output sequence should be:
6435 1st vec: 0 4 8 12 16 20 24 28
6436 2nd vec: 1 5 9 13 17 21 25 29
6437 3rd vec: 2 6 10 14 18 22 26 30
6438 4th vec: 3 7 11 15 19 23 27 31
6440 i.e., the first output vector should contain the first elements of each
6441 interleaving group, etc.
6443 We use extract_even/odd instructions to create such output. The input of
6444 each extract_even/odd operation is two vectors
6445 1st vec 2nd vec
6446 0 1 2 3 4 5 6 7
6448 and the output is the vector of extracted even/odd elements. The output of
6449 extract_even will be: 0 2 4 6
6450 and of extract_odd: 1 3 5 7
6453 The permutation is done in log LENGTH stages. In each stage extract_even
6454 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6455 their order. In our example,
6457 E1: extract_even (1st vec, 2nd vec)
6458 E2: extract_odd (1st vec, 2nd vec)
6459 E3: extract_even (3rd vec, 4th vec)
6460 E4: extract_odd (3rd vec, 4th vec)
6462 The output for the first stage will be:
6464 E1: 0 2 4 6 8 10 12 14
6465 E2: 1 3 5 7 9 11 13 15
6466 E3: 16 18 20 22 24 26 28 30
6467 E4: 17 19 21 23 25 27 29 31
6469 In order to proceed and create the correct sequence for the next stage (or
6470 for the correct output, if the second stage is the last one, as in our
6471 example), we first put the output of extract_even operation and then the
6472 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6473 The input for the second stage is:
6475 1st vec (E1): 0 2 4 6 8 10 12 14
6476 2nd vec (E3): 16 18 20 22 24 26 28 30
6477 3rd vec (E2): 1 3 5 7 9 11 13 15
6478 4th vec (E4): 17 19 21 23 25 27 29 31
6480 The output of the second stage:
6482 E1: 0 4 8 12 16 20 24 28
6483 E2: 2 6 10 14 18 22 26 30
6484 E3: 1 5 9 13 17 21 25 29
6485 E4: 3 7 11 15 19 23 27 31
6487 And RESULT_CHAIN after reordering:
6489 1st vec (E1): 0 4 8 12 16 20 24 28
6490 2nd vec (E3): 1 5 9 13 17 21 25 29
6491 3rd vec (E2): 2 6 10 14 18 22 26 30
6492 4th vec (E4): 3 7 11 15 19 23 27 31. */
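/* Continuing the example above (for illustration only): with V8HI vectors
   the power-of-two path uses the masks
     perm_mask_even = { 0 2 4 6 8 10 12 14 }
     perm_mask_odd  = { 1 3 5 7 9 11 13 15 }
   and runs log2 (LENGTH) == 2 stages of extract even/odd VEC_PERM_EXPRs,
   producing the E1..E4 vectors shown above.  */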
6494 static void
6495 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6496 unsigned int length,
6497 stmt_vec_info stmt_info,
6498 gimple_stmt_iterator *gsi,
6499 vec<tree> *result_chain)
6501 tree data_ref, first_vect, second_vect;
6502 tree perm_mask_even, perm_mask_odd;
6503 tree perm3_mask_low, perm3_mask_high;
6504 gimple *perm_stmt;
6505 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6506 unsigned int i, j, log_length = exact_log2 (length);
6508 result_chain->quick_grow (length);
6509 memcpy (result_chain->address (), dr_chain.address (),
6510 length * sizeof (tree));
6512 if (length == 3)
6514 /* vect_grouped_load_supported ensures that this is constant. */
6515 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6516 unsigned int k;
6518 vec_perm_builder sel (nelt, nelt, 1);
6519 sel.quick_grow (nelt);
6520 vec_perm_indices indices;
6521 for (k = 0; k < 3; k++)
6523 for (i = 0; i < nelt; i++)
6524 if (3 * i + k < 2 * nelt)
6525 sel[i] = 3 * i + k;
6526 else
6527 sel[i] = 0;
6528 indices.new_vector (sel, 2, nelt);
6529 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6531 for (i = 0, j = 0; i < nelt; i++)
6532 if (3 * i + k < 2 * nelt)
6533 sel[i] = i;
6534 else
6535 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6536 indices.new_vector (sel, 2, nelt);
6537 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6539 first_vect = dr_chain[0];
6540 second_vect = dr_chain[1];
6542 /* Create interleaving stmt (low part of):
6543 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6544 ...}> */
6545 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6546 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6547 second_vect, perm3_mask_low);
6548 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6550 /* Create interleaving stmt (high part of):
6551 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6552 ...}> */
6553 first_vect = data_ref;
6554 second_vect = dr_chain[2];
6555 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6556 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6557 second_vect, perm3_mask_high);
6558 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6559 (*result_chain)[k] = data_ref;
6562 else
6564 /* If length is not equal to 3 then only power of 2 is supported. */
6565 gcc_assert (pow2p_hwi (length));
6567 /* The encoding has a single stepped pattern. */
6568 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6569 vec_perm_builder sel (nelt, 1, 3);
6570 sel.quick_grow (3);
6571 for (i = 0; i < 3; ++i)
6572 sel[i] = i * 2;
6573 vec_perm_indices indices (sel, 2, nelt);
6574 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6576 for (i = 0; i < 3; ++i)
6577 sel[i] = i * 2 + 1;
6578 indices.new_vector (sel, 2, nelt);
6579 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6581 for (i = 0; i < log_length; i++)
6583 for (j = 0; j < length; j += 2)
6585 first_vect = dr_chain[j];
6586 second_vect = dr_chain[j+1];
6588 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6589 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6590 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6591 first_vect, second_vect,
6592 perm_mask_even);
6593 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6594 (*result_chain)[j/2] = data_ref;
6596 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6597 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6598 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6599 first_vect, second_vect,
6600 perm_mask_odd);
6601 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6602 (*result_chain)[j/2+length/2] = data_ref;
6604 memcpy (dr_chain.address (), result_chain->address (),
6605 length * sizeof (tree));
6610 /* Function vect_shift_permute_load_chain.
6612 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6613 sequence of stmts to reorder the input data accordingly.
6614 Return the final references for loads in RESULT_CHAIN.
6615 Return true if successful, false otherwise.
6617 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6618 The input is 3 vectors each containing 8 elements. We assign a
6619 number to each element, the input sequence is:
6621 1st vec: 0 1 2 3 4 5 6 7
6622 2nd vec: 8 9 10 11 12 13 14 15
6623 3rd vec: 16 17 18 19 20 21 22 23
6625 The output sequence should be:
6627 1st vec: 0 3 6 9 12 15 18 21
6628 2nd vec: 1 4 7 10 13 16 19 22
6629 3rd vec: 2 5 8 11 14 17 20 23
6631 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6633 First we shuffle all 3 vectors to get the correct element order:
6635 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6636 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6637 3rd vec: (16 19 22) (17 20 23) (18 21)
6639 Next we unite and shift vector 3 times:
6641 1st step:
6642 shift right by 6 the concatenation of:
6643 "1st vec" and "2nd vec"
6644 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6645 "2nd vec" and "3rd vec"
6646 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6647 "3rd vec" and "1st vec"
6648 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6649 | New vectors |
6651 So that now new vectors are:
6653 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6654 2nd vec: (10 13) (16 19 22) (17 20 23)
6655 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6657 2nd step:
6658 shift right by 5 the concatenation of:
6659 "1st vec" and "3rd vec"
6660 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6661 "2nd vec" and "1st vec"
6662 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6663 "3rd vec" and "2nd vec"
6664 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6665 | New vectors |
6667 So that now new vectors are:
6669 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6670 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6671 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6673 3rd step:
6674 shift right by 5 the concatenation of:
6675 "1st vec" and "1st vec"
6676 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6677 shift right by 3 the concatenation of:
6678 "2nd vec" and "2nd vec"
6679 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6680 | New vectors |
6682 So that now all vectors are READY:
6683 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6684 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6685 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6687 This algorithm is faster than one in vect_permute_load_chain if:
6688 1. "shift of a concatination" is faster than general permutation.
6689 This is usually so.
6690 2. The TARGET machine can't execute vector instructions in parallel.
6691 This is because each step of the algorithm depends on the previous one.
6692 The algorithm in vect_permute_load_chain is much more parallel.
6694 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
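/* For illustration of the power-of-two branch below, with 8-element vectors
   and LENGTH == 2 (a case not covered by the example above):
     perm2_mask1 = { 0 2 4 6 1 3 5 7 }     # even lanes first, then odd
     perm2_mask2 = { 1 3 5 7 0 2 4 6 }     # odd lanes first, then even
     shift1_mask = { 4 5 6 7 8 9 10 11 }   # shift of a concatenation
     select_mask = { 0 1 2 3 12 13 14 15 } # select halves of two vectors
   so the first result vector collects the even-indexed scalar elements of
   the group and the second collects the odd-indexed ones.  */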
6697 static bool
6698 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6699 unsigned int length,
6700 stmt_vec_info stmt_info,
6701 gimple_stmt_iterator *gsi,
6702 vec<tree> *result_chain)
6704 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6705 tree perm2_mask1, perm2_mask2, perm3_mask;
6706 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6707 gimple *perm_stmt;
6709 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6710 machine_mode vmode = TYPE_MODE (vectype);
6711 unsigned int i;
6712 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6714 unsigned HOST_WIDE_INT nelt, vf;
6715 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6716 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6717 /* Not supported for variable-length vectors. */
6718 return false;
6720 vec_perm_builder sel (nelt, nelt, 1);
6721 sel.quick_grow (nelt);
6723 result_chain->quick_grow (length);
6724 memcpy (result_chain->address (), dr_chain.address (),
6725 length * sizeof (tree));
6727 if (pow2p_hwi (length) && vf > 4)
6729 unsigned int j, log_length = exact_log2 (length);
6730 for (i = 0; i < nelt / 2; ++i)
6731 sel[i] = i * 2;
6732 for (i = 0; i < nelt / 2; ++i)
6733 sel[nelt / 2 + i] = i * 2 + 1;
6734 vec_perm_indices indices (sel, 2, nelt);
6735 if (!can_vec_perm_const_p (vmode, vmode, indices))
6737 if (dump_enabled_p ())
6738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6739 "shuffle of 2 fields structure is not \
6740 supported by target\n");
6741 return false;
6743 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6745 for (i = 0; i < nelt / 2; ++i)
6746 sel[i] = i * 2 + 1;
6747 for (i = 0; i < nelt / 2; ++i)
6748 sel[nelt / 2 + i] = i * 2;
6749 indices.new_vector (sel, 2, nelt);
6750 if (!can_vec_perm_const_p (vmode, vmode, indices))
6752 if (dump_enabled_p ())
6753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6754 "shuffle of 2 fields structure is not \
6755 supported by target\n");
6756 return false;
6758 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6760 /* Generating permutation constant to shift all elements.
6761 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6762 for (i = 0; i < nelt; i++)
6763 sel[i] = nelt / 2 + i;
6764 indices.new_vector (sel, 2, nelt);
6765 if (!can_vec_perm_const_p (vmode, vmode, indices))
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "shift permutation is not supported by target\n");
6770 return false;
6772 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6774 /* Generating permutation constant to select vector from 2.
6775 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6776 for (i = 0; i < nelt / 2; i++)
6777 sel[i] = i;
6778 for (i = nelt / 2; i < nelt; i++)
6779 sel[i] = nelt + i;
6780 indices.new_vector (sel, 2, nelt);
6781 if (!can_vec_perm_const_p (vmode, vmode, indices))
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785 "select is not supported by target\n");
6786 return false;
6788 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6790 for (i = 0; i < log_length; i++)
6792 for (j = 0; j < length; j += 2)
6794 first_vect = dr_chain[j];
6795 second_vect = dr_chain[j + 1];
6797 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6798 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6799 first_vect, first_vect,
6800 perm2_mask1);
6801 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6802 vect[0] = data_ref;
6804 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6805 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6806 second_vect, second_vect,
6807 perm2_mask2);
6808 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6809 vect[1] = data_ref;
6811 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6812 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6813 vect[0], vect[1], shift1_mask);
6814 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6815 (*result_chain)[j/2 + length/2] = data_ref;
6817 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6818 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6819 vect[0], vect[1], select_mask);
6820 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6821 (*result_chain)[j/2] = data_ref;
6823 memcpy (dr_chain.address (), result_chain->address (),
6824 length * sizeof (tree));
6826 return true;
6828 if (length == 3 && vf > 2)
6830 unsigned int k = 0, l = 0;
6832 /* Generating permutation constant to get all elements in right order.
6833 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6834 for (i = 0; i < nelt; i++)
6836 if (3 * k + (l % 3) >= nelt)
6838 k = 0;
6839 l += (3 - (nelt % 3));
6841 sel[i] = 3 * k + (l % 3);
6842 k++;
6844 vec_perm_indices indices (sel, 2, nelt);
6845 if (!can_vec_perm_const_p (vmode, vmode, indices))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "shuffle of 3 fields structure is not \
6850 supported by target\n");
6851 return false;
6853 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6855 /* Generating permutation constant to shift all elements.
6856 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6857 for (i = 0; i < nelt; i++)
6858 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6859 indices.new_vector (sel, 2, nelt);
6860 if (!can_vec_perm_const_p (vmode, vmode, indices))
6862 if (dump_enabled_p ())
6863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6864 "shift permutation is not supported by target\n");
6865 return false;
6867 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6869 /* Generating permutation constant to shift all elements.
6870 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6871 for (i = 0; i < nelt; i++)
6872 sel[i] = 2 * (nelt / 3) + 1 + i;
6873 indices.new_vector (sel, 2, nelt);
6874 if (!can_vec_perm_const_p (vmode, vmode, indices))
6876 if (dump_enabled_p ())
6877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6878 "shift permutation is not supported by target\n");
6879 return false;
6881 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6883 /* Generating permutation constant to shift all elements.
6884 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6885 for (i = 0; i < nelt; i++)
6886 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6887 indices.new_vector (sel, 2, nelt);
6888 if (!can_vec_perm_const_p (vmode, vmode, indices))
6890 if (dump_enabled_p ())
6891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6892 "shift permutation is not supported by target\n");
6893 return false;
6895 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6897 /* Generating permutation constant to shift all elements.
6898 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6899 for (i = 0; i < nelt; i++)
6900 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6901 indices.new_vector (sel, 2, nelt);
6902 if (!can_vec_perm_const_p (vmode, vmode, indices))
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "shift permutation is not supported by target\n");
6907 return false;
6909 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6911 for (k = 0; k < 3; k++)
6913 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6914 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6915 dr_chain[k], dr_chain[k],
6916 perm3_mask);
6917 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6918 vect[k] = data_ref;
6921 for (k = 0; k < 3; k++)
6923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6925 vect[k % 3], vect[(k + 1) % 3],
6926 shift1_mask);
6927 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6928 vect_shift[k] = data_ref;
6931 for (k = 0; k < 3; k++)
6933 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6934 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6935 vect_shift[(4 - k) % 3],
6936 vect_shift[(3 - k) % 3],
6937 shift2_mask);
6938 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6939 vect[k] = data_ref;
6942 (*result_chain)[3 - (nelt % 3)] = vect[2];
6944 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6945 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6946 vect[0], shift3_mask);
6947 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6948 (*result_chain)[nelt % 3] = data_ref;
6950 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6951 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6952 vect[1], shift4_mask);
6953 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6954 (*result_chain)[0] = data_ref;
6955 return true;
6957 return false;
6960 /* Function vect_transform_grouped_load.
6962 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6963 to perform their permutation and ascribe the resulting vectorized statements to
6964 the scalar statements.
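/* Roughly: DR_CHAIN holds the SIZE consecutively loaded vectors covering one
   vector iteration of the interleaved group; after permutation RESULT_CHAIN
   holds one vector per scalar load of the group, in group order, and
   vect_record_grouped_load_vectors attaches each of them to its scalar
   statement.  */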
6967 void
6968 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6969 vec<tree> dr_chain,
6970 int size, gimple_stmt_iterator *gsi)
6972 machine_mode mode;
6973 vec<tree> result_chain = vNULL;
6975 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6976 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6977 vectors that are ready for vector computation. */
6978 result_chain.create (size);
6980 /* If the reassociation width for the vector type is 2 or greater, the target
6981 machine can execute 2 or more vector instructions in parallel. Otherwise try
6982 to get the chain for the load group using vect_shift_permute_load_chain. */
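/* For example, a group of three loads (SIZE == 3, not a power of two) on a
   target whose reassociation width for VEC_PERM_EXPR is 1 is first offered to
   vect_shift_permute_load_chain; in every other case, or if that call fails,
   the generic vect_permute_load_chain is used.  */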
6983 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6984 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6985 || pow2p_hwi (size)
6986 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6987 gsi, &result_chain))
6988 vect_permute_load_chain (vinfo, dr_chain,
6989 size, stmt_info, gsi, &result_chain);
6990 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6991 result_chain.release ();
6994 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6995 generated as part of the vectorization of STMT_INFO. Assign the statement
6996 for each vector to the associated scalar statement. */
6998 void
6999 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
7000 vec<tree> result_chain)
7002 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7003 unsigned int i, gap_count;
7004 tree tmp_data_ref;
7006 /* Put a permuted data-ref in the VECTORIZED_STMT field.
7007 Since we scan the chain starting from its first node, their order
7008 corresponds to the order of data-refs in RESULT_CHAIN. */
7009 stmt_vec_info next_stmt_info = first_stmt_info;
7010 gap_count = 1;
7011 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
7013 if (!next_stmt_info)
7014 break;
7016 /* Skip the gaps. Loads created for the gaps will be removed later by the
7017 dead code elimination pass. No need to check for the first stmt in
7018 the group, since it always exists.
7019 DR_GROUP_GAP is the number of steps in elements from the previous
7020 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
7021 correspond to the gaps. */
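/* A hypothetical illustration: if the scalar group reads fields 0 and 2 of a
   three-field structure, the second statement has DR_GROUP_GAP == 2, so one
   entry of RESULT_CHAIN (the vector for the unused field 1) is skipped here
   and left to dead code elimination.  */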
7022 if (next_stmt_info != first_stmt_info
7023 && gap_count < DR_GROUP_GAP (next_stmt_info))
7025 gap_count++;
7026 continue;
7029 /* ??? The following needs cleanup after the removal of
7030 DR_GROUP_SAME_DR_STMT. */
7031 if (next_stmt_info)
7033 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7034 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7035 copies, and we put the new vector statement last. */
7036 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7038 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7039 gap_count = 1;
7044 /* Function vect_can_force_dr_alignment_p.
7046 Returns whether the alignment of a DECL can be forced to be aligned
7047 on an ALIGNMENT-bit boundary. */
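/* For instance, a TREE_STATIC variable can typically be forced up to
   MAX_OFILE_ALIGNMENT, while an automatic variable is limited by
   MAX_STACK_ALIGNMENT, as the checks below show.  */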
7049 bool
7050 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7052 if (!VAR_P (decl))
7053 return false;
7055 if (decl_in_symtab_p (decl)
7056 && !symtab_node::get (decl)->can_increase_alignment_p ())
7057 return false;
7059 if (TREE_STATIC (decl))
7060 return (known_le (alignment,
7061 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7062 else
7063 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7066 /* Return whether the data reference DR_INFO is supported with respect to its
7067 alignment.
7068 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7069 if it is aligned, i.e., check if it is possible to vectorize it with different
7070 alignment. */
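/* The possible results below are dr_aligned, dr_unaligned_supported (the
   target handles the given misalignment directly, including masked
   loads/stores), dr_explicit_realign and dr_explicit_realign_optimized for
   the REALIGN_LOAD schemes described further down, and
   dr_unaligned_unsupported otherwise.  */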
7072 enum dr_alignment_support
7073 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7074 tree vectype, int misalignment)
7076 data_reference *dr = dr_info->dr;
7077 stmt_vec_info stmt_info = dr_info->stmt;
7078 machine_mode mode = TYPE_MODE (vectype);
7079 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7080 class loop *vect_loop = NULL;
7081 bool nested_in_vect_loop = false;
7083 if (misalignment == 0)
7084 return dr_aligned;
7086 /* For now assume all conditional loads/stores support unaligned
7087 access without any special code. */
7088 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7089 if (gimple_call_internal_p (stmt)
7090 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7091 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7092 return dr_unaligned_supported;
7094 if (loop_vinfo)
7096 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7097 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7100 /* Possibly unaligned access. */
7102 /* We can choose between using the implicit realignment scheme (generating
7103 a misaligned_move stmt) and the explicit realignment scheme (generating
7104 aligned loads with a REALIGN_LOAD). There are two variants to the
7105 explicit realignment scheme: optimized, and unoptimized.
7106 We can optimize the realignment only if the step between consecutive
7107 vector loads is equal to the vector size. Since the vector memory
7108 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7109 is guaranteed that the misalignment amount remains the same throughout the
7110 execution of the vectorized loop. Therefore, we can create the
7111 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7112 at the loop preheader.
7114 However, in the case of outer-loop vectorization, when vectorizing a
7115 memory access in the inner-loop nested within the LOOP that is now being
7116 vectorized, while it is guaranteed that the misalignment of the
7117 vectorized memory access will remain the same in different outer-loop
7118 iterations, it is *not* guaranteed that it will remain the same throughout
7119 the execution of the inner-loop. This is because the inner-loop advances
7120 with the original scalar step (and not in steps of VS). If the inner-loop
7121 step happens to be a multiple of VS, then the misalignment remains fixed
7122 and we can use the optimized realignment scheme. For example:
7124 for (i=0; i<N; i++)
7125 for (j=0; j<M; j++)
7126 s += a[i+j];
7128 When vectorizing the i-loop in the above example, the step between
7129 consecutive vector loads is 1, and so the misalignment does not remain
7130 fixed across the execution of the inner-loop, and the realignment cannot
7131 be optimized (as illustrated in the following pseudo vectorized loop):
7133 for (i=0; i<N; i+=4)
7134 for (j=0; j<M; j++){
7135 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7136 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7137 // (assuming that we start from an aligned address).
7140 We therefore have to use the unoptimized realignment scheme:
7142 for (i=0; i<N; i+=4)
7143 for (j=k; j<M; j+=4)
7144 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7145 // that the misalignment of the initial address is
7146 // 0).
7148 The loop can then be vectorized as follows:
7150 for (k=0; k<4; k++){
7151 rt = get_realignment_token (&vp[k]);
7152 for (i=0; i<N; i+=4){
7153 v1 = vp[i+k];
7154 for (j=k; j<M; j+=4){
7155 v2 = vp[i+j+VS-1];
7156 va = REALIGN_LOAD <v1,v2,rt>;
7157 vs += va;
7158 v1 = v2;
7161 } */
7163 if (DR_IS_READ (dr))
7165 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7166 && (!targetm.vectorize.builtin_mask_for_load
7167 || targetm.vectorize.builtin_mask_for_load ()))
7169 /* If we are doing SLP then the accesses need not have the
7170 same alignment; instead it depends on the SLP group size. */
7171 if (loop_vinfo
7172 && STMT_SLP_TYPE (stmt_info)
7173 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7174 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7175 * (DR_GROUP_SIZE
7176 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7177 TYPE_VECTOR_SUBPARTS (vectype))))
7179 else if (!loop_vinfo
7180 || (nested_in_vect_loop
7181 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7182 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7183 return dr_explicit_realign;
7184 else
7185 return dr_explicit_realign_optimized;
7189 bool is_packed = false;
7190 tree type = TREE_TYPE (DR_REF (dr));
7191 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7192 is_packed = not_size_aligned (DR_REF (dr));
7193 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7194 is_packed))
7195 return dr_unaligned_supported;
7197 /* Unsupported. */
7198 return dr_unaligned_unsupported;