gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode, array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
96 return true;
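/* As an illustration (values chosen for the example): for a V4SImode
   VECTYPE and COUNT == 3 this asks whether the target provides an array
   mode capable of holding three V4SI vectors (384 bits) and a
   vec_load_lanes / vec_store_lanes pattern converting between that array
   mode and V4SImode, as e.g. AArch64 does for its LD3/ST3 instructions.  */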
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
160 if (i < gimple_call_num_args (call))
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
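/* For example, for the promotion 'int_x = (int) short_y' the lhs is
   4 bytes and the rhs is 2 bytes, so *LHS_SIZE_UNIT is 4, *RHS_SIZE_UNIT
   is 2 and the short type is returned; the vectorization factor is then
   determined by the 2-byte elements rather than the 4-byte result.  */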
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
235 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
236 SLP group are emitted at the position of the last scalar load and
237 all loads in an interleaving group are emitted at the position
238 of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
241 the current one.
242 We have not yet decided between SLP and interleaving so we have
243 to conservatively assume both. */
244 stmt_vec_info il_a;
245 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (last_a)
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 last_a = get_later_stmt (last_a, s);
251 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
258 else
259 il_a = last_a;
261 else
262 last_a = il_a = stmtinfo_a;
263 stmt_vec_info il_b;
264 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
265 if (last_b)
267 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
268 s = DR_GROUP_NEXT_ELEMENT (s))
269 last_b = get_later_stmt (last_b, s);
270 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
272 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
273 s = DR_GROUP_NEXT_ELEMENT (s))
274 if (get_later_stmt (il_b, s) == il_b)
275 il_b = s;
277 else
278 il_b = last_b;
280 else
281 last_b = il_b = stmtinfo_b;
282 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
283 return (/* SLP */
284 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
285 /* Interleaving */
286 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
287 /* Mixed */
288 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
289 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
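/* For example (illustrative scalar sequence):
     ... = a[i];      first load of a two-element load group
     *p = x;          ungrouped store
     ... = a[i+1];    last load of the group
   Under SLP both loads would be emitted at the position of the last
   scalar load, i.e. after the store, so the original order of the first
   load and the store would not be preserved and the function returns
   false for that pair.  */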
292 /* A subroutine of vect_analyze_data_ref_dependence. Handle
293 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
294 distances. These distances are conservatively correct but they don't
295 reflect a guaranteed dependence.
297 Return true if this function does all the work necessary to avoid
298 an alias or false if the caller should use the dependence distances
299 to limit the vectorization factor in the usual way. LOOP_DEPTH is
300 the depth of the loop described by LOOP_VINFO and the other arguments
301 are as for vect_analyze_data_ref_dependence. */
303 static bool
304 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
305 loop_vec_info loop_vinfo,
306 int loop_depth, unsigned int *max_vf)
308 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 lambda_vector dist_v;
310 unsigned int i;
311 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
313 int dist = dist_v[loop_depth];
314 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
316 /* If the user asserted safelen >= DIST consecutive iterations
317 can be executed concurrently, assume independence.
319 ??? An alternative would be to add the alias check even
320 in this case, and vectorize the fallback loop with the
321 maximum VF set to safelen. However, if the user has
322 explicitly given a length, it's less likely that that
323 would be a win. */
324 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
326 if ((unsigned int) loop->safelen < *max_vf)
327 *max_vf = loop->safelen;
328 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
329 continue;
332 /* For dependence distances of 2 or more, we have the option
333 of limiting VF or checking for an alias at runtime.
334 Prefer to check at runtime if we can, to avoid limiting
335 the VF unnecessarily when the bases are in fact independent.
337 Note that the alias checks will be removed if the VF ends up
338 being small enough. */
339 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
340 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
341 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
342 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
343 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
346 return true;
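/* For instance, if the user annotated the loop with a safelen of 8
   (e.g. via #pragma omp simd safelen(8)) and the only conservative
   dependence distance is 4, the accesses are treated as independent,
   *MAX_VF is capped at 8 and no runtime alias check is required.  */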
350 /* Function vect_analyze_data_ref_dependence.
352 FIXME: I needed to change the sense of the returned flag.
354 Return FALSE if there (might) exist a dependence between a memory-reference
355 DRA and a memory-reference DRB. When versioning for alias may check a
356 dependence at run-time, return TRUE. Adjust *MAX_VF according to
357 the data dependence. */
359 static opt_result
360 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
361 loop_vec_info loop_vinfo,
362 unsigned int *max_vf)
364 unsigned int i;
365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
366 struct data_reference *dra = DDR_A (ddr);
367 struct data_reference *drb = DDR_B (ddr);
368 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
369 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
370 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
371 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
372 lambda_vector dist_v;
373 unsigned int loop_depth;
375 /* In loop analysis all data references should be vectorizable. */
376 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
377 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
378 gcc_unreachable ();
380 /* Independent data accesses. */
381 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
382 return opt_result::success ();
384 if (dra == drb
385 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
386 return opt_result::success ();
388 /* We do not have to consider dependences between accesses that belong
389 to the same group, unless the stride could be smaller than the
390 group size. */
391 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
394 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
395 return opt_result::success ();
397 /* Even if we have an anti-dependence then, as the vectorized loop covers at
398 least two scalar iterations, there is always also a true dependence.
399 As the vectorizer does not re-order loads and stores we can ignore
400 the anti-dependence if TBAA can disambiguate both DRs similar to the
401 case with known negative distance anti-dependences (positive
402 distance anti-dependences would violate TBAA constraints). */
403 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
404 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
405 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
406 get_alias_set (DR_REF (drb))))
407 return opt_result::success ();
409 /* Unknown data dependence. */
410 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
412 /* If user asserted safelen consecutive iterations can be
413 executed concurrently, assume independence. */
414 if (loop->safelen >= 2)
416 if ((unsigned int) loop->safelen < *max_vf)
417 *max_vf = loop->safelen;
418 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
419 return opt_result::success ();
422 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
423 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
424 return opt_result::failure_at
425 (stmtinfo_a->stmt,
426 "versioning for alias not supported for: "
427 "can't determine dependence between %T and %T\n",
428 DR_REF (dra), DR_REF (drb));
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
432 "versioning for alias required: "
433 "can't determine dependence between %T and %T\n",
434 DR_REF (dra), DR_REF (drb));
436 /* Add to list of ddrs that need to be tested at run-time. */
437 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
440 /* Known data dependence. */
441 if (DDR_NUM_DIST_VECTS (ddr) == 0)
443 /* If user asserted safelen consecutive iterations can be
444 executed concurrently, assume independence. */
445 if (loop->safelen >= 2)
447 if ((unsigned int) loop->safelen < *max_vf)
448 *max_vf = loop->safelen;
449 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
450 return opt_result::success ();
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455 return opt_result::failure_at
456 (stmtinfo_a->stmt,
457 "versioning for alias not supported for: "
458 "bad dist vector for %T and %T\n",
459 DR_REF (dra), DR_REF (drb));
461 if (dump_enabled_p ())
462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
463 "versioning for alias required: "
464 "bad dist vector for %T and %T\n",
465 DR_REF (dra), DR_REF (drb));
466 /* Add to list of ddrs that need to be tested at run-time. */
467 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
470 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
472 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
473 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
474 loop_depth, max_vf))
475 return opt_result::success ();
477 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
479 int dist = dist_v[loop_depth];
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_NOTE, vect_location,
483 "dependence distance = %d.\n", dist);
485 if (dist == 0)
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location,
489 "dependence distance == 0 between %T and %T\n",
490 DR_REF (dra), DR_REF (drb));
492 /* When we perform grouped accesses and perform implicit CSE
493 by detecting equal accesses and doing disambiguation with
494 runtime alias tests like for
495 .. = a[i];
496 .. = a[i+1];
497 a[i] = ..;
498 a[i+1] = ..;
499 *p = ..;
500 .. = a[i];
501 .. = a[i+1];
502 where we will end up loading { a[i], a[i+1] } once, make
503 sure that inserting group loads before the first load and
504 stores after the last store will do the right thing.
505 Similar for groups like
506 a[i] = ...;
507 ... = a[i];
508 a[i+1] = ...;
509 where loads from the group interleave with the store. */
510 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
511 return opt_result::failure_at (stmtinfo_a->stmt,
512 "READ_WRITE dependence"
513 " in interleaving.\n");
515 if (loop->safelen < 2)
517 tree indicator = dr_zero_step_indicator (dra);
518 if (!indicator || integer_zerop (indicator))
519 return opt_result::failure_at (stmtinfo_a->stmt,
520 "access also has a zero step\n");
521 else if (TREE_CODE (indicator) != INTEGER_CST)
522 vect_check_nonzero_value (loop_vinfo, indicator);
524 continue;
527 if (dist > 0 && DDR_REVERSED_P (ddr))
529 /* If DDR_REVERSED_P the order of the data-refs in DDR was
530 reversed (to make distance vector positive), and the actual
531 distance is negative. */
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_NOTE, vect_location,
534 "dependence distance negative.\n");
535 /* When doing outer loop vectorization, we need to check if there is
536 a backward dependence at the inner loop level if the dependence
537 at the outer loop is reversed. See PR81740. */
538 if (nested_in_vect_loop_p (loop, stmtinfo_a)
539 || nested_in_vect_loop_p (loop, stmtinfo_b))
541 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
542 DDR_LOOP_NEST (ddr));
543 if (dist_v[inner_depth] < 0)
544 return opt_result::failure_at (stmtinfo_a->stmt,
545 "not vectorized, dependence "
546 "between data-refs %T and %T\n",
547 DR_REF (dra), DR_REF (drb));
549 /* Record a negative dependence distance to later limit the
550 amount of stmt copying / unrolling we can perform.
551 Only need to handle read-after-write dependence. */
552 if (DR_IS_READ (drb)
553 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
554 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
555 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
556 continue;
559 unsigned int abs_dist = abs (dist);
560 if (abs_dist >= 2 && abs_dist < *max_vf)
562 /* The dependence distance requires reduction of the maximal
563 vectorization factor. */
564 *max_vf = abs_dist;
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "adjusting maximal vectorization factor to %i\n",
568 *max_vf);
571 if (abs_dist >= *max_vf)
573 /* Dependence distance does not create dependence, as far as
574 vectorization is concerned, in this case. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "dependence distance >= VF.\n");
578 continue;
581 return opt_result::failure_at (stmtinfo_a->stmt,
582 "not vectorized, possible dependence "
583 "between data-refs %T and %T\n",
584 DR_REF (dra), DR_REF (drb));
587 return opt_result::success ();
590 /* Function vect_analyze_data_ref_dependences.
592 Examine all the data references in the loop, and make sure there do not
593 exist any data dependences between them. Set *MAX_VF according to
594 the maximum vectorization factor the data dependences allow. */
596 opt_result
597 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
598 unsigned int *max_vf)
600 unsigned int i;
601 struct data_dependence_relation *ddr;
603 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
605 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
607 LOOP_VINFO_DDRS (loop_vinfo)
608 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
609 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
610 /* We need read-read dependences to compute
611 STMT_VINFO_SAME_ALIGN_REFS. */
612 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
613 &LOOP_VINFO_DDRS (loop_vinfo),
614 LOOP_VINFO_LOOP_NEST (loop_vinfo),
615 true);
616 gcc_assert (res);
619 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
621 /* For epilogues we either have no aliases or alias versioning
622 was applied to the original loop. Therefore we may just get max_vf
623 using the VF of the original loop.
624 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
625 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
626 else
627 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
629 opt_result res
630 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
631 if (!res)
632 return res;
635 return opt_result::success ();
639 /* Function vect_slp_analyze_data_ref_dependence.
641 Return TRUE if there (might) exist a dependence between a memory-reference
642 DRA and a memory-reference DRB for VINFO. When versioning for alias
643 may check a dependence at run-time, return FALSE. */
646 static bool
647 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
648 struct data_dependence_relation *ddr)
650 struct data_reference *dra = DDR_A (ddr);
651 struct data_reference *drb = DDR_B (ddr);
652 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
653 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
655 /* We need to check dependences of statements marked as unvectorizable
656 as well; they can still prohibit vectorization. */
658 /* Independent data accesses. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
660 return false;
662 if (dra == drb)
663 return false;
665 /* Read-read is OK. */
666 if (DR_IS_READ (dra) && DR_IS_READ (drb))
667 return false;
669 /* If dra and drb are part of the same interleaving chain consider
670 them independent. */
671 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
672 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
673 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
674 return false;
676 /* Unknown data dependence. */
677 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681 "can't determine dependence between %T and %T\n",
682 DR_REF (dra), DR_REF (drb));
684 else if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "determined dependence between %T and %T\n",
687 DR_REF (dra), DR_REF (drb));
689 return true;
693 /* Analyze dependences involved in the transform of SLP NODE. STORES
694 contains the vector of scalar stores of this instance if we are
695 disambiguating the loads. */
697 static bool
698 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
699 vec<stmt_vec_info> stores,
700 stmt_vec_info last_store_info)
702 /* This walks over all stmts involved in the SLP load/store done
703 in NODE verifying we can sink them up to the last stmt in the
704 group. */
705 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
706 vec_info *vinfo = last_access_info->vinfo;
707 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
709 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
710 if (access_info == last_access_info)
711 continue;
712 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
713 ao_ref ref;
714 bool ref_initialized_p = false;
715 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
716 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
718 gimple *stmt = gsi_stmt (gsi);
719 if (! gimple_vuse (stmt)
720 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
721 continue;
723 /* If we couldn't record a (single) data reference for this
724 stmt we have to resort to the alias oracle. */
725 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
726 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
727 if (!dr_b)
729 /* We are moving a store or sinking a load - this means
730 we cannot use TBAA for disambiguation. */
731 if (!ref_initialized_p)
732 ao_ref_init (&ref, DR_REF (dr_a));
733 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
734 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
735 return false;
736 continue;
739 bool dependent = false;
740 /* If we run into a store of this same instance (we've just
741 marked those) then delay dependence checking until we run
742 into the last store because this is where it will have
743 been sunk to (and we verify if we can do that as well). */
744 if (gimple_visited_p (stmt))
746 if (stmt_info != last_store_info)
747 continue;
748 unsigned i;
749 stmt_vec_info store_info;
750 FOR_EACH_VEC_ELT (stores, i, store_info)
752 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
753 ddr_p ddr = initialize_data_dependence_relation
754 (dr_a, store_dr, vNULL);
755 dependent
756 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
757 free_dependence_relation (ddr);
758 if (dependent)
759 break;
762 else
764 ddr_p ddr = initialize_data_dependence_relation (dr_a,
765 dr_b, vNULL);
766 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
767 free_dependence_relation (ddr);
769 if (dependent)
770 return false;
773 return true;
777 /* Function vect_slp_analyze_instance_dependence.
779 Examine all the data references in the SLP instance INSTANCE and make
780 sure there do not exist any data dependences between them that would
781 prevent vectorization. Return TRUE if that is the case. */
783 bool
784 vect_slp_analyze_instance_dependence (slp_instance instance)
786 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
788 /* The stores of this instance are at the root of the SLP tree. */
789 slp_tree store = SLP_INSTANCE_TREE (instance);
790 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
791 store = NULL;
793 /* Verify we can sink stores to the vectorized stmt insert location. */
794 stmt_vec_info last_store_info = NULL;
795 if (store)
797 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
798 return false;
800 /* Mark stores in this instance and remember the last one. */
801 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
802 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
803 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
806 bool res = true;
808 /* Verify we can sink loads to the vectorized stmt insert location,
809 special-casing stores of this instance. */
810 slp_tree load;
811 unsigned int i;
812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
813 if (! vect_slp_analyze_node_dependences (instance, load,
814 store
815 ? SLP_TREE_SCALAR_STMTS (store)
816 : vNULL, last_store_info))
818 res = false;
819 break;
822 /* Unset the visited flag. */
823 if (store)
824 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
825 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
827 return res;
830 /* Record the base alignment guarantee given by DRB, which occurs
831 in STMT_INFO. */
833 static void
834 vect_record_base_alignment (stmt_vec_info stmt_info,
835 innermost_loop_behavior *drb)
837 vec_info *vinfo = stmt_info->vinfo;
838 bool existed;
839 innermost_loop_behavior *&entry
840 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
841 if (!existed || entry->base_alignment < drb->base_alignment)
843 entry = drb;
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_NOTE, vect_location,
846 "recording new base alignment for %T\n"
847 " alignment: %d\n"
848 " misalignment: %d\n"
849 " based on: %G",
850 drb->base_address,
851 drb->base_alignment,
852 drb->base_misalignment,
853 stmt_info->stmt);
857 /* If the region we're going to vectorize is reached, all unconditional
858 data references occur at least once. We can therefore pool the base
859 alignment guarantees from each unconditional reference. Do this by
860 going through all the data references in VINFO and checking whether
861 the containing statement makes the reference unconditionally. If so,
862 record the alignment of the base address in VINFO so that it can be
863 used for all other references with the same base. */
865 void
866 vect_record_base_alignments (vec_info *vinfo)
868 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
869 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
870 data_reference *dr;
871 unsigned int i;
872 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
874 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
875 stmt_vec_info stmt_info = dr_info->stmt;
876 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
877 && STMT_VINFO_VECTORIZABLE (stmt_info)
878 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
880 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
882 /* If DR is nested in the loop that is being vectorized, we can also
883 record the alignment of the base wrt the outer loop. */
884 if (loop && nested_in_vect_loop_p (loop, stmt_info))
885 vect_record_base_alignment
886 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
891 /* Return the target alignment for the vectorized form of DR_INFO. */
893 static poly_uint64
894 vect_calculate_target_alignment (dr_vec_info *dr_info)
896 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
897 return targetm.vectorize.preferred_vector_alignment (vectype);
900 /* Function vect_compute_data_ref_alignment
902 Compute the misalignment of the data reference DR_INFO.
904 Output:
905 1. DR_MISALIGNMENT (DR_INFO) is defined.
907 FOR NOW: No analysis is actually performed. Misalignment is calculated
908 only for trivial cases. TODO. */
910 static void
911 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
913 stmt_vec_info stmt_info = dr_info->stmt;
914 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
916 class loop *loop = NULL;
917 tree ref = DR_REF (dr_info->dr);
918 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
920 if (dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE, vect_location,
922 "vect_compute_data_ref_alignment:\n");
924 if (loop_vinfo)
925 loop = LOOP_VINFO_LOOP (loop_vinfo);
927 /* Initialize misalignment to unknown. */
928 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
930 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
931 return;
933 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
934 bool step_preserves_misalignment_p;
936 poly_uint64 vector_alignment
937 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
938 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
940 /* If the main loop has peeled for alignment we have no way of knowing
941 whether the data accesses in the epilogues are aligned. We can't at
942 compile time answer the question whether we have entered the main loop or
943 not. Fixes PR 92351. */
944 if (loop_vinfo)
946 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
947 if (orig_loop_vinfo
948 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
949 return;
952 unsigned HOST_WIDE_INT vect_align_c;
953 if (!vector_alignment.is_constant (&vect_align_c))
954 return;
956 /* No step for BB vectorization. */
957 if (!loop)
959 gcc_assert (integer_zerop (drb->step));
960 step_preserves_misalignment_p = true;
963 /* In case the dataref is in an inner-loop of the loop that is being
964 vectorized (LOOP), we use the base and misalignment information
965 relative to the outer-loop (LOOP). This is ok only if the misalignment
966 stays the same throughout the execution of the inner-loop, which is why
967 we have to check that the stride of the dataref in the inner-loop is a
968 multiple of the vector alignment. */
969 else if (nested_in_vect_loop_p (loop, stmt_info))
971 step_preserves_misalignment_p
972 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
974 if (dump_enabled_p ())
976 if (step_preserves_misalignment_p)
977 dump_printf_loc (MSG_NOTE, vect_location,
978 "inner step divides the vector alignment.\n");
979 else
980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
981 "inner step doesn't divide the vector"
982 " alignment.\n");
986 /* Similarly we can only use base and misalignment information relative to
987 an innermost loop if the misalignment stays the same throughout the
988 execution of the loop. As above, this is the case if the stride of
989 the dataref times the vectorization factor is a multiple of the alignment. */
990 else
992 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
993 step_preserves_misalignment_p
994 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
996 if (!step_preserves_misalignment_p && dump_enabled_p ())
997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998 "step doesn't divide the vector alignment.\n");
1001 unsigned int base_alignment = drb->base_alignment;
1002 unsigned int base_misalignment = drb->base_misalignment;
1004 /* Calculate the maximum of the pooled base address alignment and the
1005 alignment that we can compute for DR itself. */
1006 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1007 if (entry && base_alignment < (*entry)->base_alignment)
1009 base_alignment = (*entry)->base_alignment;
1010 base_misalignment = (*entry)->base_misalignment;
1013 if (drb->offset_alignment < vect_align_c
1014 || !step_preserves_misalignment_p
1015 /* We need to know whether the step wrt the vectorized loop is
1016 negative when computing the starting misalignment below. */
1017 || TREE_CODE (drb->step) != INTEGER_CST)
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Unknown alignment for access: %T\n", ref);
1022 return;
1025 if (base_alignment < vect_align_c)
1027 unsigned int max_alignment;
1028 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1029 if (max_alignment < vect_align_c
1030 || !vect_can_force_dr_alignment_p (base,
1031 vect_align_c * BITS_PER_UNIT))
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_NOTE, vect_location,
1035 "can't force alignment of ref: %T\n", ref);
1036 return;
1039 /* Force the alignment of the decl.
1040 NOTE: This is the only change to the code we make during
1041 the analysis phase, before deciding to vectorize the loop. */
1042 if (dump_enabled_p ())
1043 dump_printf_loc (MSG_NOTE, vect_location,
1044 "force alignment of %T\n", ref);
1046 dr_info->base_decl = base;
1047 dr_info->base_misaligned = true;
1048 base_misalignment = 0;
1050 poly_int64 misalignment
1051 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1053 /* If this is a backward running DR then first access in the larger
1054 vectype actually is N-1 elements before the address in the DR.
1055 Adjust misalign accordingly. */
1056 if (tree_int_cst_sgn (drb->step) < 0)
1057 /* PLUS because STEP is negative. */
1058 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1059 * TREE_INT_CST_LOW (drb->step));
1061 unsigned int const_misalignment;
1062 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "Non-constant misalignment for access: %T\n", ref);
1067 return;
1070 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "misalign = %d bytes of ref %T\n",
1075 DR_MISALIGNMENT (dr_info), ref);
1077 return;
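/* Worked example (illustrative numbers): with a target vector alignment
   of 16 bytes, a base address known to be 16-byte aligned
   (base_misalignment 0), a constant drb->init of 4 and a forward step,
   the computed misalignment is 0 + 4 = 4, so DR_MISALIGNMENT (DR_INFO)
   is set to 4.  */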
1080 /* Function vect_update_misalignment_for_peel.
1081 Sets DR_INFO's misalignment
1082 - to 0 if it has the same alignment as DR_PEEL_INFO,
1083 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1084 - to -1 (unknown) otherwise.
1086 DR_INFO - the data reference whose misalignment is to be adjusted.
1087 DR_PEEL_INFO - the data reference whose misalignment is being made
1088 zero in the vector loop by the peel.
1089 NPEEL - the number of iterations in the peel loop if the misalignment
1090 of DR_PEEL_INFO is known at compile time. */
1092 static void
1093 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1094 dr_vec_info *dr_peel_info, int npeel)
1096 unsigned int i;
1097 vec<dr_p> same_aligned_drs;
1098 struct data_reference *current_dr;
1099 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1101 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1102 it is aligned in the vector loop. */
1103 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1104 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1106 if (current_dr != dr_info->dr)
1107 continue;
1108 gcc_assert (!known_alignment_for_access_p (dr_info)
1109 || !known_alignment_for_access_p (dr_peel_info)
1110 || (DR_MISALIGNMENT (dr_info)
1111 == DR_MISALIGNMENT (dr_peel_info)));
1112 SET_DR_MISALIGNMENT (dr_info, 0);
1113 return;
1116 unsigned HOST_WIDE_INT alignment;
1117 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1118 && known_alignment_for_access_p (dr_info)
1119 && known_alignment_for_access_p (dr_peel_info))
1121 int misal = DR_MISALIGNMENT (dr_info);
1122 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1123 misal &= alignment - 1;
1124 SET_DR_MISALIGNMENT (dr_info, misal);
1125 return;
1128 if (dump_enabled_p ())
1129 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1130 "to unknown (-1).\n");
1131 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
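/* Worked example (illustrative numbers): with a target alignment of
   16 bytes, DR_STEP of 4 and a known misalignment of 8, peeling
   NPEEL == 2 iterations gives (8 + 2 * 4) & 15 == 0, i.e. the access
   becomes aligned in the vectorized loop.  */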
1135 /* Function verify_data_ref_alignment
1137 Return TRUE if DR_INFO can be handled with respect to alignment. */
1139 static opt_result
1140 verify_data_ref_alignment (dr_vec_info *dr_info)
1142 enum dr_alignment_support supportable_dr_alignment
1143 = vect_supportable_dr_alignment (dr_info, false);
1144 if (!supportable_dr_alignment)
1145 return opt_result::failure_at
1146 (dr_info->stmt->stmt,
1147 DR_IS_READ (dr_info->dr)
1148 ? "not vectorized: unsupported unaligned load: %T\n"
1149 : "not vectorized: unsupported unaligned store: %T\n",
1150 DR_REF (dr_info->dr));
1152 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1153 dump_printf_loc (MSG_NOTE, vect_location,
1154 "Vectorizing an unaligned access.\n");
1156 return opt_result::success ();
1159 /* Function vect_verify_datarefs_alignment
1161 Return TRUE if all data references in the loop can be
1162 handled with respect to alignment. */
1164 opt_result
1165 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1167 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1168 struct data_reference *dr;
1169 unsigned int i;
1171 FOR_EACH_VEC_ELT (datarefs, i, dr)
1173 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1174 stmt_vec_info stmt_info = dr_info->stmt;
1176 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1177 continue;
1179 /* For interleaving, only the alignment of the first access matters. */
1180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1181 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1182 continue;
1184 /* Strided accesses perform only component accesses, alignment is
1185 irrelevant for them. */
1186 if (STMT_VINFO_STRIDED_P (stmt_info)
1187 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1188 continue;
1190 opt_result res = verify_data_ref_alignment (dr_info);
1191 if (!res)
1192 return res;
1195 return opt_result::success ();
1198 /* Given a memory reference EXP, return whether its alignment is less
1199 than its size. */
1201 static bool
1202 not_size_aligned (tree exp)
1204 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205 return true;
1207 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208 > get_object_alignment (exp));
1211 /* Function vector_alignment_reachable_p
1213 Return true if vector alignment for DR_INFO is reachable by peeling
1214 a few loop iterations. Return false otherwise. */
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1219 stmt_vec_info stmt_info = dr_info->stmt;
1220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1222 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1224 /* For interleaved access we peel only if the number of iterations in
1225 the prolog loop ({VF - misalignment}) is a multiple of the
1226 number of interleaved accesses. */
1227 int elem_size, mis_in_elements;
1229 /* FORNOW: handle only known alignment. */
1230 if (!known_alignment_for_access_p (dr_info))
1231 return false;
1233 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235 elem_size = vector_element_size (vector_size, nelements);
1236 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1238 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239 return false;
1242 /* If misalignment is known at the compile time then allow peeling
1243 only if natural alignment is reachable through peeling. */
1244 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1246 HOST_WIDE_INT elmsize =
1247 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "data size = %wd. misalignment = %d.\n", elmsize,
1252 DR_MISALIGNMENT (dr_info));
1254 if (DR_MISALIGNMENT (dr_info) % elmsize)
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "data size does not divide the misalignment.\n");
1259 return false;
1263 if (!known_alignment_for_access_p (dr_info))
1265 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Unknown misalignment, %snaturally aligned\n",
1270 is_packed ? "not " : "");
1271 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1274 return true;
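/* For example, 4-byte elements with a known misalignment of 2 bytes can
   never be brought to vector alignment by peeling whole iterations
   (each peeled iteration advances the address by a multiple of 4), so
   the function returns false; a misalignment of 8 bytes would be fine.  */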
1278 /* Calculate the cost of the memory access represented by DR_INFO. */
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282 unsigned int *inside_cost,
1283 unsigned int *outside_cost,
1284 stmt_vector_for_cost *body_cost_vec,
1285 stmt_vector_for_cost *prologue_cost_vec)
1287 stmt_vec_info stmt_info = dr_info->stmt;
1288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289 int ncopies;
1291 if (PURE_SLP_STMT (stmt_info))
1292 ncopies = 1;
1293 else
1294 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1296 if (DR_IS_READ (dr_info->dr))
1297 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298 prologue_cost_vec, body_cost_vec, false);
1299 else
1300 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_NOTE, vect_location,
1304 "vect_get_data_access_cost: inside_cost = %d, "
1305 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1309 typedef struct _vect_peel_info
1311 dr_vec_info *dr_info;
1312 int npeel;
1313 unsigned int count;
1314 } *vect_peel_info;
1316 typedef struct _vect_peel_extended_info
1318 struct _vect_peel_info peel_info;
1319 unsigned int inside_cost;
1320 unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1324 /* Peeling hashtable helpers. */
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1328 static inline hashval_t hash (const _vect_peel_info *);
1329 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1335 return (hashval_t) peel_info->npeel;
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1341 return (a->npeel == b->npeel);
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350 int npeel)
1352 struct _vect_peel_info elem, *slot;
1353 _vect_peel_info **new_slot;
1354 bool supportable_dr_alignment
1355 = vect_supportable_dr_alignment (dr_info, true);
1357 elem.npeel = npeel;
1358 slot = peeling_htab->find (&elem);
1359 if (slot)
1360 slot->count++;
1361 else
1363 slot = XNEW (struct _vect_peel_info);
1364 slot->npeel = npeel;
1365 slot->dr_info = dr_info;
1366 slot->count = 1;
1367 new_slot = peeling_htab->find_slot (slot, INSERT);
1368 *new_slot = slot;
1371 if (!supportable_dr_alignment
1372 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373 slot->count += VECT_MAX_COST;
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378 number of data accesses. */
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382 _vect_peel_extended_info *max)
1384 vect_peel_info elem = *slot;
1386 if (elem->count > max->peel_info.count
1387 || (elem->count == max->peel_info.count
1388 && max->peel_info.npeel > elem->npeel))
1390 max->peel_info.npeel = elem->npeel;
1391 max->peel_info.count = elem->count;
1392 max->peel_info.dr_info = elem->dr_info;
1395 return 1;
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1400 we assume DR0_INFO's misalignment will be zero after peeling. */
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404 dr_vec_info *dr0_info,
1405 unsigned int *inside_cost,
1406 unsigned int *outside_cost,
1407 stmt_vector_for_cost *body_cost_vec,
1408 stmt_vector_for_cost *prologue_cost_vec,
1409 unsigned int npeel,
1410 bool unknown_misalignment)
1412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413 unsigned i;
1414 data_reference *dr;
1416 FOR_EACH_VEC_ELT (datarefs, i, dr)
1418 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419 stmt_vec_info stmt_info = dr_info->stmt;
1420 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421 continue;
1423 /* For interleaving, only the alignment of the first access
1424 matters. */
1425 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427 continue;
1429 /* Strided accesses perform only component accesses, alignment is
1430 irrelevant for them. */
1431 if (STMT_VINFO_STRIDED_P (stmt_info)
1432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433 continue;
1435 int save_misalignment;
1436 save_misalignment = DR_MISALIGNMENT (dr_info);
1437 if (npeel == 0)
1439 else if (unknown_misalignment && dr_info == dr0_info)
1440 SET_DR_MISALIGNMENT (dr_info, 0);
1441 else
1442 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444 body_cost_vec, prologue_cost_vec);
1445 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450 Find the one with the lowest cost. */
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454 _vect_peel_extended_info *min)
1456 vect_peel_info elem = *slot;
1457 int dummy;
1458 unsigned int inside_cost = 0, outside_cost = 0;
1459 stmt_vec_info stmt_info = elem->dr_info->stmt;
1460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462 epilogue_cost_vec;
1464 prologue_cost_vec.create (2);
1465 body_cost_vec.create (2);
1466 epilogue_cost_vec.create (2);
1468 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469 &outside_cost, &body_cost_vec,
1470 &prologue_cost_vec, elem->npeel, false);
1472 body_cost_vec.release ();
1474 outside_cost += vect_get_known_peeling_cost
1475 (loop_vinfo, elem->npeel, &dummy,
1476 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477 &prologue_cost_vec, &epilogue_cost_vec);
1479 /* Prologue and epilogue costs are added to the target model later.
1480 These costs depend only on the scalar iteration cost, the
1481 number of peeling iterations finally chosen, and the number of
1482 misaligned statements. So discard the information found here. */
1483 prologue_cost_vec.release ();
1484 epilogue_cost_vec.release ();
1486 if (inside_cost < min->inside_cost
1487 || (inside_cost == min->inside_cost
1488 && outside_cost < min->outside_cost))
1490 min->inside_cost = inside_cost;
1491 min->outside_cost = outside_cost;
1492 min->peel_info.dr_info = elem->dr_info;
1493 min->peel_info.npeel = elem->npeel;
1494 min->peel_info.count = elem->count;
1497 return 1;
1501 /* Choose best peeling option by traversing peeling hash table and either
1502 choosing an option with the lowest cost (if cost model is enabled) or the
1503 option that aligns as many accesses as possible. */
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507 loop_vec_info loop_vinfo)
1509 struct _vect_peel_extended_info res;
1511 res.peel_info.dr_info = NULL;
1513 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1515 res.inside_cost = INT_MAX;
1516 res.outside_cost = INT_MAX;
1517 peeling_htab->traverse <_vect_peel_extended_info *,
1518 vect_peeling_hash_get_lowest_cost> (&res);
1520 else
1522 res.peel_info.count = 0;
1523 peeling_htab->traverse <_vect_peel_extended_info *,
1524 vect_peeling_hash_get_most_frequent> (&res);
1525 res.inside_cost = 0;
1526 res.outside_cost = 0;
1529 return res;
1532 /* Return true if the new peeling NPEEL is supported. */
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536 unsigned npeel)
1538 unsigned i;
1539 struct data_reference *dr = NULL;
1540 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541 enum dr_alignment_support supportable_dr_alignment;
1543 /* Ensure that all data refs can be vectorized after the peel. */
1544 FOR_EACH_VEC_ELT (datarefs, i, dr)
1546 int save_misalignment;
1548 if (dr == dr0_info->dr)
1549 continue;
1551 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552 stmt_vec_info stmt_info = dr_info->stmt;
1553 /* For interleaving, only the alignment of the first access
1554 matters. */
1555 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557 continue;
1559 /* Strided accesses perform only component accesses, alignment is
1560 irrelevant for them. */
1561 if (STMT_VINFO_STRIDED_P (stmt_info)
1562 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 continue;
1565 save_misalignment = DR_MISALIGNMENT (dr_info);
1566 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567 supportable_dr_alignment
1568 = vect_supportable_dr_alignment (dr_info, false);
1569 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1571 if (!supportable_dr_alignment)
1572 return false;
1575 return true;
1578 /* Function vect_enhance_data_refs_alignment
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass - are not supposed to be
1586 vectorized. This restriction will be relaxed.
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 Intel uses when given a loop with several memory accesses is as follows:
1591 choose one memory access ('p') whose alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1608 Here are the general steps involved in alignment enhancements:
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1669 opt_result
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 dr_vec_info *first_store = NULL;
1676 dr_vec_info *dr0_info = NULL;
1677 struct data_reference *dr;
1678 unsigned int i, j;
1679 bool do_peeling = false;
1680 bool do_versioning = false;
1681 unsigned int npeel = 0;
1682 bool one_misalignment_known = false;
1683 bool one_misalignment_unknown = false;
1684 bool one_dr_unsupportable = false;
1685 dr_vec_info *unsupportable_dr_info = NULL;
1686 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1687 unsigned possible_npeel_number = 1;
1688 tree vectype;
1689 unsigned int mis, same_align_drs_max = 0;
1690 hash_table<peel_info_hasher> peeling_htab (1);
1692 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1694 /* Reset data so we can safely be called multiple times. */
1695 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1696 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1698 /* While cost model enhancements are expected in the future, the high level
1699 view of the code at this time is as follows:
1701 A) If there is a misaligned access then see if peeling to align
1702 this access can make all data references satisfy
1703 vect_supportable_dr_alignment. If so, update data structures
1704 as needed and return true.
1706 B) If peeling wasn't possible and there is a data reference with an
1707 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1708 then see if loop versioning checks can be used to make all data
1709 references satisfy vect_supportable_dr_alignment. If so, update
1710 data structures as needed and return true.
1712 C) If neither peeling nor versioning were successful then return false if
1713 any data reference does not satisfy vect_supportable_dr_alignment.
1715 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1717 Note, Possibility 3 above (which is peeling and versioning together) is not
1718 being done at this time. */
1720 /* (1) Peeling to force alignment. */
1722 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1723 Considerations:
1724 + How many accesses will become aligned due to the peeling
1725 - How many accesses will become unaligned due to the peeling,
1726 and the cost of misaligned accesses.
1727 - The cost of peeling (the extra runtime checks, the increase
1728 in code size). */
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1732 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1733 stmt_vec_info stmt_info = dr_info->stmt;
1735 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1736 continue;
1738 /* For interleaving, only the alignment of the first access
1739 matters. */
1740 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1741 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1742 continue;
1744 /* For scatter-gather or invariant accesses there is nothing
1745 to enhance. */
1746 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1747 || integer_zerop (DR_STEP (dr)))
1748 continue;
1750 /* Strided accesses perform only component accesses, alignment is
1751 irrelevant for them. */
1752 if (STMT_VINFO_STRIDED_P (stmt_info)
1753 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1754 continue;
1756 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1757 do_peeling = vector_alignment_reachable_p (dr_info);
1758 if (do_peeling)
1760 if (known_alignment_for_access_p (dr_info))
1762 unsigned int npeel_tmp = 0;
1763 bool negative = tree_int_cst_compare (DR_STEP (dr),
1764 size_zero_node) < 0;
1766 vectype = STMT_VINFO_VECTYPE (stmt_info);
1767 /* If known_alignment_for_access_p then we have set
1768 DR_MISALIGNMENT which is only done if we know it at compile
1769 time, so it is safe to assume target alignment is constant.  */
1771 unsigned int target_align =
1772 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 mis = (negative
1775 ? DR_MISALIGNMENT (dr_info)
1776 : -DR_MISALIGNMENT (dr_info));
1777 if (DR_MISALIGNMENT (dr_info) != 0)
1778 npeel_tmp = (mis & (target_align - 1)) / dr_size;
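/* Illustrative example with hypothetical numbers: for a positive step,
   DR_MISALIGNMENT of 4, a 16-byte target alignment and 4-byte elements,
   MIS is -4, (-4 & 15) = 12 and NPEEL_TMP = 12 / 4 = 3, i.e. peeling
   three scalar iterations (4 + 3 * 4 = 16 bytes) brings the access to
   a 16-byte boundary.  */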
1780 /* For multiple types, it is possible that the bigger type access
1781 will have more than one peeling option. E.g., a loop with two
1782 types: one of size (vector size / 4), and the other one of
1783 size (vector size / 8). The vectorization factor will be 8. If both
1784 accesses are misaligned by 3, the first one needs one scalar
1785 iteration to be aligned, and the second one needs 5. But the
1786 first one will be aligned also by peeling 5 scalar
1787 iterations, and in that case both accesses will be aligned.
1788 Hence, except for the immediate peeling amount, we also want
1789 to try to add full vector size, while we don't exceed
1790 vectorization factor.
1791 We do this automatically for cost model, since we calculate
1792 cost for every peeling option. */
1793 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1795 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 possible_npeel_number
1798 = vect_get_num_vectors (nscalars, vectype);
1800 /* NPEEL_TMP is 0 when there is no misalignment, but also
1801 allow peeling NELEMENTS. */
1802 if (DR_MISALIGNMENT (dr_info) == 0)
1803 possible_npeel_number++;
1806 /* Save info about DR in the hash table. Also include peeling
1807 amounts according to the explanation above. */
1808 for (j = 0; j < possible_npeel_number; j++)
1810 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 dr_info, npeel_tmp);
1812 npeel_tmp += target_align / dr_size;
1815 one_misalignment_known = true;
1817 else
1819 /* If we don't know any misalignment values, we prefer
1820 peeling for the data-ref that has the maximum number of data-refs
1821 with the same alignment, unless the target prefers to align
1822 stores over loads. */
1823 unsigned same_align_drs
1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 if (!dr0_info
1826 || same_align_drs_max < same_align_drs)
1828 same_align_drs_max = same_align_drs;
1829 dr0_info = dr_info;
1831 /* For data-refs with the same number of related
1832 accesses prefer the one where the misalign
1833 computation will be invariant in the outermost loop. */
1834 else if (same_align_drs_max == same_align_drs)
1836 class loop *ivloop0, *ivloop;
1837 ivloop0 = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 ivloop = outermost_invariant_loop_for_expr
1840 (loop, DR_BASE_ADDRESS (dr));
1841 if ((ivloop && !ivloop0)
1842 || (ivloop && ivloop0
1843 && flow_loop_nested_p (ivloop, ivloop0)))
1844 dr0_info = dr_info;
1847 one_misalignment_unknown = true;
1849 /* Check for data refs with unsupportable alignment that
1850 can be peeled. */
1851 if (!supportable_dr_alignment)
1853 one_dr_unsupportable = true;
1854 unsupportable_dr_info = dr_info;
1857 if (!first_store && DR_IS_WRITE (dr))
1858 first_store = dr_info;
1861 else
1863 if (!aligned_access_p (dr_info))
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "vector alignment may not be reachable\n");
1868 break;
1873 /* Check if we can possibly peel the loop. */
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876 || loop->inner)
1877 do_peeling = false;
1879 struct _vect_peel_extended_info peel_for_known_alignment;
1880 struct _vect_peel_extended_info peel_for_unknown_alignment;
1881 struct _vect_peel_extended_info best_peel;
1883 peel_for_unknown_alignment.inside_cost = INT_MAX;
1884 peel_for_unknown_alignment.outside_cost = INT_MAX;
1885 peel_for_unknown_alignment.peel_info.count = 0;
1887 if (do_peeling
1888 && one_misalignment_unknown)
1890 /* Check if the target prefers stores over loads, i.e., if
1891 misaligned stores are more expensive than misaligned loads (taking
1892 drs with same alignment into account). */
1893 unsigned int load_inside_cost = 0;
1894 unsigned int load_outside_cost = 0;
1895 unsigned int store_inside_cost = 0;
1896 unsigned int store_outside_cost = 0;
1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1899 stmt_vector_for_cost dummy;
1900 dummy.create (2);
1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 &load_inside_cost,
1903 &load_outside_cost,
1904 &dummy, &dummy, estimated_npeels, true);
1905 dummy.release ();
1907 if (first_store)
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 &store_inside_cost,
1912 &store_outside_cost,
1913 &dummy, &dummy,
1914 estimated_npeels, true);
1915 dummy.release ();
1917 else
1919 store_inside_cost = INT_MAX;
1920 store_outside_cost = INT_MAX;
1923 if (load_inside_cost > store_inside_cost
1924 || (load_inside_cost == store_inside_cost
1925 && load_outside_cost > store_outside_cost))
1927 dr0_info = first_store;
1928 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1931 else
1933 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938 prologue_cost_vec.create (2);
1939 epilogue_cost_vec.create (2);
1941 int dummy2;
1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 (loop_vinfo, estimated_npeels, &dummy2,
1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 &prologue_cost_vec, &epilogue_cost_vec);
1947 prologue_cost_vec.release ();
1948 epilogue_cost_vec.release ();
1950 peel_for_unknown_alignment.peel_info.count = 1
1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1954 peel_for_unknown_alignment.peel_info.npeel = 0;
1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1957 best_peel = peel_for_unknown_alignment;
1959 peel_for_known_alignment.inside_cost = INT_MAX;
1960 peel_for_known_alignment.outside_cost = INT_MAX;
1961 peel_for_known_alignment.peel_info.count = 0;
1962 peel_for_known_alignment.peel_info.dr_info = NULL;
1964 if (do_peeling && one_misalignment_known)
1966 /* Peeling is possible, and every data access is supported even when
1967 misaligned. So we try to choose the best possible peeling from
1968 the hash table. */
1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 (&peeling_htab, loop_vinfo);
1973 /* Compare costs of peeling for known and unknown alignment. */
1974 if (peel_for_known_alignment.peel_info.dr_info != NULL
1975 && peel_for_unknown_alignment.inside_cost
1976 >= peel_for_known_alignment.inside_cost)
1978 best_peel = peel_for_known_alignment;
1980 /* If the best peeling for known alignment has NPEEL == 0, perform no
1981 peeling at all except if there is an unsupportable dr that we can
1982 align. */
1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 do_peeling = false;
1987 /* If there is an unsupportable data ref, prefer this over all choices so far
1988 since we'd have to discard a chosen peeling except when it accidentally
1989 aligned the unsupportable data ref. */
1990 if (one_dr_unsupportable)
1991 dr0_info = unsupportable_dr_info;
1992 else if (do_peeling)
1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 TODO: Use nopeel_outside_cost or get rid of it? */
1996 unsigned nopeel_inside_cost = 0;
1997 unsigned nopeel_outside_cost = 0;
1999 stmt_vector_for_cost dummy;
2000 dummy.create (2);
2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 &nopeel_outside_cost, &dummy, &dummy,
2003 0, false);
2004 dummy.release ();
2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2007 costs will be recorded. */
2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009 prologue_cost_vec.create (2);
2010 epilogue_cost_vec.create (2);
2012 int dummy2;
2013 nopeel_outside_cost += vect_get_known_peeling_cost
2014 (loop_vinfo, 0, &dummy2,
2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 &prologue_cost_vec, &epilogue_cost_vec);
2018 prologue_cost_vec.release ();
2019 epilogue_cost_vec.release ();
2021 npeel = best_peel.peel_info.npeel;
2022 dr0_info = best_peel.peel_info.dr_info;
2024 /* If not peeling at all is no more expensive than the best peeling
2025 we have found so far, don't perform any peeling. */
2026 if (nopeel_inside_cost <= best_peel.inside_cost)
2027 do_peeling = false;
2030 if (do_peeling)
2032 stmt_vec_info stmt_info = dr0_info->stmt;
2033 vectype = STMT_VINFO_VECTYPE (stmt_info);
2035 if (known_alignment_for_access_p (dr0_info))
2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 size_zero_node) < 0;
2039 if (!npeel)
2041 /* Since it's known at compile time, compute the number of
2042 iterations in the peeled loop (the peeling factor) for use in
2043 updating DR_MISALIGNMENT values. The peeling factor is the
2044 vectorization factor minus the misalignment as an element
2045 count. */
2046 mis = (negative
2047 ? DR_MISALIGNMENT (dr0_info)
2048 : -DR_MISALIGNMENT (dr0_info));
2049 /* If known_alignment_for_access_p then we have set
2050 DR_MISALIGNMENT which is only done if we know it at compile
2051 time, so it is safe to assume target alignment is constant.  */
2053 unsigned int target_align =
2054 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055 npeel = ((mis & (target_align - 1))
2056 / vect_get_scalar_dr_size (dr0_info));
2059 /* For interleaved data access every iteration accesses all the
2060 members of the group, therefore we divide the number of iterations
2061 by the group size. */
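/* Illustrative numbers: a 32-byte target alignment, 4-byte elements and
   a misalignment of 16 give NPEEL = 4 elements; for a group of two
   interleaved elements each scalar iteration covers two of them, so
   only 4 / 2 = 2 iterations need to be peeled.  */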
2062 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063 npeel /= DR_GROUP_SIZE (stmt_info);
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "Try peeling by %d\n", npeel);
2070 /* Ensure that all datarefs can be vectorized after the peel. */
2071 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072 do_peeling = false;
2074 /* Check if all datarefs are supportable and log. */
2075 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078 if (!stat)
2079 do_peeling = false;
2080 else
2081 return stat;
2084 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
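/* A sketch of the check below with hypothetical numbers: if NPEEL is 0
   (the exact peel amount is only known at run time) the worst case is
   target_align / dr_size - 1 scalar iterations, e.g. 16 / 4 - 1 = 3;
   peeling is disabled whenever that exceeds the --param limit.  */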
2085 if (do_peeling)
2087 unsigned max_allowed_peel
2088 = param_vect_max_peeling_for_alignment;
2089 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090 max_allowed_peel = 0;
2091 if (max_allowed_peel != (unsigned)-1)
2093 unsigned max_peel = npeel;
2094 if (max_peel == 0)
2096 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097 unsigned HOST_WIDE_INT target_align_c;
2098 if (target_align.is_constant (&target_align_c))
2099 max_peel =
2100 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101 else
2103 do_peeling = false;
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "Disable peeling, max peels set and vector"
2107 " alignment unknown\n");
2110 if (max_peel > max_allowed_peel)
2112 do_peeling = false;
2113 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Disable peeling, max peels reached: %d\n", max_peel);
2120 /* Cost model #2 - if peeling may result in a remaining loop not
2121 iterating enough to be vectorized then do not peel. Since this
2122 is a cost heuristic rather than a correctness decision, use the
2123 most likely runtime value for variable vectorization factors. */
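/* For instance, with an assumed VF of 4 and NPEEL of 0 the worst-case
   peel is 3 iterations, so a loop known to run only 6 times
   (6 < 4 + 3) is not considered worth peeling (hypothetical numbers).  */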
2124 if (do_peeling
2125 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2127 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130 < assumed_vf + max_peel)
2131 do_peeling = false;
2134 if (do_peeling)
2136 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137 If the misalignment of DR_i is identical to that of dr0 then set
2138 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2139 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140 by the peeling factor times the element size of DR_i (MOD the
2141 vectorization factor times the size). Otherwise, the
2142 misalignment of DR_i must be set to unknown. */
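/* As an illustration with hypothetical numbers: if DR_i has 4-byte
   elements, a known misalignment of 8 and the peeling factor is 2,
   its updated misalignment is (8 + 2 * 4) mod 16 = 0 for a 16-byte
   alignment.  */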
2143 FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 if (dr != dr0_info->dr)
2146 /* Strided accesses perform only component accesses, alignment
2147 is irrelevant for them. */
2148 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149 stmt_info = dr_info->stmt;
2150 if (STMT_VINFO_STRIDED_P (stmt_info)
2151 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152 continue;
2154 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2157 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158 if (npeel)
2159 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160 else
2161 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162 = DR_MISALIGNMENT (dr0_info);
2163 SET_DR_MISALIGNMENT (dr0_info, 0);
2164 if (dump_enabled_p ())
2166 dump_printf_loc (MSG_NOTE, vect_location,
2167 "Alignment of access forced using peeling.\n");
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "Peeling for alignment will be applied.\n");
2172 /* The inside-loop cost will be accounted for in vectorizable_load
2173 and vectorizable_store correctly with adjusted alignments.
2174 Drop the body_cst_vec on the floor here. */
2175 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176 gcc_assert (stat);
2177 return stat;
2181 /* (2) Versioning to force alignment. */
2183 /* Try versioning if:
2184 1) optimize loop for speed and the cost-model is not cheap
2185 2) there is at least one unsupported misaligned data ref with an unknown
2186 misalignment, and
2187 3) all misaligned data refs with a known misalignment are supported, and
2188 4) the number of runtime alignment checks is within reason. */
2190 do_versioning
2191 = (optimize_loop_nest_for_speed_p (loop)
2192 && !loop->inner /* FORNOW */
2193 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2195 if (do_versioning)
2197 FOR_EACH_VEC_ELT (datarefs, i, dr)
2199 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200 stmt_vec_info stmt_info = dr_info->stmt;
2202 /* For interleaving, only the alignment of the first access
2203 matters. */
2204 if (aligned_access_p (dr_info)
2205 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207 continue;
2209 if (STMT_VINFO_STRIDED_P (stmt_info))
2211 /* Strided loads perform only component accesses, alignment is
2212 irrelevant for them. */
2213 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214 continue;
2215 do_versioning = false;
2216 break;
2219 supportable_dr_alignment
2220 = vect_supportable_dr_alignment (dr_info, false);
2222 if (!supportable_dr_alignment)
2224 int mask;
2225 tree vectype;
2227 if (known_alignment_for_access_p (dr_info)
2228 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229 >= (unsigned) param_vect_max_version_for_alignment_checks)
2231 do_versioning = false;
2232 break;
2235 vectype = STMT_VINFO_VECTYPE (stmt_info);
2236 gcc_assert (vectype);
2238 /* At present we don't support versioning for alignment
2239 with variable VF, since there's no guarantee that the
2240 VF is a power of two. We could relax this if we added
2241 a way of enforcing a power-of-two size. */
2242 unsigned HOST_WIDE_INT size;
2243 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2245 do_versioning = false;
2246 break;
2249 /* Forcing alignment in the first iteration is no good if
2250 we don't keep it across iterations. For now, just disable
2251 versioning in this case.
2252 ?? We could actually unroll the loop to achieve the required
2253 overall step alignment, and forcing the alignment could be
2254 done by doing some iterations of the non-vectorized loop. */
2255 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256 * DR_STEP_ALIGNMENT (dr),
2257 DR_TARGET_ALIGNMENT (dr_info)))
2259 do_versioning = false;
2260 break;
2263 /* The rightmost bits of an aligned address must be zeros.
2264 Construct the mask needed for this test. For example,
2265 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266 mask must be 15 = 0xf. */
2267 mask = size - 1;
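/* The versioning condition emitted later effectively tests that every
   potentially misaligned address has these bits clear, e.g.
   (addr & 0xf) == 0 for 16-byte vectors.  */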
2269 /* FORNOW: use the same mask to test all potentially unaligned
2270 references in the loop. */
2271 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2272 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2274 do_versioning = false;
2275 break;
2278 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2279 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2283 /* Versioning requires at least one misaligned data reference. */
2284 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2285 do_versioning = false;
2286 else if (!do_versioning)
2287 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2290 if (do_versioning)
2292 vec<stmt_vec_info> may_misalign_stmts
2293 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2294 stmt_vec_info stmt_info;
2296 /* It can now be assumed that the data references in the statements
2297 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2298 of the loop being vectorized. */
2299 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2301 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2302 SET_DR_MISALIGNMENT (dr_info, 0);
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "Alignment of access forced using versioning.\n");
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_NOTE, vect_location,
2310 "Versioning for alignment will be applied.\n");
2312 /* Peeling and versioning can't be done together at this time. */
2313 gcc_assert (! (do_peeling && do_versioning));
2315 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2316 gcc_assert (stat);
2317 return stat;
2320 /* This point is reached if neither peeling nor versioning is being done. */
2321 gcc_assert (! (do_peeling || do_versioning));
2323 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2324 return stat;
2328 /* Function vect_find_same_alignment_drs.
2330 Update group and alignment relations in VINFO according to the chosen
2331 vectorization factor. */
2333 static void
2334 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2336 struct data_reference *dra = DDR_A (ddr);
2337 struct data_reference *drb = DDR_B (ddr);
2338 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2339 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2340 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2341 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2343 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2344 return;
2346 if (dra == drb)
2347 return;
2349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2351 return;
2353 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2354 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2355 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2356 return;
2358 /* Two references with distance zero have the same alignment. */
2359 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2360 - wi::to_poly_offset (DR_INIT (drb)));
2361 if (maybe_ne (diff, 0))
2363 /* Get the wider of the two alignments. */
2364 poly_uint64 align_a =
2365 exact_div (vect_calculate_target_alignment (dr_info_a),
2366 BITS_PER_UNIT);
2367 poly_uint64 align_b =
2368 exact_div (vect_calculate_target_alignment (dr_info_b),
2369 BITS_PER_UNIT);
2370 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2371 if (!align_a.is_constant (&align_a_c)
2372 || !align_b.is_constant (&align_b_c))
2373 return;
2375 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2377 /* Require the gap to be a multiple of the larger vector alignment. */
2378 if (!multiple_p (diff, max_align))
2379 return;
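/* At this point the two references are known to share the same
   misalignment. For example, assuming a 16-byte target alignment,
   4-byte accesses a[i] and a[i+4] are 16 bytes apart and therefore
   always equally (mis)aligned, whereas a[i] and a[i+1] are not.  */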
2382 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2383 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_NOTE, vect_location,
2386 "accesses have the same alignment: %T and %T\n",
2387 DR_REF (dra), DR_REF (drb));
2391 /* Function vect_analyze_data_refs_alignment
2393 Analyze the alignment of the data-references in the loop.
2394 Return FALSE if a data reference is found that cannot be vectorized. */
2396 opt_result
2397 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2399 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2401 /* Mark groups of data references with same alignment using
2402 data dependence information. */
2403 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2404 struct data_dependence_relation *ddr;
2405 unsigned int i;
2407 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2408 vect_find_same_alignment_drs (vinfo, ddr);
2410 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2411 struct data_reference *dr;
2413 vect_record_base_alignments (vinfo);
2414 FOR_EACH_VEC_ELT (datarefs, i, dr)
2416 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2417 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2418 vect_compute_data_ref_alignment (dr_info);
2421 return opt_result::success ();
2425 /* Analyze alignment of DRs of stmts in NODE. */
2427 static bool
2428 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2430 /* We vectorize from the first scalar stmt in the node unless
2431 the node is permuted in which case we start from the first
2432 element in the group. */
2433 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2434 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2435 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2436 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439 vect_compute_data_ref_alignment (dr_info);
2440 /* For creating the data-ref pointer we need alignment of the
2441 first element anyway. */
2442 if (dr_info != first_dr_info)
2443 vect_compute_data_ref_alignment (first_dr_info);
2444 if (! verify_data_ref_alignment (dr_info))
2446 if (dump_enabled_p ())
2447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448 "not vectorized: bad data alignment in basic "
2449 "block.\n");
2450 return false;
2453 return true;
2456 /* Function vect_slp_analyze_and_verify_instance_alignment
2458 Analyze the alignment of the data-references in the SLP instance.
2459 Return FALSE if a data reference is found that cannot be vectorized. */
2461 bool
2462 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2464 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2466 slp_tree node;
2467 unsigned i;
2468 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2469 if (! vect_slp_analyze_and_verify_node_alignment (node))
2470 return false;
2472 node = SLP_INSTANCE_TREE (instance);
2473 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2474 && ! vect_slp_analyze_and_verify_node_alignment
2475 (SLP_INSTANCE_TREE (instance)))
2476 return false;
2478 return true;
2482 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2483 accesses of legal size, step, etc. Detect gaps, single element
2484 interleaving, and other special cases. Set grouped access info.
2485 Collect groups of strided stores for further use in SLP analysis.
2486 Worker for vect_analyze_group_access. */
2488 static bool
2489 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2491 data_reference *dr = dr_info->dr;
2492 tree step = DR_STEP (dr);
2493 tree scalar_type = TREE_TYPE (DR_REF (dr));
2494 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2495 stmt_vec_info stmt_info = dr_info->stmt;
2496 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2497 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2498 HOST_WIDE_INT dr_step = -1;
2499 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2500 bool slp_impossible = false;
2502 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2503 size of the interleaving group (including gaps). */
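/* For example (illustrative): loads a[4*i], a[4*i+1] and a[4*i+2] of a
   4-byte type all have DR_STEP 16, so GROUPSIZE is 16 / 4 = 4 and the
   group contains a gap of one element.  */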
2504 if (tree_fits_shwi_p (step))
2506 dr_step = tree_to_shwi (step);
2507 /* Check that STEP is a multiple of type size. Otherwise there is
2508 a non-element-sized gap at the end of the group which we
2509 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2510 ??? As we can handle non-constant step fine here we should
2511 simply remove uses of DR_GROUP_GAP between the last and first
2512 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2513 simply not include that gap. */
2514 if ((dr_step % type_size) != 0)
2516 if (dump_enabled_p ())
2517 dump_printf_loc (MSG_NOTE, vect_location,
2518 "Step %T is not a multiple of the element size"
2519 " for %T\n",
2520 step, DR_REF (dr));
2521 return false;
2523 groupsize = absu_hwi (dr_step) / type_size;
2525 else
2526 groupsize = 0;
2528 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2529 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2531 /* Check if this DR is a part of interleaving, and is a single
2532 element of the group that is accessed in the loop. */
2534 /* Gaps are supported only for loads. STEP must be a multiple of the type
2535 size. */
2536 if (DR_IS_READ (dr)
2537 && (dr_step % type_size) == 0
2538 && groupsize > 0)
2540 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2541 DR_GROUP_SIZE (stmt_info) = groupsize;
2542 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2543 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_NOTE, vect_location,
2545 "Detected single element interleaving %T"
2546 " step %T\n",
2547 DR_REF (dr), step);
2549 return true;
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 "not consecutive access %G", stmt_info->stmt);
2556 if (bb_vinfo)
2558 /* Mark the statement as unvectorizable. */
2559 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2560 return true;
2563 if (dump_enabled_p ())
2564 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2565 STMT_VINFO_STRIDED_P (stmt_info) = true;
2566 return true;
2569 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2571 /* First stmt in the interleaving chain. Check the chain. */
2572 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2573 struct data_reference *data_ref = dr;
2574 unsigned int count = 1;
2575 tree prev_init = DR_INIT (data_ref);
2576 HOST_WIDE_INT diff, gaps = 0;
2578 /* By construction, all group members have INTEGER_CST DR_INITs. */
2579 while (next)
2581 /* We never have the same DR multiple times. */
2582 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2583 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2585 data_ref = STMT_VINFO_DATA_REF (next);
2587 /* All group members have the same STEP by construction. */
2588 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2590 /* Check that the distance between two accesses is equal to the type
2591 size. Otherwise, we have gaps. */
2592 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2593 - TREE_INT_CST_LOW (prev_init)) / type_size;
2594 if (diff != 1)
2596 /* FORNOW: SLP of accesses with gaps is not supported. */
2597 slp_impossible = true;
2598 if (DR_IS_WRITE (data_ref))
2600 if (dump_enabled_p ())
2601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2602 "interleaved store with gaps\n");
2603 return false;
2606 gaps += diff - 1;
2609 last_accessed_element += diff;
2611 /* Store the gap from the previous member of the group. If there is no
2612 gap in the access, DR_GROUP_GAP is always 1. */
2613 DR_GROUP_GAP (next) = diff;
2615 prev_init = DR_INIT (data_ref);
2616 next = DR_GROUP_NEXT_ELEMENT (next);
2617 /* Count the number of data-refs in the chain. */
2618 count++;
2621 if (groupsize == 0)
2622 groupsize = count + gaps;
2624 /* This could be UINT_MAX but as we are generating code in a very
2625 inefficient way we have to cap earlier. See PR78699 for example. */
2626 if (groupsize > 4096)
2628 if (dump_enabled_p ())
2629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630 "group is too large\n");
2631 return false;
2634 /* Check that the size of the interleaving is equal to count for stores,
2635 i.e., that there are no gaps. */
2636 if (groupsize != count
2637 && !DR_IS_READ (dr))
2639 groupsize = count;
2640 STMT_VINFO_STRIDED_P (stmt_info) = true;
2643 /* If there is a gap after the last load in the group it is the
2644 difference between the groupsize and the last accessed
2645 element.
2646 When there is no gap, this difference should be 0. */
2647 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2649 DR_GROUP_SIZE (stmt_info) = groupsize;
2650 if (dump_enabled_p ())
2652 dump_printf_loc (MSG_NOTE, vect_location,
2653 "Detected interleaving ");
2654 if (DR_IS_READ (dr))
2655 dump_printf (MSG_NOTE, "load ");
2656 else if (STMT_VINFO_STRIDED_P (stmt_info))
2657 dump_printf (MSG_NOTE, "strided store ");
2658 else
2659 dump_printf (MSG_NOTE, "store ");
2660 dump_printf (MSG_NOTE, "of size %u\n",
2661 (unsigned)groupsize);
2662 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2663 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2664 while (next)
2666 if (DR_GROUP_GAP (next) != 1)
2667 dump_printf_loc (MSG_NOTE, vect_location,
2668 "\t<gap of %d elements>\n",
2669 DR_GROUP_GAP (next) - 1);
2670 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2671 next = DR_GROUP_NEXT_ELEMENT (next);
2673 if (DR_GROUP_GAP (stmt_info) != 0)
2674 dump_printf_loc (MSG_NOTE, vect_location,
2675 "\t<gap of %d elements>\n",
2676 DR_GROUP_GAP (stmt_info));
2679 /* SLP: create an SLP data structure for every interleaving group of
2680 stores for further analysis in vect_analyse_slp. */
2681 if (DR_IS_WRITE (dr) && !slp_impossible)
2683 if (loop_vinfo)
2684 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2685 if (bb_vinfo)
2686 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2690 return true;
2693 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2694 accesses of legal size, step, etc. Detect gaps, single element
2695 interleaving, and other special cases. Set grouped access info.
2696 Collect groups of strided stores for further use in SLP analysis. */
2698 static bool
2699 vect_analyze_group_access (dr_vec_info *dr_info)
2701 if (!vect_analyze_group_access_1 (dr_info))
2703 /* Dissolve the group if present. */
2704 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2705 while (stmt_info)
2707 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2708 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2709 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2710 stmt_info = next;
2712 return false;
2714 return true;
2717 /* Analyze the access pattern of the data-reference DR_INFO.
2718 In case of non-consecutive accesses call vect_analyze_group_access() to
2719 analyze groups of accesses. */
2721 static bool
2722 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2724 data_reference *dr = dr_info->dr;
2725 tree step = DR_STEP (dr);
2726 tree scalar_type = TREE_TYPE (DR_REF (dr));
2727 stmt_vec_info stmt_info = dr_info->stmt;
2728 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2729 class loop *loop = NULL;
2731 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2732 return true;
2734 if (loop_vinfo)
2735 loop = LOOP_VINFO_LOOP (loop_vinfo);
2737 if (loop_vinfo && !step)
2739 if (dump_enabled_p ())
2740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2741 "bad data-ref access in loop\n");
2742 return false;
2745 /* Allow loads with zero step in inner-loop vectorization. */
2746 if (loop_vinfo && integer_zerop (step))
2748 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2749 if (!nested_in_vect_loop_p (loop, stmt_info))
2750 return DR_IS_READ (dr);
2751 /* Allow references with zero step for outer loops marked
2752 with pragma omp simd only - it guarantees absence of
2753 loop-carried dependencies between inner loop iterations. */
2754 if (loop->safelen < 2)
2756 if (dump_enabled_p ())
2757 dump_printf_loc (MSG_NOTE, vect_location,
2758 "zero step in inner loop of nest\n");
2759 return false;
2763 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2765 /* Interleaved accesses are not yet supported within outer-loop
2766 vectorization for references in the inner-loop. */
2767 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2769 /* For the rest of the analysis we use the outer-loop step. */
2770 step = STMT_VINFO_DR_STEP (stmt_info);
2771 if (integer_zerop (step))
2773 if (dump_enabled_p ())
2774 dump_printf_loc (MSG_NOTE, vect_location,
2775 "zero step in outer loop.\n");
2776 return DR_IS_READ (dr);
2780 /* Consecutive? */
2781 if (TREE_CODE (step) == INTEGER_CST)
2783 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2784 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2785 || (dr_step < 0
2786 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2788 /* Mark that it is not interleaving. */
2789 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2790 return true;
2794 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "grouped access in outer loop.\n");
2799 return false;
2803 /* Assume this is a DR handled by non-constant strided load case. */
2804 if (TREE_CODE (step) != INTEGER_CST)
2805 return (STMT_VINFO_STRIDED_P (stmt_info)
2806 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2807 || vect_analyze_group_access (dr_info)));
2809 /* Not a consecutive access - check if it's part of an interleaving group. */
2810 return vect_analyze_group_access (dr_info);
2813 /* Compare two data-references DRA and DRB to sort them into chunks
2814 suitable for grouping. */
2816 static int
2817 dr_group_sort_cmp (const void *dra_, const void *drb_)
2819 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2820 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2821 int cmp;
2823 /* Stabilize sort. */
2824 if (dra == drb)
2825 return 0;
2827 /* DRs in different loops never belong to the same group. */
2828 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2829 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2830 if (loopa != loopb)
2831 return loopa->num < loopb->num ? -1 : 1;
2833 /* Ordering of DRs according to base. */
2834 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2835 DR_BASE_ADDRESS (drb));
2836 if (cmp != 0)
2837 return cmp;
2839 /* And according to DR_OFFSET. */
2840 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2841 if (cmp != 0)
2842 return cmp;
2844 /* Put reads before writes. */
2845 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2846 return DR_IS_READ (dra) ? -1 : 1;
2848 /* Then sort by access size. */
2849 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2850 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2851 if (cmp != 0)
2852 return cmp;
2854 /* And then by step. */
2855 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2856 if (cmp != 0)
2857 return cmp;
2859 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2860 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2861 if (cmp == 0)
2862 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2863 return cmp;
2866 /* If OP is the result of a conversion, return the unconverted value,
2867 otherwise return null. */
2869 static tree
2870 strip_conversion (tree op)
2872 if (TREE_CODE (op) != SSA_NAME)
2873 return NULL_TREE;
2874 gimple *stmt = SSA_NAME_DEF_STMT (op);
2875 if (!is_gimple_assign (stmt)
2876 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2877 return NULL_TREE;
2878 return gimple_assign_rhs1 (stmt);
2881 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2882 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2883 be grouped in SLP mode. */
2885 static bool
2886 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2887 bool allow_slp_p)
2889 if (gimple_assign_single_p (stmt1_info->stmt))
2890 return gimple_assign_single_p (stmt2_info->stmt);
2892 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2893 if (call1 && gimple_call_internal_p (call1))
2895 /* Check for two masked loads or two masked stores. */
2896 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2897 if (!call2 || !gimple_call_internal_p (call2))
2898 return false;
2899 internal_fn ifn = gimple_call_internal_fn (call1);
2900 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2901 return false;
2902 if (ifn != gimple_call_internal_fn (call2))
2903 return false;
2905 /* Check that the masks are the same. Cope with casts of masks,
2906 like those created by build_mask_conversion. */
2907 tree mask1 = gimple_call_arg (call1, 2);
2908 tree mask2 = gimple_call_arg (call2, 2);
2909 if (!operand_equal_p (mask1, mask2, 0)
2910 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2912 mask1 = strip_conversion (mask1);
2913 if (!mask1)
2914 return false;
2915 mask2 = strip_conversion (mask2);
2916 if (!mask2)
2917 return false;
2918 if (!operand_equal_p (mask1, mask2, 0))
2919 return false;
2921 return true;
2924 return false;
2927 /* Function vect_analyze_data_ref_accesses.
2929 Analyze the access pattern of all the data references in the loop.
2931 FORNOW: the only access pattern that is considered vectorizable is a
2932 simple step 1 (consecutive) access.
2934 FORNOW: handle only arrays and pointer accesses. */
2936 opt_result
2937 vect_analyze_data_ref_accesses (vec_info *vinfo)
2939 unsigned int i;
2940 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2941 struct data_reference *dr;
2943 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2945 if (datarefs.is_empty ())
2946 return opt_result::success ();
2948 /* Sort the array of datarefs to make building the interleaving chains
2949 linear. Don't modify the original vector's order, it is needed for
2950 determining what dependencies are reversed. */
2951 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2952 datarefs_copy.qsort (dr_group_sort_cmp);
2953 hash_set<stmt_vec_info> to_fixup;
2955 /* Build the interleaving chains. */
2956 for (i = 0; i < datarefs_copy.length () - 1;)
2958 data_reference_p dra = datarefs_copy[i];
2959 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2960 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2961 stmt_vec_info lastinfo = NULL;
2962 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2963 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2965 ++i;
2966 continue;
2968 for (i = i + 1; i < datarefs_copy.length (); ++i)
2970 data_reference_p drb = datarefs_copy[i];
2971 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2972 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2973 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2974 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2975 break;
2977 /* ??? Imperfect sorting (non-compatible types, non-modulo
2978 accesses, same accesses) can lead to a group being artificially
2979 split here as we don't just skip over those. If it really
2980 matters we can push those to a worklist and re-iterate
2981 over them. Then we could just skip ahead to the next DR here. */
2983 /* DRs in a different loop should not be put into the same
2984 interleaving group. */
2985 if (gimple_bb (DR_STMT (dra))->loop_father
2986 != gimple_bb (DR_STMT (drb))->loop_father)
2987 break;
2989 /* Check that the data-refs have the same first location (except init)
2990 and they are both either store or load (not load and store,
2991 not masked loads or stores). */
2992 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2993 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2994 DR_BASE_ADDRESS (drb)) != 0
2995 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2996 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2997 break;
2999 /* Check that the data-refs have the same constant size. */
3000 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3001 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3002 if (!tree_fits_uhwi_p (sza)
3003 || !tree_fits_uhwi_p (szb)
3004 || !tree_int_cst_equal (sza, szb))
3005 break;
3007 /* Check that the data-refs have the same step. */
3008 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3009 break;
3011 /* Check the types are compatible.
3012 ??? We don't distinguish this during sorting. */
3013 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3014 TREE_TYPE (DR_REF (drb))))
3015 break;
3017 /* Check that the DR_INITs are compile-time constants. */
3018 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3019 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3020 break;
3022 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3023 just hold extra information. */
3024 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3025 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3026 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3027 break;
3029 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3030 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3031 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3032 HOST_WIDE_INT init_prev
3033 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3034 gcc_assert (init_a <= init_b
3035 && init_a <= init_prev
3036 && init_prev <= init_b);
3038 /* Do not place the same access in the interleaving chain twice. */
3039 if (init_b == init_prev)
3041 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3042 < gimple_uid (DR_STMT (drb)));
3043 /* Simply link in duplicates and fix up the chain below. */
3045 else
3047 /* If init_b == init_a + the size of the type * k, we have an
3048 interleaving, and DRA is accessed before DRB. */
3049 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3050 if (type_size_a == 0
3051 || (init_b - init_a) % type_size_a != 0)
3052 break;
3054 /* If we have a store, the accesses are adjacent. This splits
3055 groups into chunks we support (we don't support vectorization
3056 of stores with gaps). */
3057 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3058 break;
3060 /* If the step (if not zero or non-constant) is greater than the
3061 difference between data-refs' inits this splits groups into
3062 suitable sizes. */
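/* E.g. with 4-byte elements and a DR_STEP of 8, DRs at inits 0 and 4
   can form one group, while a DR at init 8 is already a full step away
   and starts a new group (illustrative numbers).  */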
3063 if (tree_fits_shwi_p (DR_STEP (dra)))
3065 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3066 if (step != 0 && step <= (init_b - init_a))
3067 break;
3071 if (dump_enabled_p ())
3072 dump_printf_loc (MSG_NOTE, vect_location,
3073 DR_IS_READ (dra)
3074 ? "Detected interleaving load %T and %T\n"
3075 : "Detected interleaving store %T and %T\n",
3076 DR_REF (dra), DR_REF (drb));
3078 /* Link the found element into the group list. */
3079 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3081 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3082 lastinfo = stmtinfo_a;
3084 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3085 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3086 lastinfo = stmtinfo_b;
3088 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3089 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3091 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3092 dump_printf_loc (MSG_NOTE, vect_location,
3093 "Load suitable for SLP vectorization only.\n");
3095 if (init_b == init_prev
3096 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3097 && dump_enabled_p ())
3098 dump_printf_loc (MSG_NOTE, vect_location,
3099 "Queuing group with duplicate access for fixup\n");
3103 /* Fix up groups with duplicate entries by splitting them. */
3104 while (1)
3106 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3107 if (!(it != to_fixup.end ()))
3108 break;
3109 stmt_vec_info grp = *it;
3110 to_fixup.remove (grp);
3112 /* Find the earliest duplicate group member. */
3113 unsigned first_duplicate = -1u;
3114 stmt_vec_info next, g = grp;
3115 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3117 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3118 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3119 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3120 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3121 g = next;
3123 if (first_duplicate == -1U)
3124 continue;
3126 /* Then move all stmts after the first duplicate to a new group.
3127 Note this is a heuristic but one with the property that *it
3128 is fixed up completely. */
3129 g = grp;
3130 stmt_vec_info newgroup = NULL, ng = grp;
3131 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3133 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3135 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3136 if (!newgroup)
3137 newgroup = next;
3138 else
3139 DR_GROUP_NEXT_ELEMENT (ng) = next;
3140 ng = next;
3141 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3143 else
3144 g = DR_GROUP_NEXT_ELEMENT (g);
3146 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3148 /* Fixup the new group which still may contain duplicates. */
3149 to_fixup.add (newgroup);
3152 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3154 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3155 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3156 && !vect_analyze_data_ref_access (dr_info))
3158 if (dump_enabled_p ())
3159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3160 "not vectorized: complicated access pattern.\n");
3162 if (is_a <bb_vec_info> (vinfo))
3164 /* Mark the statement as not vectorizable. */
3165 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3166 continue;
3168 else
3170 datarefs_copy.release ();
3171 return opt_result::failure_at (dr_info->stmt->stmt,
3172 "not vectorized:"
3173 " complicated access pattern.\n");
3178 datarefs_copy.release ();
3179 return opt_result::success ();
3182 /* Function vect_vfa_segment_size.
3184 Input:
3185 DR_INFO: The data reference.
3186 LENGTH_FACTOR: segment length to consider.
3188 Return a value suitable for the dr_with_seg_len::seg_len field.
3189 This is the "distance travelled" by the pointer from the first
3190 iteration in the segment to the last. Note that it does not include
3191 the size of the access; in effect it only describes the first byte. */
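/* For example, with a DR_STEP of 4 and LENGTH_FACTOR equal to the
   iteration count N, the result is 4 * (N - 1): the start of the last
   access lies that many bytes past the start of the first one
   (illustrative numbers).  */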
3193 static tree
3194 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3196 length_factor = size_binop (MINUS_EXPR,
3197 fold_convert (sizetype, length_factor),
3198 size_one_node);
3199 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3200 length_factor);
3203 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3204 gives the worst-case number of bytes covered by the segment. */
3206 static unsigned HOST_WIDE_INT
3207 vect_vfa_access_size (dr_vec_info *dr_info)
3209 stmt_vec_info stmt_vinfo = dr_info->stmt;
3210 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3211 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3212 unsigned HOST_WIDE_INT access_size = ref_size;
3213 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3215 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3216 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3218 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3219 && (vect_supportable_dr_alignment (dr_info, false)
3220 == dr_explicit_realign_optimized))
3222 /* We might access a full vector's worth. */
3223 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3224 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3226 return access_size;
3229 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3230 describes. */
3232 static unsigned int
3233 vect_vfa_align (dr_vec_info *dr_info)
3235 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3238 /* Function vect_compile_time_alias.
3240 Given data references A and B with equal base and offset, see whether
3241 the alias relation can be decided at compilation time. Return 1 if
3242 it can and the references alias, 0 if it can and the references do
3243 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3244 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3245 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3247 static int
3248 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3249 tree segment_length_a, tree segment_length_b,
3250 unsigned HOST_WIDE_INT access_size_a,
3251 unsigned HOST_WIDE_INT access_size_b)
3253 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3254 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3255 poly_uint64 const_length_a;
3256 poly_uint64 const_length_b;
3258 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3259 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3260 [a, a+12) */
3261 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3263 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3264 offset_a = (offset_a + access_size_a) - const_length_a;
3266 else
3267 const_length_a = tree_to_poly_uint64 (segment_length_a);
3268 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3270 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3271 offset_b = (offset_b + access_size_b) - const_length_b;
3273 else
3274 const_length_b = tree_to_poly_uint64 (segment_length_b);
3276 const_length_a += access_size_a;
3277 const_length_b += access_size_b;
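/* Illustrative constant case: with offsets 0 and 8 and total lengths
   of 20 bytes each, the ranges [0, 20) and [8, 28) are known to
   overlap, so the references alias (return 1); with offsets 0 and 32
   they cannot overlap (return 0).  */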
3279 if (ranges_known_overlap_p (offset_a, const_length_a,
3280 offset_b, const_length_b))
3281 return 1;
3283 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3284 offset_b, const_length_b))
3285 return 0;
3287 return -1;
3290 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3291 in DDR is >= VF. */
3293 static bool
3294 dependence_distance_ge_vf (data_dependence_relation *ddr,
3295 unsigned int loop_depth, poly_uint64 vf)
3297 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3298 || DDR_NUM_DIST_VECTS (ddr) == 0)
3299 return false;
3301 /* If the dependence is exact, we should have limited the VF instead. */
3302 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3304 unsigned int i;
3305 lambda_vector dist_v;
3306 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3308 HOST_WIDE_INT dist = dist_v[loop_depth];
3309 if (dist != 0
3310 && !(dist > 0 && DDR_REVERSED_P (ddr))
3311 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3312 return false;
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "dependence distance between %T and %T is >= VF\n",
3318 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3320 return true;
3323 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3325 static void
3326 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3328 dump_printf (dump_kind, "%s (%T) >= ",
3329 lower_bound.unsigned_p ? "unsigned" : "abs",
3330 lower_bound.expr);
3331 dump_dec (dump_kind, lower_bound.min_value);
3334 /* Record that the vectorized loop requires the vec_lower_bound described
3335 by EXPR, UNSIGNED_P and MIN_VALUE. */
3337 static void
3338 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3339 poly_uint64 min_value)
3341 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3342 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3343 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3345 unsigned_p &= lower_bounds[i].unsigned_p;
3346 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3347 if (lower_bounds[i].unsigned_p != unsigned_p
3348 || maybe_lt (lower_bounds[i].min_value, min_value))
3350 lower_bounds[i].unsigned_p = unsigned_p;
3351 lower_bounds[i].min_value = min_value;
3352 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_NOTE, vect_location,
3355 "updating run-time check to ");
3356 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3357 dump_printf (MSG_NOTE, "\n");
3360 return;
3363 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3364 if (dump_enabled_p ())
3366 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3367 dump_lower_bound (MSG_NOTE, lower_bound);
3368 dump_printf (MSG_NOTE, "\n");
3370 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3373 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3374 will span fewer than GAP bytes. */
3376 static bool
3377 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3378 poly_int64 gap)
3380 stmt_vec_info stmt_info = dr_info->stmt;
3381 HOST_WIDE_INT count
3382 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3383 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3384 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3385 return (estimated_poly_value (gap)
3386 <= count * vect_get_scalar_dr_size (dr_info));
3389 /* Return true if we know that there is no alias between DR_INFO_A and
3390 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3391 When returning true, set *LOWER_BOUND_OUT to this N. */
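/* For example (hypothetical numbers): two 4-byte references at offsets
   0 and 12 from the same base and with the same step span 16 bytes
   together, so *LOWER_BOUND_OUT is 16 and there is no alias whenever
   abs (DR_STEP) >= 16.  */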
3393 static bool
3394 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3395 poly_uint64 *lower_bound_out)
3397 /* Check that there is a constant gap of known sign between DR_A
3398 and DR_B. */
3399 data_reference *dr_a = dr_info_a->dr;
3400 data_reference *dr_b = dr_info_b->dr;
3401 poly_int64 init_a, init_b;
3402 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3403 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3404 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3405 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3406 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3407 || !ordered_p (init_a, init_b))
3408 return false;
3410 /* Sort DR_A and DR_B by the address they access. */
3411 if (maybe_lt (init_b, init_a))
3413 std::swap (init_a, init_b);
3414 std::swap (dr_info_a, dr_info_b);
3415 std::swap (dr_a, dr_b);
3418 /* If the two accesses could be dependent within a scalar iteration,
3419 make sure that we'd retain their order. */
3420 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3421 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3422 return false;
3424 /* There is no alias if abs (DR_STEP) is greater than or equal to
3425 the bytes spanned by the combination of the two accesses. */
3426 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3427 return true;
3430 /* Function vect_prune_runtime_alias_test_list.
3432 Prune a list of ddrs to be tested at run-time by versioning for alias.
3433 Merge several alias checks into one if possible.
3434 Return FALSE if the resulting list of ddrs is longer than allowed by
3435 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3437 opt_result
3438 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3440 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3441 hash_set <tree_pair_hash> compared_objects;
3443 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3444 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3445 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3446 vec<vec_object_pair> &check_unequal_addrs
3447 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3448 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3449 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3451 ddr_p ddr;
3452 unsigned int i;
3453 tree length_factor;
3455 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3457 /* Step values are irrelevant for aliasing if the number of vector
3458 iterations is equal to the number of scalar iterations (which can
3459 happen for fully-SLP loops). */
3460 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3462 if (!ignore_step_p)
3464 /* Convert the checks for nonzero steps into bound tests. */
3465 tree value;
3466 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3467 vect_check_lower_bound (loop_vinfo, value, true, 1);
3470 if (may_alias_ddrs.is_empty ())
3471 return opt_result::success ();
3473 comp_alias_ddrs.create (may_alias_ddrs.length ());
3475 unsigned int loop_depth
3476 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3477 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3479 /* First, we collect all data ref pairs for aliasing checks. */
3480 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3482 poly_uint64 lower_bound;
3483 tree segment_length_a, segment_length_b;
3484 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3485 unsigned int align_a, align_b;
3487 /* Ignore the alias if the VF we chose ended up being no greater
3488 than the dependence distance. */
3489 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3490 continue;
3492 if (DDR_OBJECT_A (ddr))
3494 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3495 if (!compared_objects.add (new_pair))
3497 if (dump_enabled_p ())
3498 dump_printf_loc (MSG_NOTE, vect_location,
3499 "checking that %T and %T"
3500 " have different addresses\n",
3501 new_pair.first, new_pair.second);
3502 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3504 continue;
3507 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3508 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3510 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3511 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3513 bool preserves_scalar_order_p
3514 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3516 /* Skip the pair if inter-iteration dependencies are irrelevant
3517 and intra-iteration dependencies are guaranteed to be honored. */
3518 if (ignore_step_p
3519 && (preserves_scalar_order_p
3520 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3521 &lower_bound)))
3523 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "no need for alias check between "
3526 "%T and %T when VF is 1\n",
3527 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3528 continue;
3531 /* See whether we can handle the alias using a bounds check on
3532 the step, and whether that's likely to be the best approach.
3533 (It might not be, for example, if the minimum step is much larger
3534 than the number of bytes handled by one vector iteration.) */
3535 if (!ignore_step_p
3536 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3537 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3538 &lower_bound)
3539 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3540 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3542 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3543 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3546 "%T and %T when the step %T is outside ",
3547 DR_REF (dr_info_a->dr),
3548 DR_REF (dr_info_b->dr),
3549 DR_STEP (dr_info_a->dr));
3550 if (unsigned_p)
3551 dump_printf (MSG_NOTE, "[0");
3552 else
3554 dump_printf (MSG_NOTE, "(");
3555 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3557 dump_printf (MSG_NOTE, ", ");
3558 dump_dec (MSG_NOTE, lower_bound);
3559 dump_printf (MSG_NOTE, ")\n");
3561 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3562 unsigned_p, lower_bound);
3563 continue;
3566 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3567 if (dr_group_first_a)
3569 stmt_info_a = dr_group_first_a;
3570 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3573 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3574 if (dr_group_first_b)
3576 stmt_info_b = dr_group_first_b;
3577 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3580 if (ignore_step_p)
3582 segment_length_a = size_zero_node;
3583 segment_length_b = size_zero_node;
3585 else
3587 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3588 DR_STEP (dr_info_b->dr), 0))
3589 length_factor = scalar_loop_iters;
3590 else
3591 length_factor = size_int (vect_factor);
3592 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3593 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3595 access_size_a = vect_vfa_access_size (dr_info_a);
3596 access_size_b = vect_vfa_access_size (dr_info_b);
3597 align_a = vect_vfa_align (dr_info_a);
3598 align_b = vect_vfa_align (dr_info_b);
3600 /* See whether the alias is known at compilation time. */
3601 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3602 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3603 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3604 DR_OFFSET (dr_info_b->dr), 0)
3605 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3606 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3607 && poly_int_tree_p (segment_length_a)
3608 && poly_int_tree_p (segment_length_b))
3610 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3611 segment_length_a,
3612 segment_length_b,
3613 access_size_a,
3614 access_size_b);
3615 if (res >= 0 && dump_enabled_p ())
3617 dump_printf_loc (MSG_NOTE, vect_location,
3618 "can tell at compile time that %T and %T",
3619 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3620 if (res == 0)
3621 dump_printf (MSG_NOTE, " do not alias\n");
3622 else
3623 dump_printf (MSG_NOTE, " alias\n");
3626 if (res == 0)
3627 continue;
3629 if (res == 1)
3630 return opt_result::failure_at (stmt_info_b->stmt,
3631 "not vectorized:"
3632 " compilation time alias: %G%G",
3633 stmt_info_a->stmt,
3634 stmt_info_b->stmt);
3637 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3638 access_size_a, align_a);
3639 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3640 access_size_b, align_b);
3641 /* Canonicalize the order to be the one that's needed for accurate
3642 RAW, WAR and WAW flags, in cases where the data references are
3643 well-ordered. The order doesn't really matter otherwise,
3644 but we might as well be consistent. */
3645 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3646 std::swap (dr_a, dr_b);
3648 dr_with_seg_len_pair_t dr_with_seg_len_pair
3649 (dr_a, dr_b, (preserves_scalar_order_p
3650 ? dr_with_seg_len_pair_t::WELL_ORDERED
3651 : dr_with_seg_len_pair_t::REORDERED));
3653 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3656 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3658 unsigned int count = (comp_alias_ddrs.length ()
3659 + check_unequal_addrs.length ());
3661 if (dump_enabled_p ())
3662 dump_printf_loc (MSG_NOTE, vect_location,
3663 "improved number of alias checks from %d to %d\n",
3664 may_alias_ddrs.length (), count);
3665 unsigned limit = param_vect_max_version_for_alias_checks;
3666 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3667 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3668 if (count > limit)
3669 return opt_result::failure_at
3670 (vect_location,
3671 "number of versioning for alias run-time tests exceeds %d "
3672 "(--param vect-max-version-for-alias-checks)\n", limit);
3674 return opt_result::success ();
3677 /* Check whether we can use an internal function for a gather load
3678 or scatter store. READ_P is true for loads and false for stores.
3679 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3680 the type of the memory elements being loaded or stored. OFFSET_TYPE
3681 is the type of the offset that is being applied to the invariant
3682 base address. SCALE is the amount by which the offset should
3683 be multiplied *after* it has been converted to address width.
3685 Return true if the function is supported, storing the function id in
3686 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
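/* A sketch of how the retry loop below typically behaves (target-dependent,
   so stated only as an assumption for illustration): if the caller asks for
   a gather of 32-bit elements with a 16-bit OFFSET_TYPE and the target only
   provides IFN_GATHER_LOAD with 32-bit or wider offset vectors, the loop
   doubles the offset precision to 32 bits and succeeds on the second
   attempt.  */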
3688 bool
3689 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3690 tree vectype, tree memory_type, tree offset_type,
3691 int scale, internal_fn *ifn_out,
3692 tree *offset_vectype_out)
3694 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3695 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3696 if (element_bits != memory_bits)
3697 /* For now the vector elements must be the same width as the
3698 memory elements. */
3699 return false;
3701 /* Work out which function we need. */
3702 internal_fn ifn;
3703 if (read_p)
3704 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3705 else
3706 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3708 for (;;)
3710 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3711 if (!offset_vectype)
3712 return false;
3714 /* Test whether the target supports this combination. */
3715 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3716 offset_vectype, scale))
3718 *ifn_out = ifn;
3719 *offset_vectype_out = offset_vectype;
3720 return true;
3723 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3724 && TYPE_PRECISION (offset_type) >= element_bits)
3725 return false;
3727 offset_type = build_nonstandard_integer_type
3728 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3732 /* STMT_INFO is a call to an internal gather load or scatter store function.
3733 Describe the operation in INFO. */
3735 static void
3736 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3737 gather_scatter_info *info)
3739 gcall *call = as_a <gcall *> (stmt_info->stmt);
3740 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3741 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3743 info->ifn = gimple_call_internal_fn (call);
3744 info->decl = NULL_TREE;
3745 info->base = gimple_call_arg (call, 0);
3746 info->offset = gimple_call_arg (call, 1);
3747 info->offset_dt = vect_unknown_def_type;
3748 info->offset_vectype = NULL_TREE;
3749 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3750 info->element_type = TREE_TYPE (vectype);
3751 info->memory_type = TREE_TYPE (DR_REF (dr));
3754 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3755 gather load or scatter store. Describe the operation in *INFO if so. */
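/* An illustrative (hypothetical) example of the kind of access handled
   here: in a loop like

     for (int i = 0; i < n; i++)
       out[i] = in[idx[i]];

   the read in[idx[i]] has no affine evolution in I, but it can be expressed
   as a gather with a loop-invariant base derived from IN, a per-lane offset
   taken from idx[i] and a scale of sizeof (*in); *INFO describes that
   decomposition on success.  */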
3757 bool
3758 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3759 gather_scatter_info *info)
3761 HOST_WIDE_INT scale = 1;
3762 poly_int64 pbitpos, pbitsize;
3763 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3764 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3765 tree offtype = NULL_TREE;
3766 tree decl = NULL_TREE, base, off;
3767 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3768 tree memory_type = TREE_TYPE (DR_REF (dr));
3769 machine_mode pmode;
3770 int punsignedp, reversep, pvolatilep = 0;
3771 internal_fn ifn;
3772 tree offset_vectype;
3773 bool masked_p = false;
3775 /* See whether this is already a call to a gather/scatter internal function.
3776 If not, see whether it's a masked load or store. */
3777 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3778 if (call && gimple_call_internal_p (call))
3780 ifn = gimple_call_internal_fn (call);
3781 if (internal_gather_scatter_fn_p (ifn))
3783 vect_describe_gather_scatter_call (stmt_info, info);
3784 return true;
3786 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3789 /* True if we should aim to use internal functions rather than
3790 built-in functions. */
3791 bool use_ifn_p = (DR_IS_READ (dr)
3792 ? supports_vec_gather_load_p ()
3793 : supports_vec_scatter_store_p ());
3795 base = DR_REF (dr);
3796 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3797 see if we can use the def stmt of the address. */
3798 if (masked_p
3799 && TREE_CODE (base) == MEM_REF
3800 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3801 && integer_zerop (TREE_OPERAND (base, 1))
3802 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3804 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3805 if (is_gimple_assign (def_stmt)
3806 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3807 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3810 /* The gather and scatter builtins need address of the form
3811 loop_invariant + vector * {1, 2, 4, 8}, or
3813 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3814 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3815 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3816 multiplications and additions in it. To get a vector, we need
3817 a single SSA_NAME that will be defined in the loop and will
3818 contain everything that is not loop invariant and that can be
3819 vectorized. The following code attempts to find such a preexisting
3820 SSA_NAME OFF and put the loop invariants into a tree BASE
3821 that can be gimplified before the loop. */
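   /* A hedged example of the peeling done below: if OFF comes out as
      ((sizetype) n_5) * 4, with n_5 an SSA_NAME defined in the loop, the
      MULT_EXPR case records SCALE = 4, and the remaining conversion is
      either kept (if the target already handles sizetype offsets) or
      looked through, leaving OFF = n_5 with OFFTYPE its narrower type.  */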
3822 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3823 &punsignedp, &reversep, &pvolatilep);
3824 if (reversep)
3825 return false;
3827 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3829 if (TREE_CODE (base) == MEM_REF)
3831 if (!integer_zerop (TREE_OPERAND (base, 1)))
3833 if (off == NULL_TREE)
3834 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3835 else
3836 off = size_binop (PLUS_EXPR, off,
3837 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3839 base = TREE_OPERAND (base, 0);
3841 else
3842 base = build_fold_addr_expr (base);
3844 if (off == NULL_TREE)
3845 off = size_zero_node;
3847 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3848 the constant offset in the loop-invariant BASE and continue with BASE
3849 as OFF, and otherwise we give up.
3850 We could handle that case by gimplifying the addition of base + off
3851 into some SSA_NAME and use that as off, but for now punt. */
3852 if (!expr_invariant_in_loop_p (loop, base))
3854 if (!integer_zerop (off))
3855 return false;
3856 off = base;
3857 base = size_int (pbytepos);
3859 /* Otherwise put base + constant offset into the loop invariant BASE
3860 and continue with OFF. */
3861 else
3863 base = fold_convert (sizetype, base);
3864 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3867 /* OFF at this point may be either a SSA_NAME or some tree expression
3868 from get_inner_reference. Try to peel off loop invariants from it
3869 into BASE as long as possible. */
3870 STRIP_NOPS (off);
3871 while (offtype == NULL_TREE)
3873 enum tree_code code;
3874 tree op0, op1, add = NULL_TREE;
3876 if (TREE_CODE (off) == SSA_NAME)
3878 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3880 if (expr_invariant_in_loop_p (loop, off))
3881 return false;
3883 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3884 break;
3886 op0 = gimple_assign_rhs1 (def_stmt);
3887 code = gimple_assign_rhs_code (def_stmt);
3888 op1 = gimple_assign_rhs2 (def_stmt);
3890 else
3892 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3893 return false;
3894 code = TREE_CODE (off);
3895 extract_ops_from_tree (off, &code, &op0, &op1);
3897 switch (code)
3899 case POINTER_PLUS_EXPR:
3900 case PLUS_EXPR:
3901 if (expr_invariant_in_loop_p (loop, op0))
3903 add = op0;
3904 off = op1;
3905 do_add:
3906 add = fold_convert (sizetype, add);
3907 if (scale != 1)
3908 add = size_binop (MULT_EXPR, add, size_int (scale));
3909 base = size_binop (PLUS_EXPR, base, add);
3910 continue;
3912 if (expr_invariant_in_loop_p (loop, op1))
3914 add = op1;
3915 off = op0;
3916 goto do_add;
3918 break;
3919 case MINUS_EXPR:
3920 if (expr_invariant_in_loop_p (loop, op1))
3922 add = fold_convert (sizetype, op1);
3923 add = size_binop (MINUS_EXPR, size_zero_node, add);
3924 off = op0;
3925 goto do_add;
3927 break;
3928 case MULT_EXPR:
3929 if (scale == 1 && tree_fits_shwi_p (op1))
3931 int new_scale = tree_to_shwi (op1);
3932 /* Only treat this as a scaling operation if the target
3933 supports it for at least some offset type. */
3934 if (use_ifn_p
3935 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3936 masked_p, vectype, memory_type,
3937 signed_char_type_node,
3938 new_scale, &ifn,
3939 &offset_vectype)
3940 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3941 masked_p, vectype, memory_type,
3942 unsigned_char_type_node,
3943 new_scale, &ifn,
3944 &offset_vectype))
3945 break;
3946 scale = new_scale;
3947 off = op0;
3948 continue;
3950 break;
3951 case SSA_NAME:
3952 off = op0;
3953 continue;
3954 CASE_CONVERT:
3955 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3956 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3957 break;
3959 /* Don't include the conversion if the target is happy with
3960 the current offset type. */
3961 if (use_ifn_p
3962 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3963 masked_p, vectype, memory_type,
3964 TREE_TYPE (off), scale, &ifn,
3965 &offset_vectype))
3966 break;
3968 if (TYPE_PRECISION (TREE_TYPE (op0))
3969 == TYPE_PRECISION (TREE_TYPE (off)))
3971 off = op0;
3972 continue;
3975 if (TYPE_PRECISION (TREE_TYPE (op0))
3976 < TYPE_PRECISION (TREE_TYPE (off)))
3978 off = op0;
3979 offtype = TREE_TYPE (off);
3980 STRIP_NOPS (off);
3981 continue;
3983 break;
3984 default:
3985 break;
3987 break;
3990 /* If at the end OFF still isn't a SSA_NAME or isn't
3991 defined in the loop, punt. */
3992 if (TREE_CODE (off) != SSA_NAME
3993 || expr_invariant_in_loop_p (loop, off))
3994 return false;
3996 if (offtype == NULL_TREE)
3997 offtype = TREE_TYPE (off);
3999 if (use_ifn_p)
4001 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4002 vectype, memory_type, offtype, scale,
4003 &ifn, &offset_vectype))
4004 return false;
4006 else
4008 if (DR_IS_READ (dr))
4010 if (targetm.vectorize.builtin_gather)
4011 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4013 else
4015 if (targetm.vectorize.builtin_scatter)
4016 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4019 if (!decl)
4020 return false;
4022 ifn = IFN_LAST;
4023 /* The offset vector type will be read from DECL when needed. */
4024 offset_vectype = NULL_TREE;
4027 info->ifn = ifn;
4028 info->decl = decl;
4029 info->base = base;
4030 info->offset = off;
4031 info->offset_dt = vect_unknown_def_type;
4032 info->offset_vectype = offset_vectype;
4033 info->scale = scale;
4034 info->element_type = TREE_TYPE (vectype);
4035 info->memory_type = memory_type;
4036 return true;
4039 /* Find the data references in STMT, analyze them with respect to LOOP and
4040 append them to DATAREFS. Return false if datarefs in this stmt cannot
4041 be handled. */
4043 opt_result
4044 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4045 vec<data_reference_p> *datarefs)
4047 /* We can ignore clobbers for dataref analysis - they are removed during
4048 loop vectorization and BB vectorization checks dependences with a
4049 stmt walk. */
4050 if (gimple_clobber_p (stmt))
4051 return opt_result::success ();
4053 if (gimple_has_volatile_ops (stmt))
4054 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4055 stmt);
4057 if (stmt_can_throw_internal (cfun, stmt))
4058 return opt_result::failure_at (stmt,
4059 "not vectorized:"
4060 " statement can throw an exception: %G",
4061 stmt);
4063 auto_vec<data_reference_p, 2> refs;
4064 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4065 if (!res)
4066 return res;
4068 if (refs.is_empty ())
4069 return opt_result::success ();
4071 if (refs.length () > 1)
4072 return opt_result::failure_at (stmt,
4073 "not vectorized:"
4074 " more than one data ref in stmt: %G", stmt);
4076 if (gcall *call = dyn_cast <gcall *> (stmt))
4077 if (!gimple_call_internal_p (call)
4078 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4079 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4080 return opt_result::failure_at (stmt,
4081 "not vectorized: dr in a call %G", stmt);
4083 data_reference_p dr = refs.pop ();
4084 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4085 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4086 return opt_result::failure_at (stmt,
4087 "not vectorized:"
4088 " statement is bitfield access %G", stmt);
4090 if (DR_BASE_ADDRESS (dr)
4091 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4092 return opt_result::failure_at (stmt,
4093 "not vectorized:"
4094 " base addr of dr is a constant\n");
4096 /* Check whether this may be a SIMD lane access and adjust the
4097 DR to make it easier for us to handle it. */
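   /* An assumed illustration of what a SIMD lane access looks like: in an
      OpenMP loop such as

	#pragma omp simd
	for (int i = 0; i < n; i++)
	  { int t = g (i); a[i] = t; }

      the privatized T is lowered to an element of a per-lane "omp simd
      array" indexed by .GOMP_SIMD_LANE (simduid); the code below recognizes
      that pattern and rewrites the data reference so it can be marked as a
      simd-lane access (G is just a placeholder call).  */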
4098 if (loop
4099 && loop->simduid
4100 && (!DR_BASE_ADDRESS (dr)
4101 || !DR_OFFSET (dr)
4102 || !DR_INIT (dr)
4103 || !DR_STEP (dr)))
4105 struct data_reference *newdr
4106 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4107 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4108 if (DR_BASE_ADDRESS (newdr)
4109 && DR_OFFSET (newdr)
4110 && DR_INIT (newdr)
4111 && DR_STEP (newdr)
4112 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4113 && integer_zerop (DR_STEP (newdr)))
4115 tree base_address = DR_BASE_ADDRESS (newdr);
4116 tree off = DR_OFFSET (newdr);
4117 tree step = ssize_int (1);
4118 if (integer_zerop (off)
4119 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4121 off = TREE_OPERAND (base_address, 1);
4122 base_address = TREE_OPERAND (base_address, 0);
4124 STRIP_NOPS (off);
4125 if (TREE_CODE (off) == MULT_EXPR
4126 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4128 step = TREE_OPERAND (off, 1);
4129 off = TREE_OPERAND (off, 0);
4130 STRIP_NOPS (off);
4132 if (CONVERT_EXPR_P (off)
4133 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4134 < TYPE_PRECISION (TREE_TYPE (off))))
4135 off = TREE_OPERAND (off, 0);
4136 if (TREE_CODE (off) == SSA_NAME)
4138 gimple *def = SSA_NAME_DEF_STMT (off);
4139 /* Look through widening conversion. */
4140 if (is_gimple_assign (def)
4141 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4143 tree rhs1 = gimple_assign_rhs1 (def);
4144 if (TREE_CODE (rhs1) == SSA_NAME
4145 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4146 && (TYPE_PRECISION (TREE_TYPE (off))
4147 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4148 def = SSA_NAME_DEF_STMT (rhs1);
4150 if (is_gimple_call (def)
4151 && gimple_call_internal_p (def)
4152 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4154 tree arg = gimple_call_arg (def, 0);
4155 tree reft = TREE_TYPE (DR_REF (newdr));
4156 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4157 arg = SSA_NAME_VAR (arg);
4158 if (arg == loop->simduid
4159 /* For now. */
4160 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4162 DR_BASE_ADDRESS (newdr) = base_address;
4163 DR_OFFSET (newdr) = ssize_int (0);
4164 DR_STEP (newdr) = step;
4165 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4166 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4167 /* Mark as simd-lane access. */
4168 tree arg2 = gimple_call_arg (def, 1);
4169 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4170 free_data_ref (dr);
4171 datarefs->safe_push (newdr);
4172 return opt_result::success ();
4177 free_data_ref (newdr);
4180 datarefs->safe_push (dr);
4181 return opt_result::success ();
4184 /* Function vect_analyze_data_refs.
4186 Find all the data references in the loop or basic block.
4188 The general structure of the analysis of data refs in the vectorizer is as
4189 follows:
4190 1- vect_analyze_data_refs(loop/bb): call
4191 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4192 in the loop/bb and their dependences.
4193 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4194 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4195 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4199 opt_result
4200 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4202 class loop *loop = NULL;
4203 unsigned int i;
4204 struct data_reference *dr;
4205 tree scalar_type;
4207 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4209 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4210 loop = LOOP_VINFO_LOOP (loop_vinfo);
4212 /* Go through the data-refs, check that the analysis succeeded. Update
4213 pointer from stmt_vec_info struct to DR and vectype. */
4215 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4216 FOR_EACH_VEC_ELT (datarefs, i, dr)
4218 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4219 poly_uint64 vf;
4221 gcc_assert (DR_REF (dr));
4222 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4223 gcc_assert (!stmt_info->dr_aux.dr);
4224 stmt_info->dr_aux.dr = dr;
4225 stmt_info->dr_aux.stmt = stmt_info;
4227 /* Check that analysis of the data-ref succeeded. */
4228 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4229 || !DR_STEP (dr))
4231 bool maybe_gather
4232 = DR_IS_READ (dr)
4233 && !TREE_THIS_VOLATILE (DR_REF (dr))
4234 && (targetm.vectorize.builtin_gather != NULL
4235 || supports_vec_gather_load_p ());
4236 bool maybe_scatter
4237 = DR_IS_WRITE (dr)
4238 && !TREE_THIS_VOLATILE (DR_REF (dr))
4239 && (targetm.vectorize.builtin_scatter != NULL
4240 || supports_vec_scatter_store_p ());
4242 /* If the target supports vector gather loads or scatter stores,
4243 see if they can be used. */
4244 if (is_a <loop_vec_info> (vinfo)
4245 && !nested_in_vect_loop_p (loop, stmt_info))
4247 if (maybe_gather || maybe_scatter)
4249 if (maybe_gather)
4250 gatherscatter = GATHER;
4251 else
4252 gatherscatter = SCATTER;
4256 if (gatherscatter == SG_NONE)
4258 if (dump_enabled_p ())
4259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4260 "not vectorized: data ref analysis "
4261 "failed %G", stmt_info->stmt);
4262 if (is_a <bb_vec_info> (vinfo))
4264 /* In BB vectorization the ref can still participate
4265 in dependence analysis, we just can't vectorize it. */
4266 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4267 continue;
4269 return opt_result::failure_at (stmt_info->stmt,
4270 "not vectorized:"
4271 " data ref analysis failed: %G",
4272 stmt_info->stmt);
4276 /* See if this was detected as SIMD lane access. */
4277 if (dr->aux == (void *)-1
4278 || dr->aux == (void *)-2
4279 || dr->aux == (void *)-3
4280 || dr->aux == (void *)-4)
4282 if (nested_in_vect_loop_p (loop, stmt_info))
4283 return opt_result::failure_at (stmt_info->stmt,
4284 "not vectorized:"
4285 " data ref analysis failed: %G",
4286 stmt_info->stmt);
4287 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4288 = -(uintptr_t) dr->aux;
4291 tree base = get_base_address (DR_REF (dr));
4292 if (base && VAR_P (base) && DECL_NONALIASED (base))
4294 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4296 "not vectorized: base object not addressable "
4297 "for stmt: %G", stmt_info->stmt);
4298 if (is_a <bb_vec_info> (vinfo))
4300 /* In BB vectorization the ref can still participate
4301 in dependence analysis, we just can't vectorize it. */
4302 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4303 continue;
4305 return opt_result::failure_at (stmt_info->stmt,
4306 "not vectorized: base object not"
4307 " addressable for stmt: %G",
4308 stmt_info->stmt);
4311 if (is_a <loop_vec_info> (vinfo)
4312 && DR_STEP (dr)
4313 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4315 if (nested_in_vect_loop_p (loop, stmt_info))
4316 return opt_result::failure_at (stmt_info->stmt,
4317 "not vectorized: "
4318 "not suitable for strided load %G",
4319 stmt_info->stmt);
4320 STMT_VINFO_STRIDED_P (stmt_info) = true;
4323 /* Update DR field in stmt_vec_info struct. */
4325 /* If the dataref is in an inner-loop of the loop that is considered
4326 for vectorization, we also want to analyze the access relative to
4327 the outer-loop (DR contains information only relative to the
4328 inner-most enclosing loop). We do that by building a reference to the
4329 first location accessed by the inner-loop, and analyze it relative to
4330 the outer-loop. */
4331 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4333 /* Build a reference to the first location accessed by the
4334 inner loop: *(BASE + INIT + OFFSET). By construction,
4335 this address must be invariant in the inner loop, so we
4336 can consider it as being used in the outer loop. */
4337 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4338 tree offset = unshare_expr (DR_OFFSET (dr));
4339 tree init = unshare_expr (DR_INIT (dr));
4340 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4341 init, offset);
4342 tree init_addr = fold_build_pointer_plus (base, init_offset);
4343 tree init_ref = build_fold_indirect_ref (init_addr);
4345 if (dump_enabled_p ())
4346 dump_printf_loc (MSG_NOTE, vect_location,
4347 "analyze in outer loop: %T\n", init_ref);
4349 opt_result res
4350 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4351 init_ref, loop, stmt_info->stmt);
4352 if (!res)
4353 /* dr_analyze_innermost already explained the failure. */
4354 return res;
4356 if (dump_enabled_p ())
4357 dump_printf_loc (MSG_NOTE, vect_location,
4358 "\touter base_address: %T\n"
4359 "\touter offset from base address: %T\n"
4360 "\touter constant offset from base address: %T\n"
4361 "\touter step: %T\n"
4362 "\touter base alignment: %d\n\n"
4363 "\touter base misalignment: %d\n"
4364 "\touter offset alignment: %d\n"
4365 "\touter step alignment: %d\n",
4366 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4367 STMT_VINFO_DR_OFFSET (stmt_info),
4368 STMT_VINFO_DR_INIT (stmt_info),
4369 STMT_VINFO_DR_STEP (stmt_info),
4370 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4371 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4372 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4373 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4376 /* Set vectype for STMT. */
4377 scalar_type = TREE_TYPE (DR_REF (dr));
4378 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4379 if (!vectype)
4381 if (dump_enabled_p ())
4383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4384 "not vectorized: no vectype for stmt: %G",
4385 stmt_info->stmt);
4386 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4387 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4388 scalar_type);
4389 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4392 if (is_a <bb_vec_info> (vinfo))
4394 /* No vector type is fine, the ref can still participate
4395 in dependence analysis, we just can't vectorize it. */
4396 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4397 continue;
4399 if (fatal)
4400 *fatal = false;
4401 return opt_result::failure_at (stmt_info->stmt,
4402 "not vectorized:"
4403 " no vectype for stmt: %G"
4404 " scalar_type: %T\n",
4405 stmt_info->stmt, scalar_type);
4407 else
4409 if (dump_enabled_p ())
4410 dump_printf_loc (MSG_NOTE, vect_location,
4411 "got vectype for stmt: %G%T\n",
4412 stmt_info->stmt, vectype);
4415 /* Adjust the minimal vectorization factor according to the
4416 vector type. */
4417 vf = TYPE_VECTOR_SUBPARTS (vectype);
4418 *min_vf = upper_bound (*min_vf, vf);
4420 /* Leave the BB vectorizer to pick the vector type later, based on
4421 the final dataref group size and SLP node size. */
4422 if (is_a <loop_vec_info> (vinfo))
4423 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4425 if (gatherscatter != SG_NONE)
4427 gather_scatter_info gs_info;
4428 if (!vect_check_gather_scatter (stmt_info,
4429 as_a <loop_vec_info> (vinfo),
4430 &gs_info)
4431 || !get_vectype_for_scalar_type (vinfo,
4432 TREE_TYPE (gs_info.offset)))
4434 if (fatal)
4435 *fatal = false;
4436 return opt_result::failure_at
4437 (stmt_info->stmt,
4438 (gatherscatter == GATHER)
4439 ? "not vectorized: not suitable for gather load %G"
4440 : "not vectorized: not suitable for scatter store %G",
4441 stmt_info->stmt);
4443 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4447 /* We used to stop processing and prune the list here. Verify we no
4448 longer need to. */
4449 gcc_assert (i == datarefs.length ());
4451 return opt_result::success ();
4455 /* Function vect_get_new_vect_var.
4457 Returns a name for a new variable. The current naming scheme prefixes
4458 vectorizer-generated variables with "vect_", "stmp_", "mask_" or "vectp_"
4459 (depending on the value of VAR_KIND), followed by NAME if it is
4460 provided. */
4462 tree
4463 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4465 const char *prefix;
4466 tree new_vect_var;
4468 switch (var_kind)
4470 case vect_simple_var:
4471 prefix = "vect";
4472 break;
4473 case vect_scalar_var:
4474 prefix = "stmp";
4475 break;
4476 case vect_mask_var:
4477 prefix = "mask";
4478 break;
4479 case vect_pointer_var:
4480 prefix = "vectp";
4481 break;
4482 default:
4483 gcc_unreachable ();
4486 if (name)
4488 char* tmp = concat (prefix, "_", name, NULL);
4489 new_vect_var = create_tmp_reg (type, tmp);
4490 free (tmp);
4492 else
4493 new_vect_var = create_tmp_reg (type, prefix);
4495 return new_vect_var;
4498 /* Like vect_get_new_vect_var but return an SSA name. */
4500 tree
4501 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4503 const char *prefix;
4504 tree new_vect_var;
4506 switch (var_kind)
4508 case vect_simple_var:
4509 prefix = "vect";
4510 break;
4511 case vect_scalar_var:
4512 prefix = "stmp";
4513 break;
4514 case vect_pointer_var:
4515 prefix = "vectp";
4516 break;
4517 default:
4518 gcc_unreachable ();
4521 if (name)
4523 char* tmp = concat (prefix, "_", name, NULL);
4524 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4525 free (tmp);
4527 else
4528 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4530 return new_vect_var;
4533 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4535 static void
4536 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4538 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4539 int misalign = DR_MISALIGNMENT (dr_info);
4540 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4541 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4542 else
4543 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4544 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4545 misalign);
4548 /* Function vect_create_addr_base_for_vector_ref.
4550 Create an expression that computes the address of the first memory location
4551 that will be accessed for a data reference.
4553 Input:
4554 STMT_INFO: The statement containing the data reference.
4555 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4556 OFFSET: Optional. If supplied, it is added to the initial address.
4557 LOOP: Specifies relative to which loop-nest the address should be computed.
4558 For example, when the dataref is in an inner-loop nested in an
4559 outer-loop that is now being vectorized, LOOP can be either the
4560 outer-loop, or the inner-loop. The first memory location accessed
4561 by the following dataref ('in' points to short):
4563 for (i=0; i<N; i++)
4564 for (j=0; j<M; j++)
4565 s += in[i+j]
4567 is as follows:
4568 if LOOP=i_loop: &in (relative to i_loop)
4569 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4570 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4571 initial address. Unlike OFFSET, which is a number of elements to
4572 be added, BYTE_OFFSET is measured in bytes.
4574 Output:
4575 1. Return an SSA_NAME whose value is the address of the memory location of
4576 the first vector of the data reference.
4577 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4578 these statement(s) which define the returned SSA_NAME.
4580 FORNOW: We are only handling array accesses with step 1. */
4582 tree
4583 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4584 gimple_seq *new_stmt_list,
4585 tree offset,
4586 tree byte_offset)
4588 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4589 struct data_reference *dr = dr_info->dr;
4590 const char *base_name;
4591 tree addr_base;
4592 tree dest;
4593 gimple_seq seq = NULL;
4594 tree vect_ptr_type;
4595 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4596 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4597 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4599 tree data_ref_base = unshare_expr (drb->base_address);
4600 tree base_offset = unshare_expr (get_dr_vinfo_offset (dr_info, true));
4601 tree init = unshare_expr (drb->init);
4603 if (loop_vinfo)
4604 base_name = get_name (data_ref_base);
4605 else
4607 base_offset = ssize_int (0);
4608 init = ssize_int (0);
4609 base_name = get_name (DR_REF (dr));
4612 /* Create base_offset */
4613 base_offset = size_binop (PLUS_EXPR,
4614 fold_convert (sizetype, base_offset),
4615 fold_convert (sizetype, init));
4617 if (offset)
4619 offset = fold_build2 (MULT_EXPR, sizetype,
4620 fold_convert (sizetype, offset), step);
4621 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4622 base_offset, offset);
4624 if (byte_offset)
4626 byte_offset = fold_convert (sizetype, byte_offset);
4627 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4628 base_offset, byte_offset);
4631 /* base + base_offset */
4632 if (loop_vinfo)
4633 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4634 else
4636 addr_base = build1 (ADDR_EXPR,
4637 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4638 unshare_expr (DR_REF (dr)));
4641 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4642 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4643 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4644 gimple_seq_add_seq (new_stmt_list, seq);
4646 if (DR_PTR_INFO (dr)
4647 && TREE_CODE (addr_base) == SSA_NAME
4648 && !SSA_NAME_PTR_INFO (addr_base))
4650 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4651 if (offset || byte_offset)
4652 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4655 if (dump_enabled_p ())
4656 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4658 return addr_base;
4662 /* Function vect_create_data_ref_ptr.
4664 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4665 location accessed in the loop by STMT_INFO, along with the def-use update
4666 chain to appropriately advance the pointer through the loop iterations.
4667 Also set aliasing information for the pointer. This pointer is used by
4668 the callers to this function to create a memory reference expression for
4669 vector load/store access.
4671 Input:
4672 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4673 GIMPLE_ASSIGN <name, data-ref> or
4674 GIMPLE_ASSIGN <data-ref, name>.
4675 2. AGGR_TYPE: the type of the reference, which should be either a vector
4676 or an array.
4677 3. AT_LOOP: the loop where the vector memref is to be created.
4678 4. OFFSET (optional): an offset to be added to the initial address accessed
4679 by the data-ref in STMT_INFO.
4680 5. BSI: location where the new stmts are to be placed if there is no loop
4681 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4682 pointing to the initial address.
4683 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4684 to the initial address accessed by the data-ref in STMT_INFO. This is
4685 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4686 in bytes.
4687 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4688 to the IV during each iteration of the loop. NULL says to move
4689 by one copy of AGGR_TYPE up or down, depending on the step of the
4690 data reference.
4692 Output:
4693 1. Declare a new ptr to vector_type, and have it point to the base of the
4694 data reference (the initial address accessed by the data reference).
4695 For example, for vector of type V8HI, the following code is generated:
4697 v8hi *ap;
4698 ap = (v8hi *)initial_address;
4700 if OFFSET is not supplied:
4701 initial_address = &a[init];
4702 if OFFSET is supplied:
4703 initial_address = &a[init + OFFSET];
4704 if BYTE_OFFSET is supplied:
4705 initial_address = &a[init] + BYTE_OFFSET;
4707 Return the initial_address in INITIAL_ADDRESS.
4709 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4710 update the pointer in each iteration of the loop.
4712 Return the increment stmt that updates the pointer in PTR_INCR.
4714 3. Return the pointer. */
4716 tree
4717 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4718 class loop *at_loop, tree offset,
4719 tree *initial_address, gimple_stmt_iterator *gsi,
4720 gimple **ptr_incr, bool only_init,
4721 tree byte_offset, tree iv_step)
4723 const char *base_name;
4724 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4725 class loop *loop = NULL;
4726 bool nested_in_vect_loop = false;
4727 class loop *containing_loop = NULL;
4728 tree aggr_ptr_type;
4729 tree aggr_ptr;
4730 tree new_temp;
4731 gimple_seq new_stmt_list = NULL;
4732 edge pe = NULL;
4733 basic_block new_bb;
4734 tree aggr_ptr_init;
4735 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4736 struct data_reference *dr = dr_info->dr;
4737 tree aptr;
4738 gimple_stmt_iterator incr_gsi;
4739 bool insert_after;
4740 tree indx_before_incr, indx_after_incr;
4741 gimple *incr;
4742 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4744 gcc_assert (iv_step != NULL_TREE
4745 || TREE_CODE (aggr_type) == ARRAY_TYPE
4746 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4748 if (loop_vinfo)
4750 loop = LOOP_VINFO_LOOP (loop_vinfo);
4751 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4752 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4753 pe = loop_preheader_edge (loop);
4755 else
4757 gcc_assert (bb_vinfo);
4758 only_init = true;
4759 *ptr_incr = NULL;
4762 /* Create an expression for the first address accessed by this load
4763 in LOOP. */
4764 base_name = get_name (DR_BASE_ADDRESS (dr));
4766 if (dump_enabled_p ())
4768 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4769 dump_printf_loc (MSG_NOTE, vect_location,
4770 "create %s-pointer variable to type: %T",
4771 get_tree_code_name (TREE_CODE (aggr_type)),
4772 aggr_type);
4773 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4774 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4775 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4776 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4777 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4778 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4779 else
4780 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4781 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4784 /* (1) Create the new aggregate-pointer variable.
4785 Vector and array types inherit the alias set of their component
4786 type by default so we need to use a ref-all pointer if the data
4787 reference does not conflict with the created aggregated data
4788 reference because it is not addressable. */
4789 bool need_ref_all = false;
4790 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4791 get_alias_set (DR_REF (dr))))
4792 need_ref_all = true;
4793 /* Likewise for any of the data references in the stmt group. */
4794 else if (DR_GROUP_SIZE (stmt_info) > 1)
4796 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4799 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4800 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4801 get_alias_set (DR_REF (sdr))))
4803 need_ref_all = true;
4804 break;
4806 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4808 while (sinfo);
4810 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4811 need_ref_all);
4812 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4815 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4816 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4817 def-use update cycles for the pointer: one relative to the outer-loop
4818 (LOOP), which is what steps (3) and (4) below do. The other is relative
4819 to the inner-loop (which is the inner-most loop containing the dataref),
4820 and this is done by step (5) below.
4822 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4823 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4824 redundant. Steps (3),(4) create the following:
4826 vp0 = &base_addr;
4827 LOOP: vp1 = phi(vp0,vp2)
4830 vp2 = vp1 + step
4831 goto LOOP
4833 If there is an inner-loop nested in loop, then step (5) will also be
4834 applied, and an additional update in the inner-loop will be created:
4836 vp0 = &base_addr;
4837 LOOP: vp1 = phi(vp0,vp2)
4839 inner: vp3 = phi(vp1,vp4)
4840 vp4 = vp3 + inner_step
4841 if () goto inner
4843 vp2 = vp1 + step
4844 if () goto LOOP */
4846 /* (2) Calculate the initial address of the aggregate-pointer, and set
4847 the aggregate-pointer to point to it before the loop. */
4849 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4851 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4852 offset, byte_offset);
4853 if (new_stmt_list)
4855 if (pe)
4857 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4858 gcc_assert (!new_bb);
4860 else
4861 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4864 *initial_address = new_temp;
4865 aggr_ptr_init = new_temp;
4867 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4868 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4869 inner-loop nested in LOOP (during outer-loop vectorization). */
4871 /* No update in loop is required. */
4872 if (only_init && (!loop_vinfo || at_loop == loop))
4873 aptr = aggr_ptr_init;
4874 else
4876 /* Accesses to invariant addresses should be handled specially
4877 by the caller. */
4878 tree step = vect_dr_behavior (dr_info)->step;
4879 gcc_assert (!integer_zerop (step));
4881 if (iv_step == NULL_TREE)
4883 /* The step of the aggregate pointer is the type size,
4884 negated for downward accesses. */
4885 iv_step = TYPE_SIZE_UNIT (aggr_type);
4886 if (tree_int_cst_sgn (step) == -1)
4887 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4890 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4892 create_iv (aggr_ptr_init,
4893 fold_convert (aggr_ptr_type, iv_step),
4894 aggr_ptr, loop, &incr_gsi, insert_after,
4895 &indx_before_incr, &indx_after_incr);
4896 incr = gsi_stmt (incr_gsi);
4897 loop_vinfo->add_stmt (incr);
4899 /* Copy the points-to information if it exists. */
4900 if (DR_PTR_INFO (dr))
4902 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4903 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4905 if (ptr_incr)
4906 *ptr_incr = incr;
4908 aptr = indx_before_incr;
4911 if (!nested_in_vect_loop || only_init)
4912 return aptr;
4915 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4916 nested in LOOP, if it exists. */
4918 gcc_assert (nested_in_vect_loop);
4919 if (!only_init)
4921 standard_iv_increment_position (containing_loop, &incr_gsi,
4922 &insert_after);
4923 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4924 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4925 &indx_after_incr);
4926 incr = gsi_stmt (incr_gsi);
4927 loop_vinfo->add_stmt (incr);
4929 /* Copy the points-to information if it exists. */
4930 if (DR_PTR_INFO (dr))
4932 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4933 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4935 if (ptr_incr)
4936 *ptr_incr = incr;
4938 return indx_before_incr;
4940 else
4941 gcc_unreachable ();
4945 /* Function bump_vector_ptr
4947 Increment a pointer (to a vector type) by vector-size. If requested,
4948 i.e. if PTR-INCR is given, then also connect the new increment stmt
4949 to the existing def-use update-chain of the pointer, by modifying
4950 the PTR_INCR as illustrated below:
4952 The pointer def-use update-chain before this function:
4953 DATAREF_PTR = phi (p_0, p_2)
4954 ....
4955 PTR_INCR: p_2 = DATAREF_PTR + step
4957 The pointer def-use update-chain after this function:
4958 DATAREF_PTR = phi (p_0, p_2)
4959 ....
4960 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4961 ....
4962 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4964 Input:
4965 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4966 in the loop.
4967 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4968 the loop. The increment amount across iterations is expected
4969 to be vector_size.
4970 BSI - location where the new update stmt is to be placed.
4971 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4972 BUMP - optional. The offset by which to bump the pointer. If not given,
4973 the offset is assumed to be vector_size.
4975 Output: Return NEW_DATAREF_PTR as illustrated above.
4979 tree
4980 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4981 stmt_vec_info stmt_info, tree bump)
4983 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4984 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4985 tree update = TYPE_SIZE_UNIT (vectype);
4986 gassign *incr_stmt;
4987 ssa_op_iter iter;
4988 use_operand_p use_p;
4989 tree new_dataref_ptr;
4991 if (bump)
4992 update = bump;
4994 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4995 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4996 else
4997 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4998 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4999 dataref_ptr, update);
5000 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
5002 /* Copy the points-to information if it exists. */
5003 if (DR_PTR_INFO (dr))
5005 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5006 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5009 if (!ptr_incr)
5010 return new_dataref_ptr;
5012 /* Update the vector-pointer's cross-iteration increment. */
5013 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5015 tree use = USE_FROM_PTR (use_p);
5017 if (use == dataref_ptr)
5018 SET_USE (use_p, new_dataref_ptr);
5019 else
5020 gcc_assert (operand_equal_p (use, update, 0));
5023 return new_dataref_ptr;
5027 /* Copy memory reference info such as base/clique from the SRC reference
5028 to the DEST MEM_REF. */
5030 void
5031 vect_copy_ref_info (tree dest, tree src)
5033 if (TREE_CODE (dest) != MEM_REF)
5034 return;
5036 tree src_base = src;
5037 while (handled_component_p (src_base))
5038 src_base = TREE_OPERAND (src_base, 0);
5039 if (TREE_CODE (src_base) != MEM_REF
5040 && TREE_CODE (src_base) != TARGET_MEM_REF)
5041 return;
5043 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5044 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5048 /* Function vect_create_destination_var.
5050 Create a new temporary of type VECTYPE. */
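/* For example (naming only, as an informal sketch): for a scalar
   destination a_5 of type int and VECTYPE V4SI, the new temporary is a
   V4SI variable named along the lines of "vect_a_5"; with a NULL VECTYPE
   a scalar temporary named like "stmp_a_5" is created instead.  */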
5052 tree
5053 vect_create_destination_var (tree scalar_dest, tree vectype)
5055 tree vec_dest;
5056 const char *name;
5057 char *new_name;
5058 tree type;
5059 enum vect_var_kind kind;
5061 kind = vectype
5062 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5063 ? vect_mask_var
5064 : vect_simple_var
5065 : vect_scalar_var;
5066 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5068 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5070 name = get_name (scalar_dest);
5071 if (name)
5072 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5073 else
5074 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5075 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5076 free (new_name);
5078 return vec_dest;
5081 /* Function vect_grouped_store_supported.
5083 Returns TRUE if interleave high and interleave low permutations
5084 are supported, and FALSE otherwise. */
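/* As an illustrative shape of the COUNT == 3 case: storing three vectors
   A = {a0,a1,a2,a3}, B = {b0,b1,b2,b3} and C = {c0,c1,c2,c3} interleaved
   in memory requires producing

     { a0, b0, c0, a1 }  { b1, c1, a2, b2 }  { c2, a3, b3, c3 }

   which vect_permute_store_chain does with pairs of VEC_PERM_EXPRs; the
   checks below verify that the target can perform those permutations (and
   the interleave_high/low permutations for the power-of-two case).  */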
5086 bool
5087 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5089 machine_mode mode = TYPE_MODE (vectype);
5091 /* vect_permute_store_chain requires the group size to be equal to 3 or
5092 be a power of two. */
5093 if (count != 3 && exact_log2 (count) == -1)
5095 if (dump_enabled_p ())
5096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097 "the size of the group of accesses"
5098 " is not a power of 2 or not equal to 3\n");
5099 return false;
5102 /* Check that the permutation is supported. */
5103 if (VECTOR_MODE_P (mode))
5105 unsigned int i;
5106 if (count == 3)
5108 unsigned int j0 = 0, j1 = 0, j2 = 0;
5109 unsigned int i, j;
5111 unsigned int nelt;
5112 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5114 if (dump_enabled_p ())
5115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5116 "cannot handle groups of 3 stores for"
5117 " variable-length vectors\n");
5118 return false;
5121 vec_perm_builder sel (nelt, nelt, 1);
5122 sel.quick_grow (nelt);
5123 vec_perm_indices indices;
5124 for (j = 0; j < 3; j++)
5126 int nelt0 = ((3 - j) * nelt) % 3;
5127 int nelt1 = ((3 - j) * nelt + 1) % 3;
5128 int nelt2 = ((3 - j) * nelt + 2) % 3;
5129 for (i = 0; i < nelt; i++)
5131 if (3 * i + nelt0 < nelt)
5132 sel[3 * i + nelt0] = j0++;
5133 if (3 * i + nelt1 < nelt)
5134 sel[3 * i + nelt1] = nelt + j1++;
5135 if (3 * i + nelt2 < nelt)
5136 sel[3 * i + nelt2] = 0;
5138 indices.new_vector (sel, 2, nelt);
5139 if (!can_vec_perm_const_p (mode, indices))
5141 if (dump_enabled_p ())
5142 dump_printf (MSG_MISSED_OPTIMIZATION,
5143 "permutation op not supported by target.\n");
5144 return false;
5147 for (i = 0; i < nelt; i++)
5149 if (3 * i + nelt0 < nelt)
5150 sel[3 * i + nelt0] = 3 * i + nelt0;
5151 if (3 * i + nelt1 < nelt)
5152 sel[3 * i + nelt1] = 3 * i + nelt1;
5153 if (3 * i + nelt2 < nelt)
5154 sel[3 * i + nelt2] = nelt + j2++;
5156 indices.new_vector (sel, 2, nelt);
5157 if (!can_vec_perm_const_p (mode, indices))
5159 if (dump_enabled_p ())
5160 dump_printf (MSG_MISSED_OPTIMIZATION,
5161 "permutation op not supported by target.\n");
5162 return false;
5165 return true;
5167 else
5169 /* If length is not equal to 3 then only a power of 2 is supported. */
5170 gcc_assert (pow2p_hwi (count));
5171 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5173 /* The encoding has 2 interleaved stepped patterns. */
5174 vec_perm_builder sel (nelt, 2, 3);
5175 sel.quick_grow (6);
5176 for (i = 0; i < 3; i++)
5178 sel[i * 2] = i;
5179 sel[i * 2 + 1] = i + nelt;
5181 vec_perm_indices indices (sel, 2, nelt);
5182 if (can_vec_perm_const_p (mode, indices))
5184 for (i = 0; i < 6; i++)
5185 sel[i] += exact_div (nelt, 2);
5186 indices.new_vector (sel, 2, nelt);
5187 if (can_vec_perm_const_p (mode, indices))
5188 return true;
5193 if (dump_enabled_p ())
5194 dump_printf (MSG_MISSED_OPTIMIZATION,
5195 "permutation op not supported by target.\n");
5196 return false;
5200 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5201 type VECTYPE. MASKED_P says whether the masked form is needed. */
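/* For instance (a target-specific assumption, mentioned only for
   illustration): on AArch64 the vec_store_lanes optab maps to the
   ST2/ST3/ST4 structure-store instructions, so COUNT values of 2, 3 and 4
   would typically be supported there for suitable vector modes.  */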
5203 bool
5204 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5205 bool masked_p)
5207 if (masked_p)
5208 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5209 vec_mask_store_lanes_optab,
5210 vectype, count);
5211 else
5212 return vect_lanes_optab_supported_p ("vec_store_lanes",
5213 vec_store_lanes_optab,
5214 vectype, count);
5218 /* Function vect_permute_store_chain.
5220 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5221 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5222 the data correctly for the stores. Return the final references for stores
5223 in RESULT_CHAIN.
5225 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5226 The input is 4 vectors each containing 8 elements. We assign a number to
5227 each element, the input sequence is:
5229 1st vec: 0 1 2 3 4 5 6 7
5230 2nd vec: 8 9 10 11 12 13 14 15
5231 3rd vec: 16 17 18 19 20 21 22 23
5232 4th vec: 24 25 26 27 28 29 30 31
5234 The output sequence should be:
5236 1st vec: 0 8 16 24 1 9 17 25
5237 2nd vec: 2 10 18 26 3 11 19 27
5238 3rd vec: 4 12 20 28 5 13 21 29
5239 4th vec: 6 14 22 30 7 15 23 31
5241 i.e., we interleave the contents of the four vectors in their order.
5243 We use interleave_high/low instructions to create such output. The input of
5244 each interleave_high/low operation is two vectors:
5245 1st vec 2nd vec
5246 0 1 2 3 4 5 6 7
5247 the even elements of the result vector are obtained left-to-right from the
5248 high/low elements of the first vector. The odd elements of the result are
5249 obtained left-to-right from the high/low elements of the second vector.
5250 The output of interleave_high will be: 0 4 1 5
5251 and of interleave_low: 2 6 3 7
5254 The permutation is done in log LENGTH stages. In each stage interleave_high
5255 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5256 where the first argument is taken from the first half of DR_CHAIN and the
5257 second argument from its second half.
5258 In our example,
5260 I1: interleave_high (1st vec, 3rd vec)
5261 I2: interleave_low (1st vec, 3rd vec)
5262 I3: interleave_high (2nd vec, 4th vec)
5263 I4: interleave_low (2nd vec, 4th vec)
5265 The output for the first stage is:
5267 I1: 0 16 1 17 2 18 3 19
5268 I2: 4 20 5 21 6 22 7 23
5269 I3: 8 24 9 25 10 26 11 27
5270 I4: 12 28 13 29 14 30 15 31
5272 The output of the second stage, i.e. the final result is:
5274 I1: 0 8 16 24 1 9 17 25
5275 I2: 2 10 18 26 3 11 19 27
5276 I3: 4 12 20 28 5 13 21 29
5277 I4: 6 14 22 30 7 15 23 31. */
5279 void
5280 vect_permute_store_chain (vec<tree> dr_chain,
5281 unsigned int length,
5282 stmt_vec_info stmt_info,
5283 gimple_stmt_iterator *gsi,
5284 vec<tree> *result_chain)
5286 tree vect1, vect2, high, low;
5287 gimple *perm_stmt;
5288 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5289 tree perm_mask_low, perm_mask_high;
5290 tree data_ref;
5291 tree perm3_mask_low, perm3_mask_high;
5292 unsigned int i, j, n, log_length = exact_log2 (length);
5294 result_chain->quick_grow (length);
5295 memcpy (result_chain->address (), dr_chain.address (),
5296 length * sizeof (tree));
5298 if (length == 3)
5300 /* vect_grouped_store_supported ensures that this is constant. */
5301 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5302 unsigned int j0 = 0, j1 = 0, j2 = 0;
5304 vec_perm_builder sel (nelt, nelt, 1);
5305 sel.quick_grow (nelt);
5306 vec_perm_indices indices;
5307 for (j = 0; j < 3; j++)
5309 int nelt0 = ((3 - j) * nelt) % 3;
5310 int nelt1 = ((3 - j) * nelt + 1) % 3;
5311 int nelt2 = ((3 - j) * nelt + 2) % 3;
5313 for (i = 0; i < nelt; i++)
5315 if (3 * i + nelt0 < nelt)
5316 sel[3 * i + nelt0] = j0++;
5317 if (3 * i + nelt1 < nelt)
5318 sel[3 * i + nelt1] = nelt + j1++;
5319 if (3 * i + nelt2 < nelt)
5320 sel[3 * i + nelt2] = 0;
5322 indices.new_vector (sel, 2, nelt);
5323 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5325 for (i = 0; i < nelt; i++)
5327 if (3 * i + nelt0 < nelt)
5328 sel[3 * i + nelt0] = 3 * i + nelt0;
5329 if (3 * i + nelt1 < nelt)
5330 sel[3 * i + nelt1] = 3 * i + nelt1;
5331 if (3 * i + nelt2 < nelt)
5332 sel[3 * i + nelt2] = nelt + j2++;
5334 indices.new_vector (sel, 2, nelt);
5335 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5337 vect1 = dr_chain[0];
5338 vect2 = dr_chain[1];
5340 /* Create interleaving stmt:
5341 low = VEC_PERM_EXPR <vect1, vect2,
5342 {j, nelt, *, j + 1, nelt + j + 1, *,
5343 j + 2, nelt + j + 2, *, ...}> */
5344 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5345 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5346 vect2, perm3_mask_low);
5347 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5349 vect1 = data_ref;
5350 vect2 = dr_chain[2];
5351 /* Create interleaving stmt:
5352 low = VEC_PERM_EXPR <vect1, vect2,
5353 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5354 6, 7, nelt + j + 2, ...}> */
5355 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5356 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5357 vect2, perm3_mask_high);
5358 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5359 (*result_chain)[j] = data_ref;
5362 else
5364 /* If length is not equal to 3 then only power of 2 is supported. */
5365 gcc_assert (pow2p_hwi (length));
5367 /* The encoding has 2 interleaved stepped patterns. */
5368 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5369 vec_perm_builder sel (nelt, 2, 3);
5370 sel.quick_grow (6);
5371 for (i = 0; i < 3; i++)
5373 sel[i * 2] = i;
5374 sel[i * 2 + 1] = i + nelt;
5376 vec_perm_indices indices (sel, 2, nelt);
5377 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5379 for (i = 0; i < 6; i++)
5380 sel[i] += exact_div (nelt, 2);
5381 indices.new_vector (sel, 2, nelt);
5382 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5384 for (i = 0, n = log_length; i < n; i++)
5386 for (j = 0; j < length/2; j++)
5388 vect1 = dr_chain[j];
5389 vect2 = dr_chain[j+length/2];
5391 /* Create interleaving stmt:
5392 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5393 ...}> */
5394 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5395 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5396 vect2, perm_mask_high);
5397 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5398 (*result_chain)[2*j] = high;
5400 /* Create interleaving stmt:
5401 low = VEC_PERM_EXPR <vect1, vect2,
5402 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5403 ...}> */
5404 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5405 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5406 vect2, perm_mask_low);
5407 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5408 (*result_chain)[2*j+1] = low;
5410 memcpy (dr_chain.address (), result_chain->address (),
5411 length * sizeof (tree));
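/* A minimal standalone sketch (not part of GCC; the name
   example_interleave_store_chain_4x8 is made up for illustration): the two
   interleave stages that vect_permute_store_chain performs for the worked
   example in its leading comment (LENGTH == 4, NELT == 8).  CHAIN initially
   holds the values 0..31 row by row; afterwards chain[0] is
   {0 8 16 24 1 9 17 25} and so on.  */

static void
example_interleave_store_chain_4x8 (unsigned int chain[4][8])
{
  unsigned int out[4][8];
  for (unsigned int stage = 0; stage < 2; stage++)   /* log2 (4) stages.  */
    {
      for (unsigned int j = 0; j < 2; j++)
        for (unsigned int i = 0; i < 4; i++)
          {
            /* interleave_high of chain[j] and chain[j + 2].  */
            out[2 * j][2 * i] = chain[j][i];
            out[2 * j][2 * i + 1] = chain[j + 2][i];
            /* interleave_low of the same pair.  */
            out[2 * j + 1][2 * i] = chain[j][4 + i];
            out[2 * j + 1][2 * i + 1] = chain[j + 2][4 + i];
          }
      memcpy (chain, out, sizeof (out));
    }
}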
5416 /* Function vect_setup_realignment
5418 This function is called when vectorizing an unaligned load using
5419 the dr_explicit_realign[_optimized] scheme.
5420 This function generates the following code at the loop prolog:
5422 p = initial_addr;
5423 x msq_init = *(floor(p)); # prolog load
5424 realignment_token = call target_builtin;
5425 loop:
5426 x msq = phi (msq_init, ---)
5428 The stmts marked with x are generated only for the case of
5429 dr_explicit_realign_optimized.
5431 The code above sets up a new (vector) pointer, pointing to the first
5432 location accessed by STMT_INFO, and a "floor-aligned" load using that
5433 pointer. It also generates code to compute the "realignment-token"
5434 (if the relevant target hook was defined), and creates a phi-node at the
5435 loop-header bb whose arguments are the result of the prolog-load (created
5436 by this function) and the result of a load that takes place in the loop
5437 (to be created by the caller to this function).
5439 For the case of dr_explicit_realign_optimized:
5440 The caller to this function uses the phi-result (msq) to create the
5441 realignment code inside the loop, and sets up the missing phi argument,
5442 as follows:
5443 loop:
5444 msq = phi (msq_init, lsq)
5445 lsq = *(floor(p')); # load in loop
5446 result = realign_load (msq, lsq, realignment_token);
5448 For the case of dr_explicit_realign:
5449 loop:
5450 msq = *(floor(p)); # load in loop
5451 p' = p + (VS-1);
5452 lsq = *(floor(p')); # load in loop
5453 result = realign_load (msq, lsq, realignment_token);
5455 Input:
5456 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5457 a memory location that may be unaligned.
5458 BSI - place where new code is to be inserted.
5459 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5460 is used.
5462 Output:
5463 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5464 target hook, if defined.
5465 Return value - the result of the loop-header phi node. */
5467 tree
5468 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5469 tree *realignment_token,
5470 enum dr_alignment_support alignment_support_scheme,
5471 tree init_addr,
5472 class loop **at_loop)
5474 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5475 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5476 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5477 struct data_reference *dr = dr_info->dr;
5478 class loop *loop = NULL;
5479 edge pe = NULL;
5480 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5481 tree vec_dest;
5482 gimple *inc;
5483 tree ptr;
5484 tree data_ref;
5485 basic_block new_bb;
5486 tree msq_init = NULL_TREE;
5487 tree new_temp;
5488 gphi *phi_stmt;
5489 tree msq = NULL_TREE;
5490 gimple_seq stmts = NULL;
5491 bool compute_in_loop = false;
5492 bool nested_in_vect_loop = false;
5493 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5494 class loop *loop_for_initial_load = NULL;
5496 if (loop_vinfo)
5498 loop = LOOP_VINFO_LOOP (loop_vinfo);
5499 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5502 gcc_assert (alignment_support_scheme == dr_explicit_realign
5503 || alignment_support_scheme == dr_explicit_realign_optimized);
5505 /* We need to generate three things:
5506 1. the misalignment computation
5507 2. the extra vector load (for the optimized realignment scheme).
5508 3. the phi node for the two vectors from which the realignment is
5509 done (for the optimized realignment scheme). */
5511 /* 1. Determine where to generate the misalignment computation.
5513 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5514 calculation will be generated by this function, outside the loop (in the
5515 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5516 caller, inside the loop.
5518 Background: If the misalignment remains fixed throughout the iterations of
5519 the loop, then both realignment schemes are applicable, and also the
5520 misalignment computation can be done outside LOOP. This is because we are
5521 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5522 are a multiple of VS (the Vector Size), and therefore the misalignment in
5523 different vectorized LOOP iterations is always the same.
5524 The problem arises only if the memory access is in an inner-loop nested
5525 inside LOOP, which is now being vectorized using outer-loop vectorization.
5526 This is the only case when the misalignment of the memory access may not
5527 remain fixed throughout the iterations of the inner-loop (as explained in
5528 detail in vect_supportable_dr_alignment). In this case, not only is the
5529 optimized realignment scheme not applicable, but also the misalignment
5530 computation (and generation of the realignment token that is passed to
5531 REALIGN_LOAD) have to be done inside the loop.
5533 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5534 or not, which in turn determines if the misalignment is computed inside
5535 the inner-loop, or outside LOOP. */
5537 if (init_addr != NULL_TREE || !loop_vinfo)
5539 compute_in_loop = true;
5540 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5544 /* 2. Determine where to generate the extra vector load.
5546 For the optimized realignment scheme, instead of generating two vector
5547 loads in each iteration, we generate a single extra vector load in the
5548 preheader of the loop, and in each iteration reuse the result of the
5549 vector load from the previous iteration. In case the memory access is in
5550 an inner-loop nested inside LOOP, which is now being vectorized using
5551 outer-loop vectorization, we need to determine whether this initial vector
5552 load should be generated at the preheader of the inner-loop, or can be
5553 generated at the preheader of LOOP. If the memory access has no evolution
5554 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5555 to be generated inside LOOP (in the preheader of the inner-loop). */
5557 if (nested_in_vect_loop)
5559 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5560 bool invariant_in_outerloop =
5561 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5562 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5564 else
5565 loop_for_initial_load = loop;
5566 if (at_loop)
5567 *at_loop = loop_for_initial_load;
5569 if (loop_for_initial_load)
5570 pe = loop_preheader_edge (loop_for_initial_load);
5572 /* 3. For the case of the optimized realignment, create the first vector
5573 load at the loop preheader. */
5575 if (alignment_support_scheme == dr_explicit_realign_optimized)
5577 /* Create msq_init = *(floor(p1)) in the loop preheader */
5578 gassign *new_stmt;
5580 gcc_assert (!compute_in_loop);
5581 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5582 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5583 loop_for_initial_load, NULL_TREE,
5584 &init_addr, NULL, &inc, true);
5585 if (TREE_CODE (ptr) == SSA_NAME)
5586 new_temp = copy_ssa_name (ptr);
5587 else
5588 new_temp = make_ssa_name (TREE_TYPE (ptr));
5589 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5590 tree type = TREE_TYPE (ptr);
5591 new_stmt = gimple_build_assign
5592 (new_temp, BIT_AND_EXPR, ptr,
5593 fold_build2 (MINUS_EXPR, type,
5594 build_int_cst (type, 0),
5595 build_int_cst (type, align)));
5596 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5597 gcc_assert (!new_bb);
5598 data_ref
5599 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5600 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5601 vect_copy_ref_info (data_ref, DR_REF (dr));
5602 new_stmt = gimple_build_assign (vec_dest, data_ref);
5603 new_temp = make_ssa_name (vec_dest, new_stmt);
5604 gimple_assign_set_lhs (new_stmt, new_temp);
5605 if (pe)
5607 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5608 gcc_assert (!new_bb);
5610 else
5611 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5613 msq_init = gimple_assign_lhs (new_stmt);
5616 /* 4. Create realignment token using a target builtin, if available.
5617 It is done either inside the containing loop, or before LOOP (as
5618 determined above). */
5620 if (targetm.vectorize.builtin_mask_for_load)
5622 gcall *new_stmt;
5623 tree builtin_decl;
5625 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5626 if (!init_addr)
5628 /* Generate the INIT_ADDR computation outside LOOP. */
5629 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5630 NULL_TREE);
5631 if (loop)
5633 pe = loop_preheader_edge (loop);
5634 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5635 gcc_assert (!new_bb);
5637 else
5638 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5641 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5642 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5643 vec_dest =
5644 vect_create_destination_var (scalar_dest,
5645 gimple_call_return_type (new_stmt));
5646 new_temp = make_ssa_name (vec_dest, new_stmt);
5647 gimple_call_set_lhs (new_stmt, new_temp);
5649 if (compute_in_loop)
5650 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5651 else
5653 /* Generate the misalignment computation outside LOOP. */
5654 pe = loop_preheader_edge (loop);
5655 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5656 gcc_assert (!new_bb);
5659 *realignment_token = gimple_call_lhs (new_stmt);
5661 /* The result of the CALL_EXPR to this builtin is determined from
5662 the value of the parameter and no global variables are touched
5663 which makes the builtin a "const" function. Requiring the
5664 builtin to have the "const" attribute makes it unnecessary
5665 to call mark_call_clobbered. */
5666 gcc_assert (TREE_READONLY (builtin_decl));
5669 if (alignment_support_scheme == dr_explicit_realign)
5670 return msq;
5672 gcc_assert (!compute_in_loop);
5673 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5676 /* 5. Create msq = phi <msq_init, lsq> in loop */
5678 pe = loop_preheader_edge (containing_loop);
5679 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5680 msq = make_ssa_name (vec_dest);
5681 phi_stmt = create_phi_node (msq, containing_loop->header);
5682 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5684 return msq;
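/* A minimal standalone sketch (not part of GCC; example_realign_load is a
   made-up name, and the element-wise model below only approximates what a
   target's REALIGN_LOAD does): MSQ is the floor-aligned vector loaded in
   the prolog or previous iteration, LSQ the floor-aligned vector loaded at
   p + VS - 1, and the realignment token reduces here to the element
   misalignment OFS.  The realigned result is a window of NELT elements
   starting at OFS in the concatenation <MSQ, LSQ>.  */

static void
example_realign_load (const unsigned int *msq, const unsigned int *lsq,
                      unsigned int nelt, unsigned int ofs,
                      unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = ofs + i < nelt ? msq[ofs + i] : lsq[ofs + i - nelt];
}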
5688 /* Function vect_grouped_load_supported.
5690 COUNT is the size of the load group (the number of statements plus the
5691 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5692 only one statement, with a gap of COUNT - 1.
5694 Returns true if a suitable permute exists. */
5696 bool
5697 vect_grouped_load_supported (tree vectype, bool single_element_p,
5698 unsigned HOST_WIDE_INT count)
5700 machine_mode mode = TYPE_MODE (vectype);
5702 /* If this is single-element interleaving with an element distance
5703 that leaves unused vector loads around, punt - we at least create
5704 very sub-optimal code in that case (and blow up memory,
5705 see PR65518). */
5706 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5708 if (dump_enabled_p ())
5709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5710 "single-element interleaving not supported "
5711 "for not adjacent vector loads\n");
5712 return false;
5715 /* vect_permute_load_chain requires the group size to be equal to 3 or
5716 be a power of two. */
5717 if (count != 3 && exact_log2 (count) == -1)
5719 if (dump_enabled_p ())
5720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5721 "the size of the group of accesses"
5722 " is not a power of 2 or not equal to 3\n");
5723 return false;
5726 /* Check that the permutation is supported. */
5727 if (VECTOR_MODE_P (mode))
5729 unsigned int i, j;
5730 if (count == 3)
5732 unsigned int nelt;
5733 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5735 if (dump_enabled_p ())
5736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5737 "cannot handle groups of 3 loads for"
5738 " variable-length vectors\n");
5739 return false;
5742 vec_perm_builder sel (nelt, nelt, 1);
5743 sel.quick_grow (nelt);
5744 vec_perm_indices indices;
5745 unsigned int k;
5746 for (k = 0; k < 3; k++)
5748 for (i = 0; i < nelt; i++)
5749 if (3 * i + k < 2 * nelt)
5750 sel[i] = 3 * i + k;
5751 else
5752 sel[i] = 0;
5753 indices.new_vector (sel, 2, nelt);
5754 if (!can_vec_perm_const_p (mode, indices))
5756 if (dump_enabled_p ())
5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 "shuffle of 3 loads is not supported by"
5759 " target\n");
5760 return false;
5762 for (i = 0, j = 0; i < nelt; i++)
5763 if (3 * i + k < 2 * nelt)
5764 sel[i] = i;
5765 else
5766 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5767 indices.new_vector (sel, 2, nelt);
5768 if (!can_vec_perm_const_p (mode, indices))
5770 if (dump_enabled_p ())
5771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5772 "shuffle of 3 loads is not supported by"
5773 " target\n");
5774 return false;
5777 return true;
5779 else
5781 /* If length is not equal to 3 then only power of 2 is supported. */
5782 gcc_assert (pow2p_hwi (count));
5783 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5785 /* The encoding has a single stepped pattern. */
5786 vec_perm_builder sel (nelt, 1, 3);
5787 sel.quick_grow (3);
5788 for (i = 0; i < 3; i++)
5789 sel[i] = i * 2;
5790 vec_perm_indices indices (sel, 2, nelt);
5791 if (can_vec_perm_const_p (mode, indices))
5793 for (i = 0; i < 3; i++)
5794 sel[i] = i * 2 + 1;
5795 indices.new_vector (sel, 2, nelt);
5796 if (can_vec_perm_const_p (mode, indices))
5797 return true;
5802 if (dump_enabled_p ())
5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804 "extract even/odd not supported by target\n");
5805 return false;
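/* A minimal standalone sketch (not part of GCC; example_load3_selectors is
   a made-up name): the two selectors whose support is tested above for
   groups of three loads.  For NELT == 8 and K == 0 the first selector is
   {0 3 6 9 12 15 0 0} (gather every third element from two vectors,
   padding the tail) and the second is {0 1 2 3 4 5 10 13} (keep what was
   already gathered and complete it from a third vector).  */

static void
example_load3_selectors (unsigned int nelt, unsigned int k,
                         unsigned int *sel1, unsigned int *sel2)
{
  unsigned int i, j = 0;
  for (i = 0; i < nelt; i++)
    sel1[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
  for (i = 0; i < nelt; i++)
    sel2[i] = 3 * i + k < 2 * nelt ? i : nelt + ((nelt + k) % 3) + 3 * j++;
}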
5808 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5809 type VECTYPE. MASKED_P says whether the masked form is needed. */
5811 bool
5812 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5813 bool masked_p)
5815 if (masked_p)
5816 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5817 vec_mask_load_lanes_optab,
5818 vectype, count);
5819 else
5820 return vect_lanes_optab_supported_p ("vec_load_lanes",
5821 vec_load_lanes_optab,
5822 vectype, count);
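/* A hypothetical usage sketch (example_can_use_lanes is not part of this
   file): a caller that only commits to a load/store-lanes strategy when
   both directions are available for the same group shape, using the two
   wrappers defined above.  */

static bool
example_can_use_lanes (tree vectype, unsigned HOST_WIDE_INT count,
                       bool masked_p)
{
  return (vect_load_lanes_supported (vectype, count, masked_p)
          && vect_store_lanes_supported (vectype, count, masked_p));
}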
5825 /* Function vect_permute_load_chain.
5827 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5828 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5829 the input data correctly. Return the final references for loads in
5830 RESULT_CHAIN.
5832 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5833 The input is 4 vectors each containing 8 elements. We assign a number to each
5834 element, the input sequence is:
5836 1st vec: 0 1 2 3 4 5 6 7
5837 2nd vec: 8 9 10 11 12 13 14 15
5838 3rd vec: 16 17 18 19 20 21 22 23
5839 4th vec: 24 25 26 27 28 29 30 31
5841 The output sequence should be:
5843 1st vec: 0 4 8 12 16 20 24 28
5844 2nd vec: 1 5 9 13 17 21 25 29
5845 3rd vec: 2 6 10 14 18 22 26 30
5846 4th vec: 3 7 11 15 19 23 27 31
5848 i.e., the first output vector should contain the first elements of each
5849 interleaving group, etc.
5851 We use extract_even/odd instructions to create such output. The input of
5852 each extract_even/odd operation is two vectors
5853 1st vec 2nd vec
5854 0 1 2 3 4 5 6 7
5856 and the output is the vector of extracted even/odd elements. The output of
5857 extract_even will be: 0 2 4 6
5858 and of extract_odd: 1 3 5 7
5861 The permutation is done in log LENGTH stages. In each stage extract_even
5862 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5863 their order. In our example,
5865 E1: extract_even (1st vec, 2nd vec)
5866 E2: extract_odd (1st vec, 2nd vec)
5867 E3: extract_even (3rd vec, 4th vec)
5868 E4: extract_odd (3rd vec, 4th vec)
5870 The output for the first stage will be:
5872 E1: 0 2 4 6 8 10 12 14
5873 E2: 1 3 5 7 9 11 13 15
5874 E3: 16 18 20 22 24 26 28 30
5875 E4: 17 19 21 23 25 27 29 31
5877 In order to proceed and create the correct sequence for the next stage (or
5878 for the correct output, if the second stage is the last one, as in our
5879 example), we first put the output of extract_even operation and then the
5880 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5881 The input for the second stage is:
5883 1st vec (E1): 0 2 4 6 8 10 12 14
5884 2nd vec (E3): 16 18 20 22 24 26 28 30
5885 3rd vec (E2): 1 3 5 7 9 11 13 15
5886 4th vec (E4): 17 19 21 23 25 27 29 31
5888 The output of the second stage:
5890 E1: 0 4 8 12 16 20 24 28
5891 E2: 2 6 10 14 18 22 26 30
5892 E3: 1 5 9 13 17 21 25 29
5893 E4: 3 7 11 15 19 23 27 31
5895 And RESULT_CHAIN after reordering:
5897 1st vec (E1): 0 4 8 12 16 20 24 28
5898 2nd vec (E3): 1 5 9 13 17 21 25 29
5899 3rd vec (E2): 2 6 10 14 18 22 26 30
5900 4th vec (E4): 3 7 11 15 19 23 27 31. */
5902 static void
5903 vect_permute_load_chain (vec<tree> dr_chain,
5904 unsigned int length,
5905 stmt_vec_info stmt_info,
5906 gimple_stmt_iterator *gsi,
5907 vec<tree> *result_chain)
5909 tree data_ref, first_vect, second_vect;
5910 tree perm_mask_even, perm_mask_odd;
5911 tree perm3_mask_low, perm3_mask_high;
5912 gimple *perm_stmt;
5913 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5914 unsigned int i, j, log_length = exact_log2 (length);
5916 result_chain->quick_grow (length);
5917 memcpy (result_chain->address (), dr_chain.address (),
5918 length * sizeof (tree));
5920 if (length == 3)
5922 /* vect_grouped_load_supported ensures that this is constant. */
5923 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5924 unsigned int k;
5926 vec_perm_builder sel (nelt, nelt, 1);
5927 sel.quick_grow (nelt);
5928 vec_perm_indices indices;
5929 for (k = 0; k < 3; k++)
5931 for (i = 0; i < nelt; i++)
5932 if (3 * i + k < 2 * nelt)
5933 sel[i] = 3 * i + k;
5934 else
5935 sel[i] = 0;
5936 indices.new_vector (sel, 2, nelt);
5937 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5939 for (i = 0, j = 0; i < nelt; i++)
5940 if (3 * i + k < 2 * nelt)
5941 sel[i] = i;
5942 else
5943 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5944 indices.new_vector (sel, 2, nelt);
5945 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5947 first_vect = dr_chain[0];
5948 second_vect = dr_chain[1];
5950 /* Create interleaving stmt (low part of):
5951 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5952 ...}> */
5953 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5954 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5955 second_vect, perm3_mask_low);
5956 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5958 /* Create interleaving stmt (high part of):
5959 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5960 ...}> */
5961 first_vect = data_ref;
5962 second_vect = dr_chain[2];
5963 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5964 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5965 second_vect, perm3_mask_high);
5966 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5967 (*result_chain)[k] = data_ref;
5970 else
5972 /* If length is not equal to 3 then only power of 2 is supported. */
5973 gcc_assert (pow2p_hwi (length));
5975 /* The encoding has a single stepped pattern. */
5976 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5977 vec_perm_builder sel (nelt, 1, 3);
5978 sel.quick_grow (3);
5979 for (i = 0; i < 3; ++i)
5980 sel[i] = i * 2;
5981 vec_perm_indices indices (sel, 2, nelt);
5982 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5984 for (i = 0; i < 3; ++i)
5985 sel[i] = i * 2 + 1;
5986 indices.new_vector (sel, 2, nelt);
5987 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5989 for (i = 0; i < log_length; i++)
5991 for (j = 0; j < length; j += 2)
5993 first_vect = dr_chain[j];
5994 second_vect = dr_chain[j+1];
5996 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5997 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5998 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5999 first_vect, second_vect,
6000 perm_mask_even);
6001 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6002 (*result_chain)[j/2] = data_ref;
6004 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6005 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6006 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6007 first_vect, second_vect,
6008 perm_mask_odd);
6009 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6010 (*result_chain)[j/2+length/2] = data_ref;
6012 memcpy (dr_chain.address (), result_chain->address (),
6013 length * sizeof (tree));
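/* A minimal standalone sketch (not part of GCC; the name
   example_extract_even_odd_4x8 is made up for illustration): the two
   extract_even/extract_odd stages that vect_permute_load_chain performs
   for the worked example in its leading comment (LENGTH == 4, NELT == 8).
   CHAIN initially holds 0..31 row by row; afterwards chain[0] is
   {0 4 8 12 16 20 24 28}, chain[1] is {1 5 9 ...}, and so on.  */

static void
example_extract_even_odd_4x8 (unsigned int chain[4][8])
{
  unsigned int out[4][8];
  for (unsigned int stage = 0; stage < 2; stage++)   /* log2 (4) stages.  */
    {
      for (unsigned int j = 0; j < 4; j += 2)
        for (unsigned int i = 0; i < 4; i++)
          {
            /* extract_even of chain[j] and chain[j + 1].  */
            out[j / 2][i] = chain[j][2 * i];
            out[j / 2][4 + i] = chain[j + 1][2 * i];
            /* extract_odd of the same pair.  */
            out[j / 2 + 2][i] = chain[j][2 * i + 1];
            out[j / 2 + 2][4 + i] = chain[j + 1][2 * i + 1];
          }
      memcpy (chain, out, sizeof (out));
    }
}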
6018 /* Function vect_shift_permute_load_chain.
6020 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6021 sequence of stmts to reorder the input data accordingly.
6022 Return the final references for loads in RESULT_CHAIN.
6023 Return true if successful, false otherwise.
6025 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6026 The input is 3 vectors each containing 8 elements. We assign a
6027 number to each element, the input sequence is:
6029 1st vec: 0 1 2 3 4 5 6 7
6030 2nd vec: 8 9 10 11 12 13 14 15
6031 3rd vec: 16 17 18 19 20 21 22 23
6033 The output sequence should be:
6035 1st vec: 0 3 6 9 12 15 18 21
6036 2nd vec: 1 4 7 10 13 16 19 22
6037 3rd vec: 2 5 8 11 14 17 20 23
6039 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6041 First we shuffle all 3 vectors to get the correct element order:
6043 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6044 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6045 3rd vec: (16 19 22) (17 20 23) (18 21)
6047 Next we unite and shift the vectors 3 times:
6049 1st step:
6050 shift right by 6 the concatenation of:
6051 "1st vec" and "2nd vec"
6052 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6053 "2nd vec" and "3rd vec"
6054 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6055 "3rd vec" and "1st vec"
6056 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6057 | New vectors |
6059 So that now new vectors are:
6061 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6062 2nd vec: (10 13) (16 19 22) (17 20 23)
6063 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6065 2nd step:
6066 shift right by 5 the concatenation of:
6067 "1st vec" and "3rd vec"
6068 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6069 "2nd vec" and "1st vec"
6070 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6071 "3rd vec" and "2nd vec"
6072 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6073 | New vectors |
6075 So that now new vectors are:
6077 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6078 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6079 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6081 3rd step:
6082 shift right by 5 the concatenation of:
6083 "1st vec" and "1st vec"
6084 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6085 shift right by 3 the concatenation of:
6086 "2nd vec" and "2nd vec"
6087 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6088 | New vectors |
6090 So that now all vectors are READY:
6091 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6092 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6093 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6095 This algorithm is faster than the one in vect_permute_load_chain if:
6096 1. "shift of a concatenation" is faster than general permutation.
6097 This is usually so.
6098 2. The TARGET machine can't execute vector instructions in parallel.
6099 This is because each step of the algorithm depends on the previous one.
6100 The algorithm in vect_permute_load_chain is much more parallel.
6102 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6105 static bool
6106 vect_shift_permute_load_chain (vec<tree> dr_chain,
6107 unsigned int length,
6108 stmt_vec_info stmt_info,
6109 gimple_stmt_iterator *gsi,
6110 vec<tree> *result_chain)
6112 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6113 tree perm2_mask1, perm2_mask2, perm3_mask;
6114 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6115 gimple *perm_stmt;
6117 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6118 unsigned int i;
6119 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6121 unsigned HOST_WIDE_INT nelt, vf;
6122 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6123 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6124 /* Not supported for variable-length vectors. */
6125 return false;
6127 vec_perm_builder sel (nelt, nelt, 1);
6128 sel.quick_grow (nelt);
6130 result_chain->quick_grow (length);
6131 memcpy (result_chain->address (), dr_chain.address (),
6132 length * sizeof (tree));
6134 if (pow2p_hwi (length) && vf > 4)
6136 unsigned int j, log_length = exact_log2 (length);
6137 for (i = 0; i < nelt / 2; ++i)
6138 sel[i] = i * 2;
6139 for (i = 0; i < nelt / 2; ++i)
6140 sel[nelt / 2 + i] = i * 2 + 1;
6141 vec_perm_indices indices (sel, 2, nelt);
6142 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6144 if (dump_enabled_p ())
6145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146 "shuffle of 2 fields structure is not \
6147 supported by target\n");
6148 return false;
6150 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6152 for (i = 0; i < nelt / 2; ++i)
6153 sel[i] = i * 2 + 1;
6154 for (i = 0; i < nelt / 2; ++i)
6155 sel[nelt / 2 + i] = i * 2;
6156 indices.new_vector (sel, 2, nelt);
6157 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6159 if (dump_enabled_p ())
6160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6161 "shuffle of 2 fields structure is not \
6162 supported by target\n");
6163 return false;
6165 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6167 /* Generating permutation constant to shift all elements.
6168 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6169 for (i = 0; i < nelt; i++)
6170 sel[i] = nelt / 2 + i;
6171 indices.new_vector (sel, 2, nelt);
6172 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6174 if (dump_enabled_p ())
6175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6176 "shift permutation is not supported by target\n");
6177 return false;
6179 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6181 /* Generating permutation constant to select vector from 2.
6182 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6183 for (i = 0; i < nelt / 2; i++)
6184 sel[i] = i;
6185 for (i = nelt / 2; i < nelt; i++)
6186 sel[i] = nelt + i;
6187 indices.new_vector (sel, 2, nelt);
6188 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6190 if (dump_enabled_p ())
6191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6192 "select is not supported by target\n");
6193 return false;
6195 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6197 for (i = 0; i < log_length; i++)
6199 for (j = 0; j < length; j += 2)
6201 first_vect = dr_chain[j];
6202 second_vect = dr_chain[j + 1];
6204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6205 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6206 first_vect, first_vect,
6207 perm2_mask1);
6208 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6209 vect[0] = data_ref;
6211 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6212 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6213 second_vect, second_vect,
6214 perm2_mask2);
6215 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6216 vect[1] = data_ref;
6218 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6219 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6220 vect[0], vect[1], shift1_mask);
6221 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6222 (*result_chain)[j/2 + length/2] = data_ref;
6224 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6225 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6226 vect[0], vect[1], select_mask);
6227 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6228 (*result_chain)[j/2] = data_ref;
6230 memcpy (dr_chain.address (), result_chain->address (),
6231 length * sizeof (tree));
6233 return true;
6235 if (length == 3 && vf > 2)
6237 unsigned int k = 0, l = 0;
6239 /* Generating permutation constant to get all elements in right order.
6240 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6241 for (i = 0; i < nelt; i++)
6243 if (3 * k + (l % 3) >= nelt)
6245 k = 0;
6246 l += (3 - (nelt % 3));
6248 sel[i] = 3 * k + (l % 3);
6249 k++;
6251 vec_perm_indices indices (sel, 2, nelt);
6252 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6254 if (dump_enabled_p ())
6255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256 "shuffle of 3 fields structure is not \
6257 supported by target\n");
6258 return false;
6260 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6262 /* Generating permutation constant to shift all elements.
6263 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6264 for (i = 0; i < nelt; i++)
6265 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6266 indices.new_vector (sel, 2, nelt);
6267 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6269 if (dump_enabled_p ())
6270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6271 "shift permutation is not supported by target\n");
6272 return false;
6274 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6276 /* Generating permutation constant to shift all elements.
6277 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6278 for (i = 0; i < nelt; i++)
6279 sel[i] = 2 * (nelt / 3) + 1 + i;
6280 indices.new_vector (sel, 2, nelt);
6281 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6283 if (dump_enabled_p ())
6284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6285 "shift permutation is not supported by target\n");
6286 return false;
6288 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6290 /* Generating permutation constant to shift all elements.
6291 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6292 for (i = 0; i < nelt; i++)
6293 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6294 indices.new_vector (sel, 2, nelt);
6295 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6297 if (dump_enabled_p ())
6298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6299 "shift permutation is not supported by target\n");
6300 return false;
6302 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6304 /* Generating permutation constant to shift all elements.
6305 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6306 for (i = 0; i < nelt; i++)
6307 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6308 indices.new_vector (sel, 2, nelt);
6309 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6311 if (dump_enabled_p ())
6312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6313 "shift permutation is not supported by target\n");
6314 return false;
6316 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6318 for (k = 0; k < 3; k++)
6320 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6322 dr_chain[k], dr_chain[k],
6323 perm3_mask);
6324 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6325 vect[k] = data_ref;
6328 for (k = 0; k < 3; k++)
6330 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6331 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6332 vect[k % 3], vect[(k + 1) % 3],
6333 shift1_mask);
6334 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6335 vect_shift[k] = data_ref;
6338 for (k = 0; k < 3; k++)
6340 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6341 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6342 vect_shift[(4 - k) % 3],
6343 vect_shift[(3 - k) % 3],
6344 shift2_mask);
6345 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6346 vect[k] = data_ref;
6349 (*result_chain)[3 - (nelt % 3)] = vect[2];
6351 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6352 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6353 vect[0], shift3_mask);
6354 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6355 (*result_chain)[nelt % 3] = data_ref;
6357 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6358 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6359 vect[1], shift4_mask);
6360 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6361 (*result_chain)[0] = data_ref;
6362 return true;
6364 return false;
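/* A minimal standalone sketch (not part of GCC; example_shift3_selectors
   is a made-up name): the reorder and shift selectors built above for
   LENGTH == 3.  For NELT == 8, PERM3 is {0 3 6 1 4 7 2 5}, SHIFT1 is
   {6 7 8 9 10 11 12 13} and SHIFT2 is {5 6 7 8 9 10 11 12}, matching the
   comments in vect_shift_permute_load_chain.  */

static void
example_shift3_selectors (unsigned int nelt, unsigned int *perm3,
                          unsigned int *shift1, unsigned int *shift2)
{
  unsigned int i, k = 0, l = 0;
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      perm3[i] = 3 * k + (l % 3);
      k++;
    }
  for (i = 0; i < nelt; i++)
    {
      shift1[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      shift2[i] = 2 * (nelt / 3) + 1 + i;
    }
}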
6367 /* Function vect_transform_grouped_load.
6369 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6370 to perform their permutation and ascribe the result vectorized statements to
6371 the scalar statements.
6374 void
6375 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6376 int size, gimple_stmt_iterator *gsi)
6378 machine_mode mode;
6379 vec<tree> result_chain = vNULL;
6381 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6382 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6383 vectors, that are ready for vector computation. */
6384 result_chain.create (size);
6386 /* If the reassociation width for the vector type is 2 or greater, the target
6387 machine can execute 2 or more vector instructions in parallel. Otherwise try
6388 to get the chain for the load group using vect_shift_permute_load_chain.
6389 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6390 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6391 || pow2p_hwi (size)
6392 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6393 gsi, &result_chain))
6394 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6395 vect_record_grouped_load_vectors (stmt_info, result_chain);
6396 result_chain.release ();
6399 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6400 generated as part of the vectorization of STMT_INFO. Assign the statement
6401 for each vector to the associated scalar statement. */
6403 void
6404 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6405 vec<tree> result_chain)
6407 vec_info *vinfo = stmt_info->vinfo;
6408 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6409 unsigned int i, gap_count;
6410 tree tmp_data_ref;
6412 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6413 Since we scan the chain starting from its first node, their order
6414 corresponds to the order of data-refs in RESULT_CHAIN.
6415 stmt_vec_info next_stmt_info = first_stmt_info;
6416 gap_count = 1;
6417 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6419 if (!next_stmt_info)
6420 break;
6422 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6423 code elimination pass later. No need to check for the first stmt in
6424 the group, since it always exists.
6425 DR_GROUP_GAP is the number of steps in elements from the previous
6426 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6427 correspond to the gaps. */
6428 if (next_stmt_info != first_stmt_info
6429 && gap_count < DR_GROUP_GAP (next_stmt_info))
6431 gap_count++;
6432 continue;
6435 /* ??? The following needs cleanup after the removal of
6436 DR_GROUP_SAME_DR_STMT. */
6437 if (next_stmt_info)
6439 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6440 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6441 copies, and we put the new vector statement in the first available
6442 RELATED_STMT. */
6443 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6444 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6445 else
6447 stmt_vec_info prev_stmt_info
6448 = STMT_VINFO_VEC_STMT (next_stmt_info);
6449 stmt_vec_info rel_stmt_info
6450 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6451 while (rel_stmt_info)
6453 prev_stmt_info = rel_stmt_info;
6454 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6457 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6460 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6461 gap_count = 1;
6466 /* Function vect_can_force_dr_alignment_p.
6468 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
6469 boundary. */
6471 bool
6472 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6474 if (!VAR_P (decl))
6475 return false;
6477 if (decl_in_symtab_p (decl)
6478 && !symtab_node::get (decl)->can_increase_alignment_p ())
6479 return false;
6481 if (TREE_STATIC (decl))
6482 return (known_le (alignment,
6483 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6484 else
6485 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6489 /* Return whether the data reference DR_INFO is supported with respect to its
6490 alignment.
6491 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6492 if it is aligned, i.e., check if it is possible to vectorize it with different
6493 alignment. */
6495 enum dr_alignment_support
6496 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6497 bool check_aligned_accesses)
6499 data_reference *dr = dr_info->dr;
6500 stmt_vec_info stmt_info = dr_info->stmt;
6501 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6502 machine_mode mode = TYPE_MODE (vectype);
6503 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6504 class loop *vect_loop = NULL;
6505 bool nested_in_vect_loop = false;
6507 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6508 return dr_aligned;
6510 /* For now assume all conditional loads/stores support unaligned
6511 access without any special code. */
6512 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6513 if (gimple_call_internal_p (stmt)
6514 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6515 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6516 return dr_unaligned_supported;
6518 if (loop_vinfo)
6520 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6521 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6524 /* Possibly unaligned access. */
6526 /* We can choose between using the implicit realignment scheme (generating
6527 a misaligned_move stmt) and the explicit realignment scheme (generating
6528 aligned loads with a REALIGN_LOAD). There are two variants to the
6529 explicit realignment scheme: optimized, and unoptimized.
6530 We can optimize the realignment only if the step between consecutive
6531 vector loads is equal to the vector size. Since the vector memory
6532 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6533 is guaranteed that the misalignment amount remains the same throughout the
6534 execution of the vectorized loop. Therefore, we can create the
6535 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6536 at the loop preheader.
6538 However, in the case of outer-loop vectorization, when vectorizing a
6539 memory access in the inner-loop nested within the LOOP that is now being
6540 vectorized, while it is guaranteed that the misalignment of the
6541 vectorized memory access will remain the same in different outer-loop
6542 iterations, it is *not* guaranteed that it will remain the same throughout
6543 the execution of the inner-loop. This is because the inner-loop advances
6544 with the original scalar step (and not in steps of VS). If the inner-loop
6545 step happens to be a multiple of VS, then the misalignment remains fixed
6546 and we can use the optimized realignment scheme. For example:
6548 for (i=0; i<N; i++)
6549 for (j=0; j<M; j++)
6550 s += a[i+j];
6552 When vectorizing the i-loop in the above example, the step between
6553 consecutive vector loads is 1, and so the misalignment does not remain
6554 fixed across the execution of the inner-loop, and the realignment cannot
6555 be optimized (as illustrated in the following pseudo vectorized loop):
6557 for (i=0; i<N; i+=4)
6558 for (j=0; j<M; j++){
6559 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6560 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6561 // (assuming that we start from an aligned address).
6564 We therefore have to use the unoptimized realignment scheme:
6566 for (i=0; i<N; i+=4)
6567 for (j=k; j<M; j+=4)
6568 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6569 // that the misalignment of the initial address is
6570 // 0).
6572 The loop can then be vectorized as follows:
6574 for (k=0; k<4; k++){
6575 rt = get_realignment_token (&vp[k]);
6576 for (i=0; i<N; i+=4){
6577 v1 = vp[i+k];
6578 for (j=k; j<M; j+=4){
6579 v2 = vp[i+j+VS-1];
6580 va = REALIGN_LOAD <v1,v2,rt>;
6581 vs += va;
6582 v1 = v2;
6585 } */
6587 if (DR_IS_READ (dr))
6589 bool is_packed = false;
6590 tree type = (TREE_TYPE (DR_REF (dr)));
6592 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6593 && (!targetm.vectorize.builtin_mask_for_load
6594 || targetm.vectorize.builtin_mask_for_load ()))
6596 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6598 /* If we are doing SLP then the accesses need not have the
6599 same alignment, instead it depends on the SLP group size. */
6600 if (loop_vinfo
6601 && STMT_SLP_TYPE (stmt_info)
6602 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6603 * (DR_GROUP_SIZE
6604 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6605 TYPE_VECTOR_SUBPARTS (vectype)))
6607 else if (!loop_vinfo
6608 || (nested_in_vect_loop
6609 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6610 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6611 return dr_explicit_realign;
6612 else
6613 return dr_explicit_realign_optimized;
6615 if (!known_alignment_for_access_p (dr_info))
6616 is_packed = not_size_aligned (DR_REF (dr));
6618 if (targetm.vectorize.support_vector_misalignment
6619 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6620 /* Can't software pipeline the loads, but can at least do them. */
6621 return dr_unaligned_supported;
6623 else
6625 bool is_packed = false;
6626 tree type = (TREE_TYPE (DR_REF (dr)));
6628 if (!known_alignment_for_access_p (dr_info))
6629 is_packed = not_size_aligned (DR_REF (dr));
6631 if (targetm.vectorize.support_vector_misalignment
6632 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6633 return dr_unaligned_supported;
6636 /* Unsupported. */
6637 return dr_unaligned_unsupported;
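/* A minimal standalone sketch (not part of GCC;
   example_inner_loop_misalignment is a made-up name): the misalignment
   pattern from the outer-loop example in the comment above.  With VS
   elements per vector, an aligned initial address and an inner-loop step
   of one element, the element misalignment of &vp[i + j] depends on j, so
   the realignment token cannot be computed once in the preheader.  */

static unsigned int
example_inner_loop_misalignment (unsigned int i, unsigned int j,
                                 unsigned int vs)
{
  /* i advances in multiples of VS in the vectorized outer loop, so only j
     contributes: 0, 1, 2, ..., VS - 1, 0, 1, ...  */
  return (i + j) % vs;
}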