gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
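/* Illustrative, target-dependent example (not taken from this file): on
   AArch64 a group of COUNT == 2 accesses of V4SI vectors can be handled by
   the LD2/ST2 instructions; in that case the corresponding load/store-lanes
   optab is implemented and this function returns true.  */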
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode, array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
96 return true;
100 /* Return the smallest scalar type used by STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
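/* For example (hypothetical numbers): on a target with 16-byte vectors, a
   loop that mixes 2-byte and 4-byte scalars must derive the vectorization
   factor from the 2-byte type (VF = 8 here); the 4-byte operations are then
   split across two vector statements.  */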
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
160 if (i < gimple_call_num_args (call))
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
235 /* STMT_A and STMT_B belong to overlapping groups. All loads are
236 emitted at the position of the first scalar load.
237 Stores in a group are emitted at the position of the last scalar store.
238 Compute that position and check whether the resulting order matches
239 the current one. */
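/* Illustrative example (hypothetical scalar code):
     x0 = a[i];        <-- load group: vector load emitted here
     b[i] = y0;
     x1 = a[i+1];
     b[i+1] = y1;      <-- store group: vector store emitted here
   The vector load still precedes the vector store, matching the scalar
   order of each load/store pair, so the order is preserved.  If the
   computed positions swap relative to the scalar order we return false.  */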
240 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
241 if (il_a)
243 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
244 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
245 s = DR_GROUP_NEXT_ELEMENT (s))
246 il_a = get_later_stmt (il_a, s);
247 else /* DR_IS_READ */
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 if (get_later_stmt (il_a, s) == il_a)
251 il_a = s;
253 else
254 il_a = stmtinfo_a;
255 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
256 if (il_b)
258 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
259 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
260 s = DR_GROUP_NEXT_ELEMENT (s))
261 il_b = get_later_stmt (il_b, s);
262 else /* DR_IS_READ */
263 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
264 s = DR_GROUP_NEXT_ELEMENT (s))
265 if (get_later_stmt (il_b, s) == il_b)
266 il_b = s;
268 else
269 il_b = stmtinfo_b;
270 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
271 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
274 /* A subroutine of vect_analyze_data_ref_dependence. Handle
275 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
276 distances. These distances are conservatively correct but they don't
277 reflect a guaranteed dependence.
279 Return true if this function does all the work necessary to avoid
280 an alias or false if the caller should use the dependence distances
281 to limit the vectorization factor in the usual way. LOOP_DEPTH is
282 the depth of the loop described by LOOP_VINFO and the other arguments
283 are as for vect_analyze_data_ref_dependence. */
285 static bool
286 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
287 loop_vec_info loop_vinfo,
288 int loop_depth, unsigned int *max_vf)
290 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
291 lambda_vector dist_v;
292 unsigned int i;
293 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
295 int dist = dist_v[loop_depth];
296 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
298 /* If the user asserted safelen >= DIST consecutive iterations
299 can be executed concurrently, assume independence.
301 ??? An alternative would be to add the alias check even
302 in this case, and vectorize the fallback loop with the
303 maximum VF set to safelen. However, if the user has
304 explicitly given a length, it's less likely that that
305 would be a win. */
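/* Illustrative example (not from this file): with
     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i] = a[i + d] + 1;
   the user asserts that up to 8 consecutive iterations may execute
   concurrently, so a dependence distance of at most 8 can be ignored here
   as long as the vectorization factor is capped at 8.  */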
306 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
308 if ((unsigned int) loop->safelen < *max_vf)
309 *max_vf = loop->safelen;
310 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
311 continue;
314 /* For dependence distances of 2 or more, we have the option
315 of limiting VF or checking for an alias at runtime.
316 Prefer to check at runtime if we can, to avoid limiting
317 the VF unnecessarily when the bases are in fact independent.
319 Note that the alias checks will be removed if the VF ends up
320 being small enough. */
321 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
322 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
323 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
324 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
325 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
328 return true;
332 /* Function vect_analyze_data_ref_dependence.
334 Return a failure opt_result if there (might) exist a dependence between
336 memory-reference DRA and memory-reference DRB that prevents vectorization.
337 When the dependence can instead be checked at run-time by versioning for
338 alias, record the check and return success. Adjust *MAX_VF according to
339 the data dependence. */
341 static opt_result
342 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 unsigned int *max_vf)
346 unsigned int i;
347 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
348 struct data_reference *dra = DDR_A (ddr);
349 struct data_reference *drb = DDR_B (ddr);
350 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
351 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
352 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
353 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
354 lambda_vector dist_v;
355 unsigned int loop_depth;
357 /* In loop analysis all data references should be vectorizable. */
358 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
359 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
360 gcc_unreachable ();
362 /* Independent data accesses. */
363 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
364 return opt_result::success ();
366 if (dra == drb
367 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
368 return opt_result::success ();
370 /* We do not have to consider dependences between accesses that belong
371 to the same group, unless the stride could be smaller than the
372 group size. */
373 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
374 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
375 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
376 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
377 return opt_result::success ();
379 /* Even if we have an anti-dependence then, as the vectorized loop covers at
380 least two scalar iterations, there is always also a true dependence.
381 As the vectorizer does not re-order loads and stores we can ignore
382 the anti-dependence if TBAA can disambiguate both DRs similar to the
383 case with known negative distance anti-dependences (positive
384 distance anti-dependences would violate TBAA constraints). */
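/* Illustrative example (hypothetical): for
     ... = *q;          // read through "int *q"
     *p = ...;          // write through "float *p"
   TBAA guarantees the int and float accesses cannot alias, so the apparent
   anti-dependence between them can be ignored.  */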
385 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
386 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
387 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
388 get_alias_set (DR_REF (drb))))
389 return opt_result::success ();
391 /* Unknown data dependence. */
392 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
394 /* If user asserted safelen consecutive iterations can be
395 executed concurrently, assume independence. */
396 if (loop->safelen >= 2)
398 if ((unsigned int) loop->safelen < *max_vf)
399 *max_vf = loop->safelen;
400 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
401 return opt_result::success ();
404 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
405 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
406 return opt_result::failure_at
407 (stmtinfo_a->stmt,
408 "versioning for alias not supported for: "
409 "can't determine dependence between %T and %T\n",
410 DR_REF (dra), DR_REF (drb));
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
414 "versioning for alias required: "
415 "can't determine dependence between %T and %T\n",
416 DR_REF (dra), DR_REF (drb));
418 /* Add to list of ddrs that need to be tested at run-time. */
419 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
422 /* Known data dependence. */
423 if (DDR_NUM_DIST_VECTS (ddr) == 0)
425 /* If user asserted safelen consecutive iterations can be
426 executed concurrently, assume independence. */
427 if (loop->safelen >= 2)
429 if ((unsigned int) loop->safelen < *max_vf)
430 *max_vf = loop->safelen;
431 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
432 return opt_result::success ();
435 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
436 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
437 return opt_result::failure_at
438 (stmtinfo_a->stmt,
439 "versioning for alias not supported for: "
440 "bad dist vector for %T and %T\n",
441 DR_REF (dra), DR_REF (drb));
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
445 "versioning for alias required: "
446 "bad dist vector for %T and %T\n",
447 DR_REF (dra), DR_REF (drb));
448 /* Add to list of ddrs that need to be tested at run-time. */
449 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
452 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
455 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
456 loop_depth, max_vf))
457 return opt_result::success ();
459 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461 int dist = dist_v[loop_depth];
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "dependence distance = %d.\n", dist);
467 if (dist == 0)
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "dependence distance == 0 between %T and %T\n",
472 DR_REF (dra), DR_REF (drb));
474 /* When we perform grouped accesses and perform implicit CSE
475 by detecting equal accesses and doing disambiguation with
476 runtime alias tests like for
477 .. = a[i];
478 .. = a[i+1];
479 a[i] = ..;
480 a[i+1] = ..;
481 *p = ..;
482 .. = a[i];
483 .. = a[i+1];
484 where we will end up loading { a[i], a[i+1] } once, make
485 sure that inserting group loads before the first load and
486 stores after the last store will do the right thing.
487 Similar for groups like
488 a[i] = ...;
489 ... = a[i];
490 a[i+1] = ...;
491 where loads from the group interleave with the store. */
492 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
493 return opt_result::failure_at (stmtinfo_a->stmt,
494 "READ_WRITE dependence"
495 " in interleaving.\n");
497 if (loop->safelen < 2)
499 tree indicator = dr_zero_step_indicator (dra);
500 if (!indicator || integer_zerop (indicator))
501 return opt_result::failure_at (stmtinfo_a->stmt,
502 "access also has a zero step\n");
503 else if (TREE_CODE (indicator) != INTEGER_CST)
504 vect_check_nonzero_value (loop_vinfo, indicator);
506 continue;
509 if (dist > 0 && DDR_REVERSED_P (ddr))
511 /* If DDR_REVERSED_P the order of the data-refs in DDR was
512 reversed (to make distance vector positive), and the actual
513 distance is negative. */
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "dependence distance negative.\n");
517 /* When doing outer loop vectorization, we need to check if there is
518 a backward dependence at the inner loop level if the dependence
519 at the outer loop is reversed. See PR81740. */
520 if (nested_in_vect_loop_p (loop, stmtinfo_a)
521 || nested_in_vect_loop_p (loop, stmtinfo_b))
523 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
524 DDR_LOOP_NEST (ddr));
525 if (dist_v[inner_depth] < 0)
526 return opt_result::failure_at (stmtinfo_a->stmt,
527 "not vectorized, dependence "
528 "between data-refs %T and %T\n",
529 DR_REF (dra), DR_REF (drb));
531 /* Record a negative dependence distance to later limit the
532 amount of stmt copying / unrolling we can perform.
533 Only need to handle read-after-write dependence. */
534 if (DR_IS_READ (drb)
535 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
536 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
537 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
538 continue;
541 unsigned int abs_dist = abs (dist);
542 if (abs_dist >= 2 && abs_dist < *max_vf)
544 /* The dependence distance requires reduction of the maximal
545 vectorization factor. */
546 *max_vf = abs_dist;
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location,
549 "adjusting maximal vectorization factor to %i\n",
550 *max_vf);
553 if (abs_dist >= *max_vf)
555 /* Dependence distance does not create dependence, as far as
556 vectorization is concerned, in this case. */
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "dependence distance >= VF.\n");
560 continue;
563 return opt_result::failure_at (stmtinfo_a->stmt,
564 "not vectorized, possible dependence "
565 "between data-refs %T and %T\n",
566 DR_REF (dra), DR_REF (drb));
569 return opt_result::success ();
572 /* Function vect_analyze_data_ref_dependences.
574 Examine all the data references in the loop, and make sure there do not
575 exist any data dependences between them. Set *MAX_VF according to
576 the maximum vectorization factor the data dependences allow. */
578 opt_result
579 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
580 unsigned int *max_vf)
582 unsigned int i;
583 struct data_dependence_relation *ddr;
585 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589 LOOP_VINFO_DDRS (loop_vinfo)
590 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
591 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
592 /* We need read-read dependences to compute
593 STMT_VINFO_SAME_ALIGN_REFS. */
594 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
595 &LOOP_VINFO_DDRS (loop_vinfo),
596 LOOP_VINFO_LOOP_NEST (loop_vinfo),
597 true);
598 gcc_assert (res);
601 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
603 /* For epilogues we either have no aliases or alias versioning
604 was applied to original loop. Therefore we may just get max_vf
605 using VF of original loop. */
606 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
607 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
608 else
609 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
611 opt_result res
612 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
613 if (!res)
614 return res;
617 return opt_result::success ();
621 /* Function vect_slp_analyze_data_ref_dependence.
623 Return TRUE if there (might) exist a dependence between a memory-reference
624 DRA and a memory-reference DRB for VINFO that would prevent
625 vectorization. Return FALSE if the accesses are independent or the
626 dependence can be ignored. */
628 static bool
629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
630 struct data_dependence_relation *ddr)
632 struct data_reference *dra = DDR_A (ddr);
633 struct data_reference *drb = DDR_B (ddr);
634 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
635 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
637 /* We need to check dependences of statements marked as unvectorizable
638 as well, they still can prohibit vectorization. */
640 /* Independent data accesses. */
641 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
642 return false;
644 if (dra == drb)
645 return false;
647 /* Read-read is OK. */
648 if (DR_IS_READ (dra) && DR_IS_READ (drb))
649 return false;
651 /* If dra and drb are part of the same interleaving chain consider
652 them independent. */
653 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
654 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
655 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
656 return false;
658 /* Unknown data dependence. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 "can't determine dependence between %T and %T\n",
664 DR_REF (dra), DR_REF (drb));
666 else if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location,
668 "determined dependence between %T and %T\n",
669 DR_REF (dra), DR_REF (drb));
671 return true;
675 /* Analyze dependences involved in the transform of SLP NODE. STORES
676 contain the vector of scalar stores of this instance if we are
677 disambiguating the loads. */
679 static bool
680 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
681 vec<stmt_vec_info> stores,
682 stmt_vec_info last_store_info)
684 /* This walks over all stmts involved in the SLP load/store done
685 in NODE verifying we can sink them up to the last stmt in the
686 group. */
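/* Illustrative example (hypothetical): for an SLP store group
     a[0] = x;
     *p = z;            // intervening store
     a[1] = y;
   the vector store of { x, y } is emitted at the position of "a[1] = y",
   so we must verify that "a[0] = x" can be sunk past "*p = z", i.e. that
   *p neither clobbers nor reads a[0].  */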
687 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
689 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
690 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
692 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
693 if (access_info == last_access_info)
694 continue;
695 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
696 ao_ref ref;
697 bool ref_initialized_p = false;
698 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
699 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
701 gimple *stmt = gsi_stmt (gsi);
702 if (! gimple_vuse (stmt))
703 continue;
705 /* If we couldn't record a (single) data reference for this
706 stmt we have to resort to the alias oracle. */
707 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
708 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
709 if (!dr_b)
711 /* We are moving a store - this means
712 we cannot use TBAA for disambiguation. */
713 if (!ref_initialized_p)
714 ao_ref_init (&ref, DR_REF (dr_a));
715 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
716 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
717 return false;
718 continue;
721 bool dependent = false;
722 /* If we run into a store of this same instance (we've just
723 marked those) then delay dependence checking until we run
724 into the last store because this is where it will have
725 been sunk to (and we verify if we can do that as well). */
726 if (gimple_visited_p (stmt))
728 if (stmt_info != last_store_info)
729 continue;
730 unsigned i;
731 stmt_vec_info store_info;
732 FOR_EACH_VEC_ELT (stores, i, store_info)
734 data_reference *store_dr
735 = STMT_VINFO_DATA_REF (store_info);
736 ddr_p ddr = initialize_data_dependence_relation
737 (dr_a, store_dr, vNULL);
738 dependent
739 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
740 free_dependence_relation (ddr);
741 if (dependent)
742 break;
745 else
747 ddr_p ddr = initialize_data_dependence_relation (dr_a,
748 dr_b, vNULL);
749 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
750 free_dependence_relation (ddr);
752 if (dependent)
753 return false;
757 else /* DR_IS_READ */
759 stmt_vec_info first_access_info
760 = vect_find_first_scalar_stmt_in_slp (node);
761 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
763 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
764 if (access_info == first_access_info)
765 continue;
766 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
767 ao_ref ref;
768 bool ref_initialized_p = false;
769 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
770 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
772 gimple *stmt = gsi_stmt (gsi);
773 if (! gimple_vdef (stmt))
774 continue;
776 /* If we couldn't record a (single) data reference for this
777 stmt we have to resort to the alias oracle. */
778 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
779 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
780 if (!dr_b)
782 /* We are hoisting a load - this means we can use
783 TBAA for disambiguation. */
784 if (!ref_initialized_p)
785 ao_ref_init (&ref, DR_REF (dr_a));
786 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
787 return false;
788 continue;
791 bool dependent = false;
792 /* If we run into a store of this same instance (we've just
793 marked those) then delay dependence checking until we run
794 into the last store because this is where it will have
795 been sunk to (and we verify if we can do that as well). */
796 if (gimple_visited_p (stmt))
798 if (stmt_info != last_store_info)
799 continue;
800 unsigned i;
801 stmt_vec_info store_info;
802 FOR_EACH_VEC_ELT (stores, i, store_info)
804 data_reference *store_dr
805 = STMT_VINFO_DATA_REF (store_info);
806 ddr_p ddr = initialize_data_dependence_relation
807 (dr_a, store_dr, vNULL);
808 dependent
809 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
810 free_dependence_relation (ddr);
811 if (dependent)
812 break;
815 else
817 ddr_p ddr = initialize_data_dependence_relation (dr_a,
818 dr_b, vNULL);
819 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
820 free_dependence_relation (ddr);
822 if (dependent)
823 return false;
827 return true;
831 /* Function vect_slp_analyze_instance_dependence.
833 Examine all the data references in SLP instance INSTANCE, and make sure
834 there do not exist any data dependences between them that would prevent
835 the instance from being vectorized. */
837 bool
838 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
840 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
842 /* The stores of this instance are at the root of the SLP tree. */
843 slp_tree store = SLP_INSTANCE_TREE (instance);
844 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
845 store = NULL;
847 /* Verify we can sink stores to the vectorized stmt insert location. */
848 stmt_vec_info last_store_info = NULL;
849 if (store)
851 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
852 return false;
854 /* Mark stores in this instance and remember the last one. */
855 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
856 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
857 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
860 bool res = true;
862 /* Verify we can sink loads to the vectorized stmt insert location,
863 special-casing stores of this instance. */
864 slp_tree load;
865 unsigned int i;
866 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
867 if (! vect_slp_analyze_node_dependences (vinfo, load,
868 store
869 ? SLP_TREE_SCALAR_STMTS (store)
870 : vNULL, last_store_info))
872 res = false;
873 break;
876 /* Unset the visited flag. */
877 if (store)
878 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
879 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
881 return res;
884 /* Record the base alignment guarantee given by DRB, which occurs
885 in STMT_INFO. */
887 static void
888 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
889 innermost_loop_behavior *drb)
891 bool existed;
892 innermost_loop_behavior *&entry
893 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
894 if (!existed || entry->base_alignment < drb->base_alignment)
896 entry = drb;
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_NOTE, vect_location,
899 "recording new base alignment for %T\n"
900 " alignment: %d\n"
901 " misalignment: %d\n"
902 " based on: %G",
903 drb->base_address,
904 drb->base_alignment,
905 drb->base_misalignment,
906 stmt_info->stmt);
910 /* If the region we're going to vectorize is reached, all unconditional
911 data references occur at least once. We can therefore pool the base
912 alignment guarantees from each unconditional reference. Do this by
913 going through all the data references in VINFO and checking whether
914 the containing statement makes the reference unconditionally. If so,
915 record the alignment of the base address in VINFO so that it can be
916 used for all other references with the same base. */
918 void
919 vect_record_base_alignments (vec_info *vinfo)
921 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
922 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
923 data_reference *dr;
924 unsigned int i;
925 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
927 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
928 stmt_vec_info stmt_info = dr_info->stmt;
929 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
930 && STMT_VINFO_VECTORIZABLE (stmt_info)
931 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
933 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
935 /* If DR is nested in the loop that is being vectorized, we can also
936 record the alignment of the base wrt the outer loop. */
937 if (loop && nested_in_vect_loop_p (loop, stmt_info))
938 vect_record_base_alignment
939 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
944 /* Return the target alignment for the vectorized form of DR_INFO. */
946 static poly_uint64
947 vect_calculate_target_alignment (dr_vec_info *dr_info)
949 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
950 return targetm.vectorize.preferred_vector_alignment (vectype);
953 /* Function vect_compute_data_ref_alignment
955 Compute the misalignment of the data reference DR_INFO.
957 Output:
958 1. DR_MISALIGNMENT (DR_INFO) is defined.
960 FOR NOW: No analysis is actually performed. Misalignment is calculated
961 only for trivial cases. TODO. */
963 static void
964 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
966 stmt_vec_info stmt_info = dr_info->stmt;
967 vec_base_alignments *base_alignments = &vinfo->base_alignments;
968 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
969 class loop *loop = NULL;
970 tree ref = DR_REF (dr_info->dr);
971 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
973 if (dump_enabled_p ())
974 dump_printf_loc (MSG_NOTE, vect_location,
975 "vect_compute_data_ref_alignment:\n");
977 if (loop_vinfo)
978 loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Initialize misalignment to unknown. */
981 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
983 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
984 return;
986 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
987 bool step_preserves_misalignment_p;
989 poly_uint64 vector_alignment
990 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
991 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
993 /* If the main loop has peeled for alignment we have no way of knowing
994 whether the data accesses in the epilogues are aligned. We can't at
995 compile time answer the question whether we have entered the main loop or
996 not. Fixes PR 92351. */
997 if (loop_vinfo)
999 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1000 if (orig_loop_vinfo
1001 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1002 return;
1005 unsigned HOST_WIDE_INT vect_align_c;
1006 if (!vector_alignment.is_constant (&vect_align_c))
1007 return;
1009 /* No step for BB vectorization. */
1010 if (!loop)
1012 gcc_assert (integer_zerop (drb->step));
1013 step_preserves_misalignment_p = true;
1016 /* In case the dataref is in an inner-loop of the loop that is being
1017 vectorized (LOOP), we use the base and misalignment information
1018 relative to the outer-loop (LOOP). This is ok only if the misalignment
1019 stays the same throughout the execution of the inner-loop, which is why
1020 we have to check that the stride of the dataref in the inner-loop evenly
1021 divides by the vector alignment. */
1022 else if (nested_in_vect_loop_p (loop, stmt_info))
1024 step_preserves_misalignment_p
1025 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1027 if (dump_enabled_p ())
1029 if (step_preserves_misalignment_p)
1030 dump_printf_loc (MSG_NOTE, vect_location,
1031 "inner step divides the vector alignment.\n");
1032 else
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "inner step doesn't divide the vector"
1035 " alignment.\n");
1039 /* Similarly we can only use base and misalignment information relative to
1040 an innermost loop if the misalignment stays the same throughout the
1041 execution of the loop. As above, this is the case if the stride of
1042 the dataref evenly divides by the alignment. */
1043 else
1045 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1046 step_preserves_misalignment_p
1047 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1049 if (!step_preserves_misalignment_p && dump_enabled_p ())
1050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1051 "step doesn't divide the vector alignment.\n");
1054 unsigned int base_alignment = drb->base_alignment;
1055 unsigned int base_misalignment = drb->base_misalignment;
1057 /* Calculate the maximum of the pooled base address alignment and the
1058 alignment that we can compute for DR itself. */
1059 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1060 if (entry && base_alignment < (*entry)->base_alignment)
1062 base_alignment = (*entry)->base_alignment;
1063 base_misalignment = (*entry)->base_misalignment;
1066 if (drb->offset_alignment < vect_align_c
1067 || !step_preserves_misalignment_p
1068 /* We need to know whether the step wrt the vectorized loop is
1069 negative when computing the starting misalignment below. */
1070 || TREE_CODE (drb->step) != INTEGER_CST)
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "Unknown alignment for access: %T\n", ref);
1075 return;
1078 if (base_alignment < vect_align_c)
1080 unsigned int max_alignment;
1081 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1082 if (max_alignment < vect_align_c
1083 || !vect_can_force_dr_alignment_p (base,
1084 vect_align_c * BITS_PER_UNIT))
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location,
1088 "can't force alignment of ref: %T\n", ref);
1089 return;
1092 /* Force the alignment of the decl.
1093 NOTE: This is the only change to the code we make during
1094 the analysis phase, before deciding to vectorize the loop. */
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_NOTE, vect_location,
1097 "force alignment of %T\n", ref);
1099 dr_info->base_decl = base;
1100 dr_info->base_misaligned = true;
1101 base_misalignment = 0;
1103 poly_int64 misalignment
1104 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1106 /* If this is a backward running DR then first access in the larger
1107 vectype actually is N-1 elements before the address in the DR.
1108 Adjust misalign accordingly. */
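/* Worked example (hypothetical numbers): for a V4SI access (four 4-byte
   elements) with step -4, the vector load that covers this scalar access
   starts (4 - 1) * 4 = 12 bytes below the address in the DR, so the
   adjustment below adds (4 - 1) * -4 = -12 to the misalignment.  */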
1109 if (tree_int_cst_sgn (drb->step) < 0)
1110 /* PLUS because STEP is negative. */
1111 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1112 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1114 unsigned int const_misalignment;
1115 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Non-constant misalignment for access: %T\n", ref);
1120 return;
1123 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "misalign = %d bytes of ref %T\n",
1128 DR_MISALIGNMENT (dr_info), ref);
1130 return;
1133 /* Function vect_update_misalignment_for_peel.
1134 Sets DR_INFO's misalignment
1135 - to 0 if it has the same alignment as DR_PEEL_INFO,
1136 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1137 - to -1 (unknown) otherwise.
1139 DR_INFO - the data reference whose misalignment is to be adjusted.
1140 DR_PEEL_INFO - the data reference whose misalignment is being made
1141 zero in the vector loop by the peel.
1142 NPEEL - the number of iterations in the peel loop if the misalignment
1143 of DR_PEEL_INFO is known at compile time. */
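/* Worked example (hypothetical numbers): with a target alignment of 16
   bytes, DR_STEP of 4 bytes and a known misalignment of 4 bytes, peeling
   NPEEL = 3 iterations advances DR_INFO by 12 bytes, so its new
   misalignment is (4 + 3 * 4) & (16 - 1) = 0.  */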
1145 static void
1146 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1147 dr_vec_info *dr_peel_info, int npeel)
1149 unsigned int i;
1150 vec<dr_p> same_aligned_drs;
1151 struct data_reference *current_dr;
1152 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1154 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1155 it is aligned in the vector loop. */
1156 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1157 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1159 if (current_dr != dr_info->dr)
1160 continue;
1161 gcc_assert (!known_alignment_for_access_p (dr_info)
1162 || !known_alignment_for_access_p (dr_peel_info)
1163 || (DR_MISALIGNMENT (dr_info)
1164 == DR_MISALIGNMENT (dr_peel_info)));
1165 SET_DR_MISALIGNMENT (dr_info, 0);
1166 return;
1169 unsigned HOST_WIDE_INT alignment;
1170 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1171 && known_alignment_for_access_p (dr_info)
1172 && known_alignment_for_access_p (dr_peel_info))
1174 int misal = DR_MISALIGNMENT (dr_info);
1175 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1176 misal &= alignment - 1;
1177 SET_DR_MISALIGNMENT (dr_info, misal);
1178 return;
1181 if (dump_enabled_p ())
1182 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1183 "to unknown (-1).\n");
1184 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1187 /* Return true if alignment is relevant for DR_INFO. */
1189 static bool
1190 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1192 stmt_vec_info stmt_info = dr_info->stmt;
1194 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1195 return false;
1197 /* For interleaving, only the alignment of the first access matters. */
1198 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1199 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1200 return false;
1202 /* Scatter-gather and invariant accesses continue to address individual
1203 scalars, so vector-level alignment is irrelevant. */
1204 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1205 || integer_zerop (DR_STEP (dr_info->dr)))
1206 return false;
1208 /* Strided accesses perform only component accesses, alignment is
1209 irrelevant for them. */
1210 if (STMT_VINFO_STRIDED_P (stmt_info)
1211 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212 return false;
1214 return true;
1217 /* Given a memory reference EXP, return whether its alignment is less
1218 than its size. */
1220 static bool
1221 not_size_aligned (tree exp)
1223 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1224 return true;
1226 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1227 > get_object_alignment (exp));
1230 /* Function vector_alignment_reachable_p
1232 Return true if vector alignment for DR_INFO is reachable by peeling
1233 a few loop iterations. Return false otherwise. */
1235 static bool
1236 vector_alignment_reachable_p (dr_vec_info *dr_info)
1238 stmt_vec_info stmt_info = dr_info->stmt;
1239 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1243 /* For interleaved access we peel only if number of iterations in
1244 the prolog loop ({VF - misalignment}), is a multiple of the
1245 number of the interleaved accesses. */
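/* Worked example (hypothetical numbers): with 8 elements per vector and a
   misalignment of 2 elements, the prologue peels 8 - 2 = 6 scalar
   iterations; that is fine for a group size of 3 (6 % 3 == 0) but not for
   a group size of 4, in which case we give up below.  */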
1246 int elem_size, mis_in_elements;
1248 /* FORNOW: handle only known alignment. */
1249 if (!known_alignment_for_access_p (dr_info))
1250 return false;
1252 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1253 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1254 elem_size = vector_element_size (vector_size, nelements);
1255 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1257 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1258 return false;
1261 /* If misalignment is known at the compile time then allow peeling
1262 only if natural alignment is reachable through peeling. */
1263 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1265 HOST_WIDE_INT elmsize =
1266 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1267 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_NOTE, vect_location,
1270 "data size = %wd. misalignment = %d.\n", elmsize,
1271 DR_MISALIGNMENT (dr_info));
1273 if (DR_MISALIGNMENT (dr_info) % elmsize)
1275 if (dump_enabled_p ())
1276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277 "data size does not divide the misalignment.\n");
1278 return false;
1282 if (!known_alignment_for_access_p (dr_info))
1284 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1285 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1286 if (dump_enabled_p ())
1287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288 "Unknown misalignment, %snaturally aligned\n",
1289 is_packed ? "not " : "");
1290 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1293 return true;
1297 /* Calculate the cost of the memory access represented by DR_INFO. */
1299 static void
1300 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1301 unsigned int *inside_cost,
1302 unsigned int *outside_cost,
1303 stmt_vector_for_cost *body_cost_vec,
1304 stmt_vector_for_cost *prologue_cost_vec)
1306 stmt_vec_info stmt_info = dr_info->stmt;
1307 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1308 int ncopies;
1310 if (PURE_SLP_STMT (stmt_info))
1311 ncopies = 1;
1312 else
1313 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1315 if (DR_IS_READ (dr_info->dr))
1316 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1317 outside_cost, prologue_cost_vec, body_cost_vec, false);
1318 else
1319 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_NOTE, vect_location,
1323 "vect_get_data_access_cost: inside_cost = %d, "
1324 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1328 typedef struct _vect_peel_info
1330 dr_vec_info *dr_info;
1331 int npeel;
1332 unsigned int count;
1333 } *vect_peel_info;
1335 typedef struct _vect_peel_extended_info
1337 vec_info *vinfo;
1338 struct _vect_peel_info peel_info;
1339 unsigned int inside_cost;
1340 unsigned int outside_cost;
1341 } *vect_peel_extended_info;
1344 /* Peeling hashtable helpers. */
1346 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1348 static inline hashval_t hash (const _vect_peel_info *);
1349 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1352 inline hashval_t
1353 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1355 return (hashval_t) peel_info->npeel;
1358 inline bool
1359 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1361 return (a->npeel == b->npeel);
1365 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1367 static void
1368 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1369 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1370 int npeel)
1372 struct _vect_peel_info elem, *slot;
1373 _vect_peel_info **new_slot;
1374 bool supportable_dr_alignment
1375 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1377 elem.npeel = npeel;
1378 slot = peeling_htab->find (&elem);
1379 if (slot)
1380 slot->count++;
1381 else
1383 slot = XNEW (struct _vect_peel_info);
1384 slot->npeel = npeel;
1385 slot->dr_info = dr_info;
1386 slot->count = 1;
1387 new_slot = peeling_htab->find_slot (slot, INSERT);
1388 *new_slot = slot;
1391 if (!supportable_dr_alignment
1392 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1393 slot->count += VECT_MAX_COST;
1397 /* Traverse peeling hash table to find peeling option that aligns maximum
1398 number of data accesses. */
1400 int
1401 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1402 _vect_peel_extended_info *max)
1404 vect_peel_info elem = *slot;
1406 if (elem->count > max->peel_info.count
1407 || (elem->count == max->peel_info.count
1408 && max->peel_info.npeel > elem->npeel))
1410 max->peel_info.npeel = elem->npeel;
1411 max->peel_info.count = elem->count;
1412 max->peel_info.dr_info = elem->dr_info;
1415 return 1;
1418 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1419 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1420 we assume DR0_INFO's misalignment will be zero after peeling. */
1422 static void
1423 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1424 dr_vec_info *dr0_info,
1425 unsigned int *inside_cost,
1426 unsigned int *outside_cost,
1427 stmt_vector_for_cost *body_cost_vec,
1428 stmt_vector_for_cost *prologue_cost_vec,
1429 unsigned int npeel,
1430 bool unknown_misalignment)
1432 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1433 unsigned i;
1434 data_reference *dr;
1436 FOR_EACH_VEC_ELT (datarefs, i, dr)
1438 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1439 if (!vect_relevant_for_alignment_p (dr_info))
1440 continue;
1442 int save_misalignment;
1443 save_misalignment = DR_MISALIGNMENT (dr_info);
1444 if (npeel == 0)
1446 else if (unknown_misalignment && dr_info == dr0_info)
1447 SET_DR_MISALIGNMENT (dr_info, 0);
1448 else
1449 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1450 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1451 body_cost_vec, prologue_cost_vec);
1452 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1456 /* Traverse peeling hash table and calculate cost for each peeling option.
1457 Find the one with the lowest cost. */
1459 int
1460 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1461 _vect_peel_extended_info *min)
1463 vect_peel_info elem = *slot;
1464 int dummy;
1465 unsigned int inside_cost = 0, outside_cost = 0;
1466 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1467 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1468 epilogue_cost_vec;
1470 prologue_cost_vec.create (2);
1471 body_cost_vec.create (2);
1472 epilogue_cost_vec.create (2);
1474 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1475 &outside_cost, &body_cost_vec,
1476 &prologue_cost_vec, elem->npeel, false);
1478 body_cost_vec.release ();
1480 outside_cost += vect_get_known_peeling_cost
1481 (loop_vinfo, elem->npeel, &dummy,
1482 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1483 &prologue_cost_vec, &epilogue_cost_vec);
1485 /* Prologue and epilogue costs are added to the target model later.
1486 These costs depend only on the scalar iteration cost, the
1487 number of peeling iterations finally chosen, and the number of
1488 misaligned statements. So discard the information found here. */
1489 prologue_cost_vec.release ();
1490 epilogue_cost_vec.release ();
1492 if (inside_cost < min->inside_cost
1493 || (inside_cost == min->inside_cost
1494 && outside_cost < min->outside_cost))
1496 min->inside_cost = inside_cost;
1497 min->outside_cost = outside_cost;
1498 min->peel_info.dr_info = elem->dr_info;
1499 min->peel_info.npeel = elem->npeel;
1500 min->peel_info.count = elem->count;
1503 return 1;
1507 /* Choose best peeling option by traversing peeling hash table and either
1508 choosing an option with the lowest cost (if cost model is enabled) or the
1509 option that aligns as many accesses as possible. */
1511 static struct _vect_peel_extended_info
1512 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1513 loop_vec_info loop_vinfo)
1515 struct _vect_peel_extended_info res;
1517 res.peel_info.dr_info = NULL;
1518 res.vinfo = loop_vinfo;
1520 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1522 res.inside_cost = INT_MAX;
1523 res.outside_cost = INT_MAX;
1524 peeling_htab->traverse <_vect_peel_extended_info *,
1525 vect_peeling_hash_get_lowest_cost> (&res);
1527 else
1529 res.peel_info.count = 0;
1530 peeling_htab->traverse <_vect_peel_extended_info *,
1531 vect_peeling_hash_get_most_frequent> (&res);
1532 res.inside_cost = 0;
1533 res.outside_cost = 0;
1536 return res;
1539 /* Return true if the new peeling NPEEL is supported. */
1541 static bool
1542 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1543 unsigned npeel)
1545 unsigned i;
1546 struct data_reference *dr = NULL;
1547 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1548 enum dr_alignment_support supportable_dr_alignment;
1550 /* Ensure that all data refs can be vectorized after the peel. */
1551 FOR_EACH_VEC_ELT (datarefs, i, dr)
1553 int save_misalignment;
1555 if (dr == dr0_info->dr)
1556 continue;
1558 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1559 if (!vect_relevant_for_alignment_p (dr_info))
1560 continue;
1562 save_misalignment = DR_MISALIGNMENT (dr_info);
1563 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1564 supportable_dr_alignment
1565 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1566 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1568 if (!supportable_dr_alignment)
1569 return false;
1572 return true;
1575 /* Function vect_enhance_data_refs_alignment
1577 This pass will use loop versioning and loop peeling in order to enhance
1578 the alignment of data references in the loop.
1580 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1581 original loop is to be vectorized. Any other loops that are created by
1582 the transformations performed in this pass are not supposed to be
1583 vectorized. This restriction will be relaxed.
1585 This pass will require a cost model to guide it whether to apply peeling
1586 or versioning or a combination of the two. For example, the scheme that
1587 Intel uses when given a loop with several memory accesses is as follows:
1588 choose one memory access ('p') whose alignment you want to force by doing
1589 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1590 other accesses are not necessarily aligned, or (2) use loop versioning to
1591 generate one loop in which all accesses are aligned, and another loop in
1592 which only 'p' is necessarily aligned.
1594 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1595 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1596 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1598 Devising a cost model is the most critical aspect of this work. It will
1599 guide us on which access to peel for, whether to use loop versioning, how
1600 many versions to create, etc. The cost model will probably consist of
1601 generic considerations as well as target specific considerations (on
1602 powerpc for example, misaligned stores are more painful than misaligned
1603 loads).
1605 Here are the general steps involved in alignment enhancements:
1607 -- original loop, before alignment analysis:
1608 for (i=0; i<N; i++){
1609 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1610 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1613 -- After vect_compute_data_refs_alignment:
1614 for (i=0; i<N; i++){
1615 x = q[i]; # DR_MISALIGNMENT(q) = 3
1616 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1619 -- Possibility 1: we do loop versioning:
1620 if (p is aligned) {
1621 for (i=0; i<N; i++){ # loop 1A
1622 x = q[i]; # DR_MISALIGNMENT(q) = 3
1623 p[i] = y; # DR_MISALIGNMENT(p) = 0
1626 else {
1627 for (i=0; i<N; i++){ # loop 1B
1628 x = q[i]; # DR_MISALIGNMENT(q) = 3
1629 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1633 -- Possibility 2: we do loop peeling:
1634 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1635 x = q[i];
1636 p[i] = y;
1638 for (i = 3; i < N; i++){ # loop 2A
1639 x = q[i]; # DR_MISALIGNMENT(q) = 0
1640 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1643 -- Possibility 3: combination of loop peeling and versioning:
1644 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1645 x = q[i];
1646 p[i] = y;
1648 if (p is aligned) {
1649 for (i = 3; i<N; i++){ # loop 3A
1650 x = q[i]; # DR_MISALIGNMENT(q) = 0
1651 p[i] = y; # DR_MISALIGNMENT(p) = 0
1654 else {
1655 for (i = 3; i<N; i++){ # loop 3B
1656 x = q[i]; # DR_MISALIGNMENT(q) = 0
1657 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1661 These loops are later passed to loop_transform to be vectorized. The
1662 vectorizer will use the alignment information to guide the transformation
1663 (whether to generate regular loads/stores, or with special handling for
1664 misalignment). */
1666 opt_result
1667 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1669 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1670 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1671 enum dr_alignment_support supportable_dr_alignment;
1672 dr_vec_info *first_store = NULL;
1673 dr_vec_info *dr0_info = NULL;
1674 struct data_reference *dr;
1675 unsigned int i;
1676 bool do_peeling = false;
1677 bool do_versioning = false;
1678 unsigned int npeel = 0;
1679 bool one_misalignment_known = false;
1680 bool one_misalignment_unknown = false;
1681 bool one_dr_unsupportable = false;
1682 dr_vec_info *unsupportable_dr_info = NULL;
1683 unsigned int mis, same_align_drs_max = 0;
1684 hash_table<peel_info_hasher> peeling_htab (1);
1686 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1688 /* Reset data so we can safely be called multiple times. */
1689 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1690 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1692 /* While cost model enhancements are expected in the future, the high level
1693 view of the code at this time is as follows:
1695 A) If there is a misaligned access then see if peeling to align
1696 this access can make all data references satisfy
1697 vect_supportable_dr_alignment. If so, update data structures
1698 as needed and return true.
1700 B) If peeling wasn't possible and there is a data reference with an
1701 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1702 then see if loop versioning checks can be used to make all data
1703 references satisfy vect_supportable_dr_alignment. If so, update
1704 data structures as needed and return true.
1706 C) If neither peeling nor versioning were successful then return false if
1707 any data reference does not satisfy vect_supportable_dr_alignment.
1709 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1711 Note, Possibility 3 above (which is peeling and versioning together) is not
1712 being done at this time. */
1714 /* (1) Peeling to force alignment. */
1716 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1717 Considerations:
1718 + How many accesses will become aligned due to the peeling
1719 - How many accesses will become unaligned due to the peeling,
1720 and the cost of misaligned accesses.
1721 - The cost of peeling (the extra runtime checks, the increase
1722 in code size). */
1724 FOR_EACH_VEC_ELT (datarefs, i, dr)
1726 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1727 if (!vect_relevant_for_alignment_p (dr_info))
1728 continue;
1730 stmt_vec_info stmt_info = dr_info->stmt;
1731 supportable_dr_alignment
1732 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1733 do_peeling = vector_alignment_reachable_p (dr_info);
1734 if (do_peeling)
1736 if (known_alignment_for_access_p (dr_info))
1738 unsigned int npeel_tmp = 0;
1739 bool negative = tree_int_cst_compare (DR_STEP (dr),
1740 size_zero_node) < 0;
1742 /* If known_alignment_for_access_p then we have set
1743 DR_MISALIGNMENT which is only done if we know it at compile
1744 time, so it is safe to assume target alignment is constant.
1746 unsigned int target_align =
1747 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1748 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1749 mis = (negative
1750 ? DR_MISALIGNMENT (dr_info)
1751 : -DR_MISALIGNMENT (dr_info));
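/* For example, with a positive step, a target alignment of 16 bytes,
   4-byte elements and a misalignment of 12 bytes, the computation
   below gives npeel_tmp == ((-12) & 15) / 4 == 1, i.e. one scalar
   iteration reaches the next 16-byte boundary.  */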
1752 if (DR_MISALIGNMENT (dr_info) != 0)
1753 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1755 /* For multiple types, it is possible that the bigger type access
1756 will have more than one peeling option. E.g., a loop with two
1757 types: one of size (vector size / 4), and the other one of
1758 size (vector size / 8). Vectorization factor will be 8. If both
1759 accesses are misaligned by 3, the first one needs one scalar
1760 iteration to be aligned, and the second one needs 5. But the
1761 first one will be aligned also by peeling 5 scalar
1762 iterations, and in that case both accesses will be aligned.
1763 Hence, except for the immediate peeling amount, we also want
1764 to try adding the full vector size, as long as we don't exceed the
1765 vectorization factor.
1766 We do this automatically for the cost model, since we calculate
1767 cost for every peeling option. */
1768 poly_uint64 nscalars = npeel_tmp;
1769 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1771 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1772 nscalars = (STMT_SLP_TYPE (stmt_info)
1773 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1776 /* Save info about DR in the hash table. Also include peeling
1777 amounts according to the explanation above. */
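/* For instance, if npeel_tmp starts at 1, target_align / dr_size
   is 4 and nscalars is 8, this records the candidate peeling
   amounts 1 and 5.  */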
1778 while (known_le (npeel_tmp, nscalars))
1780 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1781 dr_info, npeel_tmp);
1782 npeel_tmp += MAX (1, target_align / dr_size);
1785 one_misalignment_known = true;
1787 else
1789 /* If we don't know any misalignment values, we prefer
1790 peeling for the data-ref that has the maximum number of data-refs
1791 with the same alignment, unless the target prefers to align
1792 stores over loads. */
1793 unsigned same_align_drs
1794 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1795 if (!dr0_info
1796 || same_align_drs_max < same_align_drs)
1798 same_align_drs_max = same_align_drs;
1799 dr0_info = dr_info;
1801 /* For data-refs with the same number of related
1802 accesses prefer the one where the misalign
1803 computation will be invariant in the outermost loop. */
1804 else if (same_align_drs_max == same_align_drs)
1806 class loop *ivloop0, *ivloop;
1807 ivloop0 = outermost_invariant_loop_for_expr
1808 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1809 ivloop = outermost_invariant_loop_for_expr
1810 (loop, DR_BASE_ADDRESS (dr));
1811 if ((ivloop && !ivloop0)
1812 || (ivloop && ivloop0
1813 && flow_loop_nested_p (ivloop, ivloop0)))
1814 dr0_info = dr_info;
1817 one_misalignment_unknown = true;
1819 /* Check for data refs with unsupportable alignment that
1820 can be peeled. */
1821 if (!supportable_dr_alignment)
1823 one_dr_unsupportable = true;
1824 unsupportable_dr_info = dr_info;
1827 if (!first_store && DR_IS_WRITE (dr))
1828 first_store = dr_info;
1831 else
1833 if (!aligned_access_p (dr_info))
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "vector alignment may not be reachable\n");
1838 break;
1843 /* Check if we can possibly peel the loop. */
1844 if (!vect_can_advance_ivs_p (loop_vinfo)
1845 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1846 || loop->inner)
1847 do_peeling = false;
1849 struct _vect_peel_extended_info peel_for_known_alignment;
1850 struct _vect_peel_extended_info peel_for_unknown_alignment;
1851 struct _vect_peel_extended_info best_peel;
1853 peel_for_unknown_alignment.inside_cost = INT_MAX;
1854 peel_for_unknown_alignment.outside_cost = INT_MAX;
1855 peel_for_unknown_alignment.peel_info.count = 0;
1857 if (do_peeling
1858 && one_misalignment_unknown)
1860 /* Check if the target requires to prefer stores over loads, i.e., if
1861 misaligned stores are more expensive than misaligned loads (taking
1862 drs with same alignment into account). */
1863 unsigned int load_inside_cost = 0;
1864 unsigned int load_outside_cost = 0;
1865 unsigned int store_inside_cost = 0;
1866 unsigned int store_outside_cost = 0;
1867 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1869 stmt_vector_for_cost dummy;
1870 dummy.create (2);
1871 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1872 &load_inside_cost,
1873 &load_outside_cost,
1874 &dummy, &dummy, estimated_npeels, true);
1875 dummy.release ();
1877 if (first_store)
1879 dummy.create (2);
1880 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1881 &store_inside_cost,
1882 &store_outside_cost,
1883 &dummy, &dummy,
1884 estimated_npeels, true);
1885 dummy.release ();
1887 else
1889 store_inside_cost = INT_MAX;
1890 store_outside_cost = INT_MAX;
1893 if (load_inside_cost > store_inside_cost
1894 || (load_inside_cost == store_inside_cost
1895 && load_outside_cost > store_outside_cost))
1897 dr0_info = first_store;
1898 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1899 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1901 else
1903 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1904 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1907 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1908 prologue_cost_vec.create (2);
1909 epilogue_cost_vec.create (2);
1911 int dummy2;
1912 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1913 (loop_vinfo, estimated_npeels, &dummy2,
1914 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1915 &prologue_cost_vec, &epilogue_cost_vec);
1917 prologue_cost_vec.release ();
1918 epilogue_cost_vec.release ();
1920 peel_for_unknown_alignment.peel_info.count = 1
1921 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1924 peel_for_unknown_alignment.peel_info.npeel = 0;
1925 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1927 best_peel = peel_for_unknown_alignment;
1929 peel_for_known_alignment.inside_cost = INT_MAX;
1930 peel_for_known_alignment.outside_cost = INT_MAX;
1931 peel_for_known_alignment.peel_info.count = 0;
1932 peel_for_known_alignment.peel_info.dr_info = NULL;
1934 if (do_peeling && one_misalignment_known)
1936 /* Peeling is possible, but there is no data access that is not supported
1937 unless aligned. So we try to choose the best possible peeling from
1938 the hash table. */
1939 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1940 (&peeling_htab, loop_vinfo);
1943 /* Compare costs of peeling for known and unknown alignment. */
1944 if (peel_for_known_alignment.peel_info.dr_info != NULL
1945 && peel_for_unknown_alignment.inside_cost
1946 >= peel_for_known_alignment.inside_cost)
1948 best_peel = peel_for_known_alignment;
1950 /* If the best peeling for known alignment has NPEEL == 0, perform no
1951 peeling at all except if there is an unsupportable dr that we can
1952 align. */
1953 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1954 do_peeling = false;
1957 /* If there is an unsupportable data ref, prefer this over all choices so far
1958 since we'd have to discard a chosen peeling except when it accidentally
1959 aligned the unsupportable data ref. */
1960 if (one_dr_unsupportable)
1961 dr0_info = unsupportable_dr_info;
1962 else if (do_peeling)
1964 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1965 TODO: Use nopeel_outside_cost or get rid of it? */
1966 unsigned nopeel_inside_cost = 0;
1967 unsigned nopeel_outside_cost = 0;
1969 stmt_vector_for_cost dummy;
1970 dummy.create (2);
1971 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1972 &nopeel_outside_cost, &dummy, &dummy,
1973 0, false);
1974 dummy.release ();
1976 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1977 costs will be recorded. */
1978 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1979 prologue_cost_vec.create (2);
1980 epilogue_cost_vec.create (2);
1982 int dummy2;
1983 nopeel_outside_cost += vect_get_known_peeling_cost
1984 (loop_vinfo, 0, &dummy2,
1985 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1986 &prologue_cost_vec, &epilogue_cost_vec);
1988 prologue_cost_vec.release ();
1989 epilogue_cost_vec.release ();
1991 npeel = best_peel.peel_info.npeel;
1992 dr0_info = best_peel.peel_info.dr_info;
1994 /* If doing no peeling is no more expensive than the best peeling we
1995 have found so far, don't perform any peeling. */
1996 if (nopeel_inside_cost <= best_peel.inside_cost)
1997 do_peeling = false;
2000 if (do_peeling)
2002 stmt_vec_info stmt_info = dr0_info->stmt;
2003 if (known_alignment_for_access_p (dr0_info))
2005 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2006 size_zero_node) < 0;
2007 if (!npeel)
2009 /* Since it's known at compile time, compute the number of
2010 iterations in the peeled loop (the peeling factor) for use in
2011 updating DR_MISALIGNMENT values. The peeling factor is the
2012 vectorization factor minus the misalignment as an element
2013 count. */
2014 mis = (negative
2015 ? DR_MISALIGNMENT (dr0_info)
2016 : -DR_MISALIGNMENT (dr0_info));
2017 /* If known_alignment_for_access_p then we have set
2018 DR_MISALIGNMENT which is only done if we know it at compile
2019 time, so it is safe to assume target alignment is constant.
2021 unsigned int target_align =
2022 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2023 npeel = ((mis & (target_align - 1))
2024 / vect_get_scalar_dr_size (dr0_info));
2027 /* For interleaved data access every iteration accesses all the
2028 members of the group, therefore we divide the number of iterations
2029 by the group size. */
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 npeel /= DR_GROUP_SIZE (stmt_info);
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Try peeling by %d\n", npeel);
2038 /* Ensure that all datarefs can be vectorized after the peel. */
2039 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2040 do_peeling = false;
2042 /* Check if all datarefs are supportable and log. */
2043 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2044 return opt_result::success ();
2046 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2047 if (do_peeling)
2049 unsigned max_allowed_peel
2050 = param_vect_max_peeling_for_alignment;
2051 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2052 max_allowed_peel = 0;
2053 if (max_allowed_peel != (unsigned)-1)
2055 unsigned max_peel = npeel;
2056 if (max_peel == 0)
2058 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2059 unsigned HOST_WIDE_INT target_align_c;
2060 if (target_align.is_constant (&target_align_c))
2061 max_peel =
2062 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2063 else
2065 do_peeling = false;
2066 if (dump_enabled_p ())
2067 dump_printf_loc (MSG_NOTE, vect_location,
2068 "Disable peeling, max peels set and vector"
2069 " alignment unknown\n");
2072 if (max_peel > max_allowed_peel)
2074 do_peeling = false;
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location,
2077 "Disable peeling, max peels reached: %d\n", max_peel);
2082 /* Cost model #2 - if peeling may result in a remaining loop not
2083 iterating enough to be vectorized then do not peel. Since this
2084 is a cost heuristic rather than a correctness decision, use the
2085 most likely runtime value for variable vectorization factors. */
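/* For example, with an assumed VF of 8 and an unknown peeling amount
   (npeel == 0) the check below requires at least 8 + 7 = 15 known
   iterations, otherwise peeling is abandoned.  */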
2086 if (do_peeling
2087 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2089 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2090 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2091 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2092 < assumed_vf + max_peel)
2093 do_peeling = false;
2096 if (do_peeling)
2098 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2099 If the misalignment of DR_i is identical to that of dr0 then set
2100 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2101 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2102 by the peeling factor times the element size of DR_i (MOD the
2103 vectorization factor times the size). Otherwise, the
2104 misalignment of DR_i must be set to unknown. */
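/* For example, if dr0 was misaligned by 12 bytes and one 4-byte
   iteration is peeled, dr0 becomes aligned, a DR_i with a known
   misalignment of 4 bytes becomes misaligned by 4 + 1 * 4 == 8,
   and a DR_i with unknown misalignment stays unknown.  */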
2105 FOR_EACH_VEC_ELT (datarefs, i, dr)
2106 if (dr != dr0_info->dr)
2108 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2109 if (!vect_relevant_for_alignment_p (dr_info))
2110 continue;
2112 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2115 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2116 if (npeel)
2117 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2118 else
2119 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2120 = DR_MISALIGNMENT (dr0_info);
2121 SET_DR_MISALIGNMENT (dr0_info, 0);
2122 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_NOTE, vect_location,
2125 "Alignment of access forced using peeling.\n");
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "Peeling for alignment will be applied.\n");
2130 /* The inside-loop cost will be accounted for in vectorizable_load
2131 and vectorizable_store correctly with adjusted alignments.
2132 Drop the body_cst_vec on the floor here. */
2133 return opt_result::success ();
2137 /* (2) Versioning to force alignment. */
2139 /* Try versioning if:
2140 1) the loop is optimized for speed and the cost model is not cheap,
2141 2) there is at least one unsupported misaligned data ref with an unknown
2142 misalignment, and
2143 3) all misaligned data refs with a known misalignment are supported, and
2144 4) the number of runtime alignment checks is within reason. */
2146 do_versioning
2147 = (optimize_loop_nest_for_speed_p (loop)
2148 && !loop->inner /* FORNOW */
2149 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2151 if (do_versioning)
2153 FOR_EACH_VEC_ELT (datarefs, i, dr)
2155 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2156 if (aligned_access_p (dr_info)
2157 || !vect_relevant_for_alignment_p (dr_info))
2158 continue;
2160 stmt_vec_info stmt_info = dr_info->stmt;
2161 if (STMT_VINFO_STRIDED_P (stmt_info))
2163 do_versioning = false;
2164 break;
2167 supportable_dr_alignment
2168 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2170 if (!supportable_dr_alignment)
2172 int mask;
2173 tree vectype;
2175 if (known_alignment_for_access_p (dr_info)
2176 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2177 >= (unsigned) param_vect_max_version_for_alignment_checks)
2179 do_versioning = false;
2180 break;
2183 vectype = STMT_VINFO_VECTYPE (stmt_info);
2184 gcc_assert (vectype);
2186 /* At present we don't support versioning for alignment
2187 with variable VF, since there's no guarantee that the
2188 VF is a power of two. We could relax this if we added
2189 a way of enforcing a power-of-two size. */
2190 unsigned HOST_WIDE_INT size;
2191 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2193 do_versioning = false;
2194 break;
2197 /* Forcing alignment in the first iteration is no good if
2198 we don't keep it across iterations. For now, just disable
2199 versioning in this case.
2200 ?? We could actually unroll the loop to achieve the required
2201 overall step alignment, and forcing the alignment could be
2202 done by doing some iterations of the non-vectorized loop. */
2203 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2204 * DR_STEP_ALIGNMENT (dr),
2205 DR_TARGET_ALIGNMENT (dr_info)))
2207 do_versioning = false;
2208 break;
2211 /* The rightmost bits of an aligned address must be zeros.
2212 Construct the mask needed for this test. For example,
2213 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2214 mask must be 15 = 0xf. */
2215 mask = size - 1;
2217 /* FORNOW: use the same mask to test all potentially unaligned
2218 references in the loop. */
2219 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2220 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2222 do_versioning = false;
2223 break;
2226 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2227 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2231 /* Versioning requires at least one misaligned data reference. */
2232 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2233 do_versioning = false;
2234 else if (!do_versioning)
2235 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2238 if (do_versioning)
2240 vec<stmt_vec_info> may_misalign_stmts
2241 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2242 stmt_vec_info stmt_info;
2244 /* It can now be assumed that the data references in the statements
2245 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2246 of the loop being vectorized. */
2247 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2249 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2250 SET_DR_MISALIGNMENT (dr_info, 0);
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Alignment of access forced using versioning.\n");
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "Versioning for alignment will be applied.\n");
2260 /* Peeling and versioning can't be done together at this time. */
2261 gcc_assert (! (do_peeling && do_versioning));
2263 return opt_result::success ();
2266 /* This point is reached if neither peeling nor versioning is being done. */
2267 gcc_assert (! (do_peeling || do_versioning));
2269 return opt_result::success ();
2273 /* Function vect_find_same_alignment_drs.
2275 Update group and alignment relations in VINFO according to the chosen
2276 vectorization factor. */
2278 static void
2279 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2281 struct data_reference *dra = DDR_A (ddr);
2282 struct data_reference *drb = DDR_B (ddr);
2283 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2284 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2285 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2286 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2288 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2289 return;
2291 if (dra == drb)
2292 return;
2294 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2295 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2296 return;
2298 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2299 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2300 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2301 return;
2303 /* Two references with distance zero have the same alignment. */
2304 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2305 - wi::to_poly_offset (DR_INIT (drb)));
2306 if (maybe_ne (diff, 0))
2308 /* Get the wider of the two alignments. */
2309 poly_uint64 align_a =
2310 exact_div (vect_calculate_target_alignment (dr_info_a),
2311 BITS_PER_UNIT);
2312 poly_uint64 align_b =
2313 exact_div (vect_calculate_target_alignment (dr_info_b),
2314 BITS_PER_UNIT);
2315 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2316 if (!align_a.is_constant (&align_a_c)
2317 || !align_b.is_constant (&align_b_c))
2318 return;
2320 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2322 /* Require the gap to be a multiple of the larger vector alignment. */
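/* E.g., with a 16-byte vector alignment, 4-byte accesses to p[i]
   and p[i + 4] are 16 bytes apart and thus share the same
   misalignment, whereas p[i] and p[i + 1] need not.  */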
2323 if (!multiple_p (diff, max_align))
2324 return;
2327 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2328 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2329 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_NOTE, vect_location,
2331 "accesses have the same alignment: %T and %T\n",
2332 DR_REF (dra), DR_REF (drb));
2336 /* Function vect_analyze_data_refs_alignment
2338 Analyze the alignment of the data-references in the loop.
2339 Return FALSE if a data reference is found that cannot be vectorized. */
2341 opt_result
2342 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2344 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2346 /* Mark groups of data references with same alignment using
2347 data dependence information. */
2348 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2349 struct data_dependence_relation *ddr;
2350 unsigned int i;
2352 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2353 vect_find_same_alignment_drs (loop_vinfo, ddr);
2355 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2356 struct data_reference *dr;
2358 vect_record_base_alignments (loop_vinfo);
2359 FOR_EACH_VEC_ELT (datarefs, i, dr)
2361 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2362 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2363 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2366 return opt_result::success ();
2370 /* Analyze alignment of DRs of stmts in NODE. */
2372 static bool
2373 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2375 /* We vectorize from the first scalar stmt in the node unless
2376 the node is permuted in which case we start from the first
2377 element in the group. */
2378 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2379 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2380 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2381 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2383 /* We need to commit to a vector type for the group now. */
2384 if (is_a <bb_vec_info> (vinfo)
2385 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2386 return false;
2388 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2389 vect_compute_data_ref_alignment (vinfo, dr_info);
2390 /* In several places we need alignment of the first element anyway. */
2391 if (dr_info != first_dr_info)
2392 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2394 /* For creating the data-ref pointer we need alignment of the
2395 first element as well. */
2396 first_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
2397 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2399 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2400 if (dr_info != first_dr_info)
2401 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2404 return true;
2407 /* Function vect_slp_analyze_instance_alignment
2409 Analyze the alignment of the data-references in the SLP instance.
2410 Return FALSE if a data reference is found that cannot be vectorized. */
2412 bool
2413 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2414 slp_instance instance)
2416 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2418 slp_tree node;
2419 unsigned i;
2420 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2421 if (! vect_slp_analyze_node_alignment (vinfo, node))
2422 return false;
2424 node = SLP_INSTANCE_TREE (instance);
2425 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2426 && ! vect_slp_analyze_node_alignment
2427 (vinfo, SLP_INSTANCE_TREE (instance)))
2428 return false;
2430 return true;
2434 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2435 accesses of legal size, step, etc. Detect gaps, single element
2436 interleaving, and other special cases. Set grouped access info.
2437 Collect groups of strided stores for further use in SLP analysis.
2438 Worker for vect_analyze_group_access. */
2440 static bool
2441 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2443 data_reference *dr = dr_info->dr;
2444 tree step = DR_STEP (dr);
2445 tree scalar_type = TREE_TYPE (DR_REF (dr));
2446 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2447 stmt_vec_info stmt_info = dr_info->stmt;
2448 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2449 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2450 HOST_WIDE_INT dr_step = -1;
2451 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2452 bool slp_impossible = false;
2454 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2455 size of the interleaving group (including gaps). */
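/* E.g., a byte step of 32 with 4-byte elements gives a GROUPSIZE
   of 8.  */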
2456 if (tree_fits_shwi_p (step))
2458 dr_step = tree_to_shwi (step);
2459 /* Check that STEP is a multiple of type size. Otherwise there is
2460 a non-element-sized gap at the end of the group which we
2461 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2462 ??? As we can handle non-constant step fine here we should
2463 simply remove uses of DR_GROUP_GAP between the last and first
2464 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2465 simply not include that gap. */
2466 if ((dr_step % type_size) != 0)
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_NOTE, vect_location,
2470 "Step %T is not a multiple of the element size"
2471 " for %T\n",
2472 step, DR_REF (dr));
2473 return false;
2475 groupsize = absu_hwi (dr_step) / type_size;
2477 else
2478 groupsize = 0;
2480 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2481 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2483 /* Check if this DR is part of an interleaving group, and is a single
2484 element of the group that is accessed in the loop. */
2486 /* Gaps are supported only for loads. STEP must be a multiple of the type
2487 size. */
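/* For example, a load of a[4*i] with 4-byte elements has a byte
   step of 16, giving a single-element group of size 4 with a gap
   of 3.  */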
2488 if (DR_IS_READ (dr)
2489 && (dr_step % type_size) == 0
2490 && groupsize > 0)
2492 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2493 DR_GROUP_SIZE (stmt_info) = groupsize;
2494 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_NOTE, vect_location,
2497 "Detected single element interleaving %T"
2498 " step %T\n",
2499 DR_REF (dr), step);
2501 return true;
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2506 "not consecutive access %G", stmt_info->stmt);
2508 if (bb_vinfo)
2510 /* Mark the statement as unvectorizable. */
2511 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2512 return true;
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2517 STMT_VINFO_STRIDED_P (stmt_info) = true;
2518 return true;
2521 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2523 /* First stmt in the interleaving chain. Check the chain. */
2524 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2525 struct data_reference *data_ref = dr;
2526 unsigned int count = 1;
2527 tree prev_init = DR_INIT (data_ref);
2528 HOST_WIDE_INT diff, gaps = 0;
2530 /* By construction, all group members have INTEGER_CST DR_INITs. */
2531 while (next)
2533 /* We never have the same DR multiple times. */
2534 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2535 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2537 data_ref = STMT_VINFO_DATA_REF (next);
2539 /* All group members have the same STEP by construction. */
2540 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2542 /* Check that the distance between two accesses is equal to the type
2543 size. Otherwise, we have gaps. */
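/* E.g., 4-byte members with DR_INITs 0 and 8 give DIFF == 2,
   i.e. a gap of one element between them.  */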
2544 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2545 - TREE_INT_CST_LOW (prev_init)) / type_size;
2546 if (diff != 1)
2548 /* FORNOW: SLP of accesses with gaps is not supported. */
2549 slp_impossible = true;
2550 if (DR_IS_WRITE (data_ref))
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 "interleaved store with gaps\n");
2555 return false;
2558 gaps += diff - 1;
2561 last_accessed_element += diff;
2563 /* Store the gap from the previous member of the group. If there is no
2564 gap in the access, DR_GROUP_GAP is always 1. */
2565 DR_GROUP_GAP (next) = diff;
2567 prev_init = DR_INIT (data_ref);
2568 next = DR_GROUP_NEXT_ELEMENT (next);
2569 /* Count the number of data-refs in the chain. */
2570 count++;
2573 if (groupsize == 0)
2574 groupsize = count + gaps;
2576 /* This could be UINT_MAX but as we are generating code in a very
2577 inefficient way we have to cap earlier. See PR78699 for example. */
2578 if (groupsize > 4096)
2580 if (dump_enabled_p ())
2581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2582 "group is too large\n");
2583 return false;
2586 /* Check that the size of the interleaving is equal to count for stores,
2587 i.e., that there are no gaps. */
2588 if (groupsize != count
2589 && !DR_IS_READ (dr))
2591 groupsize = count;
2592 STMT_VINFO_STRIDED_P (stmt_info) = true;
2595 /* If there is a gap after the last load in the group it is the
2596 difference between the groupsize and the last accessed
2597 element.
2598 When there is no gap, this difference should be 0. */
2599 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2601 DR_GROUP_SIZE (stmt_info) = groupsize;
2602 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE, vect_location,
2605 "Detected interleaving ");
2606 if (DR_IS_READ (dr))
2607 dump_printf (MSG_NOTE, "load ");
2608 else if (STMT_VINFO_STRIDED_P (stmt_info))
2609 dump_printf (MSG_NOTE, "strided store ");
2610 else
2611 dump_printf (MSG_NOTE, "store ");
2612 dump_printf (MSG_NOTE, "of size %u\n",
2613 (unsigned)groupsize);
2614 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2615 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2616 while (next)
2618 if (DR_GROUP_GAP (next) != 1)
2619 dump_printf_loc (MSG_NOTE, vect_location,
2620 "\t<gap of %d elements>\n",
2621 DR_GROUP_GAP (next) - 1);
2622 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2623 next = DR_GROUP_NEXT_ELEMENT (next);
2625 if (DR_GROUP_GAP (stmt_info) != 0)
2626 dump_printf_loc (MSG_NOTE, vect_location,
2627 "\t<gap of %d elements>\n",
2628 DR_GROUP_GAP (stmt_info));
2631 /* SLP: create an SLP data structure for every interleaving group of
2632 stores for further analysis in vect_analyze_slp. */
2633 if (DR_IS_WRITE (dr) && !slp_impossible)
2635 if (loop_vinfo)
2636 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2637 if (bb_vinfo)
2638 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2642 return true;
2645 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2646 accesses of legal size, step, etc. Detect gaps, single element
2647 interleaving, and other special cases. Set grouped access info.
2648 Collect groups of strided stores for further use in SLP analysis. */
2650 static bool
2651 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2653 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2655 /* Dissolve the group if present. */
2656 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2657 while (stmt_info)
2659 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2660 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2661 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2662 stmt_info = next;
2664 return false;
2666 return true;
2669 /* Analyze the access pattern of the data-reference DR_INFO.
2670 In case of non-consecutive accesses call vect_analyze_group_access() to
2671 analyze groups of accesses. */
2673 static bool
2674 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2676 data_reference *dr = dr_info->dr;
2677 tree step = DR_STEP (dr);
2678 tree scalar_type = TREE_TYPE (DR_REF (dr));
2679 stmt_vec_info stmt_info = dr_info->stmt;
2680 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2681 class loop *loop = NULL;
2683 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2684 return true;
2686 if (loop_vinfo)
2687 loop = LOOP_VINFO_LOOP (loop_vinfo);
2689 if (loop_vinfo && !step)
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2693 "bad data-ref access in loop\n");
2694 return false;
2697 /* Allow loads with zero step in inner-loop vectorization. */
2698 if (loop_vinfo && integer_zerop (step))
2700 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2701 if (!nested_in_vect_loop_p (loop, stmt_info))
2702 return DR_IS_READ (dr);
2703 /* Allow references with zero step for outer loops marked
2704 with pragma omp simd only - it guarantees absence of
2705 loop-carried dependencies between inner loop iterations. */
2706 if (loop->safelen < 2)
2708 if (dump_enabled_p ())
2709 dump_printf_loc (MSG_NOTE, vect_location,
2710 "zero step in inner loop of nest\n");
2711 return false;
2715 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2717 /* Interleaved accesses are not yet supported within outer-loop
2718 vectorization for references in the inner-loop. */
2719 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2721 /* For the rest of the analysis we use the outer-loop step. */
2722 step = STMT_VINFO_DR_STEP (stmt_info);
2723 if (integer_zerop (step))
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location,
2727 "zero step in outer loop.\n");
2728 return DR_IS_READ (dr);
2732 /* Consecutive? */
2733 if (TREE_CODE (step) == INTEGER_CST)
2735 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2736 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2737 || (dr_step < 0
2738 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2740 /* Mark that it is not interleaving. */
2741 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2742 return true;
2746 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2748 if (dump_enabled_p ())
2749 dump_printf_loc (MSG_NOTE, vect_location,
2750 "grouped access in outer loop.\n");
2751 return false;
2755 /* Assume this is a DR handled by non-constant strided load case. */
2756 if (TREE_CODE (step) != INTEGER_CST)
2757 return (STMT_VINFO_STRIDED_P (stmt_info)
2758 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2759 || vect_analyze_group_access (vinfo, dr_info)));
2761 /* Not a consecutive access - check if it's part of an interleaving group. */
2762 return vect_analyze_group_access (vinfo, dr_info);
2765 typedef std::pair<data_reference_p, int> data_ref_pair;
2767 /* Compare two data-references DRA and DRB so that they can be sorted
2768 into chunks suitable for grouping. */
2770 static int
2771 dr_group_sort_cmp (const void *dra_, const void *drb_)
2773 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2774 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2775 data_reference_p dra = dra_pair.first;
2776 data_reference_p drb = drb_pair.first;
2777 int cmp;
2779 /* Stabilize sort. */
2780 if (dra == drb)
2781 return 0;
2783 /* DRs in different basic-blocks never belong to the same group. */
2784 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2785 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2786 if (bb_index1 != bb_index2)
2787 return bb_index1 < bb_index2 ? -1 : 1;
2789 /* DRs with different group IDs never belong to the same group. */
2790 if (dra_pair.second != drb_pair.second)
2791 return dra_pair.second < drb_pair.second ? -1 : 1;
2793 /* Ordering of DRs according to base. */
2794 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2795 DR_BASE_ADDRESS (drb));
2796 if (cmp != 0)
2797 return cmp;
2799 /* And according to DR_OFFSET. */
2800 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2801 if (cmp != 0)
2802 return cmp;
2804 /* Put reads before writes. */
2805 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2806 return DR_IS_READ (dra) ? -1 : 1;
2808 /* Then sort after access size. */
2809 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2810 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2811 if (cmp != 0)
2812 return cmp;
2814 /* And after step. */
2815 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2816 if (cmp != 0)
2817 return cmp;
2819 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2820 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2821 if (cmp == 0)
2822 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2823 return cmp;
2826 /* If OP is the result of a conversion, return the unconverted value,
2827 otherwise return null. */
2829 static tree
2830 strip_conversion (tree op)
2832 if (TREE_CODE (op) != SSA_NAME)
2833 return NULL_TREE;
2834 gimple *stmt = SSA_NAME_DEF_STMT (op);
2835 if (!is_gimple_assign (stmt)
2836 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2837 return NULL_TREE;
2838 return gimple_assign_rhs1 (stmt);
2841 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2842 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2843 be grouped in SLP mode. */
2845 static bool
2846 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2847 bool allow_slp_p)
2849 if (gimple_assign_single_p (stmt1_info->stmt))
2850 return gimple_assign_single_p (stmt2_info->stmt);
2852 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2853 if (call1 && gimple_call_internal_p (call1))
2855 /* Check for two masked loads or two masked stores. */
2856 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2857 if (!call2 || !gimple_call_internal_p (call2))
2858 return false;
2859 internal_fn ifn = gimple_call_internal_fn (call1);
2860 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2861 return false;
2862 if (ifn != gimple_call_internal_fn (call2))
2863 return false;
2865 /* Check that the masks are the same. Cope with casts of masks,
2866 like those created by build_mask_conversion. */
2867 tree mask1 = gimple_call_arg (call1, 2);
2868 tree mask2 = gimple_call_arg (call2, 2);
2869 if (!operand_equal_p (mask1, mask2, 0)
2870 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2872 mask1 = strip_conversion (mask1);
2873 if (!mask1)
2874 return false;
2875 mask2 = strip_conversion (mask2);
2876 if (!mask2)
2877 return false;
2878 if (!operand_equal_p (mask1, mask2, 0))
2879 return false;
2881 return true;
2884 return false;
2887 /* Function vect_analyze_data_ref_accesses.
2889 Analyze the access pattern of all the data references in the loop.
2891 FORNOW: the only access pattern that is considered vectorizable is a
2892 simple step 1 (consecutive) access.
2894 FORNOW: handle only arrays and pointer accesses. */
2896 opt_result
2897 vect_analyze_data_ref_accesses (vec_info *vinfo,
2898 vec<int> *dataref_groups)
2900 unsigned int i;
2901 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2903 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2905 if (datarefs.is_empty ())
2906 return opt_result::success ();
2908 /* Sort the array of datarefs to make building the interleaving chains
2909 linear. Don't modify the original vector's order; it is needed for
2910 determining what dependencies are reversed. */
2911 vec<data_ref_pair> datarefs_copy;
2912 datarefs_copy.create (datarefs.length ());
2913 for (unsigned i = 0; i < datarefs.length (); i++)
2915 int group_id = dataref_groups ? (*dataref_groups)[i] : 0;
2916 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2918 datarefs_copy.qsort (dr_group_sort_cmp);
2919 hash_set<stmt_vec_info> to_fixup;
2921 /* Build the interleaving chains. */
2922 for (i = 0; i < datarefs_copy.length () - 1;)
2924 data_reference_p dra = datarefs_copy[i].first;
2925 int dra_group_id = datarefs_copy[i].second;
2926 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2927 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2928 stmt_vec_info lastinfo = NULL;
2929 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2930 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2932 ++i;
2933 continue;
2935 for (i = i + 1; i < datarefs_copy.length (); ++i)
2937 data_reference_p drb = datarefs_copy[i].first;
2938 int drb_group_id = datarefs_copy[i].second;
2939 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2940 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2941 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2942 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2943 break;
2945 /* ??? Imperfect sorting (non-compatible types, non-modulo
2946 accesses, same accesses) can lead to a group being artificially
2947 split here as we don't just skip over those. If it really
2948 matters we can push those to a worklist and re-iterate
2949 over them. Then we can just skip ahead to the next DR here. */
2951 /* DRs in different BBs should not be put into the same
2952 interleaving group. */
2953 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2954 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2955 if (bb_index1 != bb_index2)
2956 break;
2958 if (dra_group_id != drb_group_id)
2959 break;
2961 /* Check that the data-refs have the same first location (except init)
2962 and they are both either store or load (not load and store,
2963 not masked loads or stores). */
2964 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2965 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2966 DR_BASE_ADDRESS (drb)) != 0
2967 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2968 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2969 break;
2971 /* Check that the data-refs have the same constant size. */
2972 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2973 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2974 if (!tree_fits_uhwi_p (sza)
2975 || !tree_fits_uhwi_p (szb)
2976 || !tree_int_cst_equal (sza, szb))
2977 break;
2979 /* Check that the data-refs have the same step. */
2980 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2981 break;
2983 /* Check the types are compatible.
2984 ??? We don't distinguish this during sorting. */
2985 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2986 TREE_TYPE (DR_REF (drb))))
2987 break;
2989 /* Check that the DR_INITs are compile-time constants. */
2990 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2991 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2992 break;
2994 /* Different .GOMP_SIMD_LANE calls still give the same lane,
2995 just hold extra information. */
2996 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
2997 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
2998 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
2999 break;
3001 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3002 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3003 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3004 HOST_WIDE_INT init_prev
3005 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3006 gcc_assert (init_a <= init_b
3007 && init_a <= init_prev
3008 && init_prev <= init_b);
3010 /* Do not place the same access in the interleaving chain twice. */
3011 if (init_b == init_prev)
3013 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3014 < gimple_uid (DR_STMT (drb)));
3015 /* Simply link in duplicates and fix up the chain below. */
3017 else
3019 /* If init_b == init_a + the size of the type * k, we have an
3020 interleaving, and DRA is accessed before DRB. */
3021 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3022 if (type_size_a == 0
3023 || (init_b - init_a) % type_size_a != 0)
3024 break;
3026 /* If we have a store, require the accesses to be adjacent. This splits
3027 groups into chunks we support (we don't support vectorization
3028 of stores with gaps). */
3029 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3030 break;
3032 /* If the step (when nonzero and constant) is smaller than the
3033 difference between the data-refs' inits, this splits groups into
3034 suitable sizes. */
3035 if (tree_fits_shwi_p (DR_STEP (dra)))
3037 unsigned HOST_WIDE_INT step
3038 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3039 if (step != 0
3040 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3041 break;
3045 if (dump_enabled_p ())
3046 dump_printf_loc (MSG_NOTE, vect_location,
3047 DR_IS_READ (dra)
3048 ? "Detected interleaving load %T and %T\n"
3049 : "Detected interleaving store %T and %T\n",
3050 DR_REF (dra), DR_REF (drb));
3052 /* Link the found element into the group list. */
3053 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3055 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3056 lastinfo = stmtinfo_a;
3058 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3059 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3060 lastinfo = stmtinfo_b;
3062 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3063 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3065 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "Load suitable for SLP vectorization only.\n");
3069 if (init_b == init_prev
3070 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3071 && dump_enabled_p ())
3072 dump_printf_loc (MSG_NOTE, vect_location,
3073 "Queuing group with duplicate access for fixup\n");
3077 /* Fixup groups with duplicate entries by splitting them. */
3078 while (1)
3080 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3081 if (!(it != to_fixup.end ()))
3082 break;
3083 stmt_vec_info grp = *it;
3084 to_fixup.remove (grp);
3086 /* Find the earliest duplicate group member. */
3087 unsigned first_duplicate = -1u;
3088 stmt_vec_info next, g = grp;
3089 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3091 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3092 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3093 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3094 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3095 g = next;
3097 if (first_duplicate == -1U)
3098 continue;
3100 /* Then move all stmts after the first duplicate to a new group.
3101 Note this is a heuristic but one with the property that *it
3102 is fixed up completely. */
3103 g = grp;
3104 stmt_vec_info newgroup = NULL, ng = grp;
3105 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3107 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3109 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3110 if (!newgroup)
3111 newgroup = next;
3112 else
3113 DR_GROUP_NEXT_ELEMENT (ng) = next;
3114 ng = next;
3115 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3117 else
3118 g = DR_GROUP_NEXT_ELEMENT (g);
3120 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3122 /* Fixup the new group which still may contain duplicates. */
3123 to_fixup.add (newgroup);
3126 data_ref_pair *dr_pair;
3127 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3129 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3130 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3131 && !vect_analyze_data_ref_access (vinfo, dr_info))
3133 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3135 "not vectorized: complicated access pattern.\n");
3137 if (is_a <bb_vec_info> (vinfo))
3139 /* Mark the statement as not vectorizable. */
3140 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3141 continue;
3143 else
3145 datarefs_copy.release ();
3146 return opt_result::failure_at (dr_info->stmt->stmt,
3147 "not vectorized:"
3148 " complicated access pattern.\n");
3153 datarefs_copy.release ();
3154 return opt_result::success ();
3157 /* Function vect_vfa_segment_size.
3159 Input:
3160 DR_INFO: The data reference.
3161 LENGTH_FACTOR: segment length to consider.
3163 Return a value suitable for the dr_with_seg_len::seg_len field.
3164 This is the "distance travelled" by the pointer from the first
3165 iteration in the segment to the last. Note that it does not include
3166 the size of the access; in effect it only describes the first byte. */
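/* For example, a 4-byte DR_STEP and a LENGTH_FACTOR of 8 give a
   segment length of (8 - 1) * 4 == 28 bytes.  */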
3168 static tree
3169 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3171 length_factor = size_binop (MINUS_EXPR,
3172 fold_convert (sizetype, length_factor),
3173 size_one_node);
3174 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3175 length_factor);
3178 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3179 gives the worst-case number of bytes covered by the segment. */
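/* For example, a 4-byte reference that starts a group of size 4
   with a gap of 1 covers 4 * (4 - 1) == 12 bytes.  */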
3181 static unsigned HOST_WIDE_INT
3182 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3184 stmt_vec_info stmt_vinfo = dr_info->stmt;
3185 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3186 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3187 unsigned HOST_WIDE_INT access_size = ref_size;
3188 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3190 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3191 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3193 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3194 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3195 == dr_explicit_realign_optimized))
3197 /* We might access a full vector's worth. */
3198 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3199 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3201 return access_size;
3204 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3205 describes. */
3207 static unsigned int
3208 vect_vfa_align (dr_vec_info *dr_info)
3210 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3213 /* Function vect_compile_time_alias.
3215 Given data references A and B with equal base and offset, see whether
3216 the alias relation can be decided at compilation time. Return 1 if
3217 it can and the references alias, 0 if it can and the references do
3218 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3219 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3220 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3222 static int
3223 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3224 tree segment_length_a, tree segment_length_b,
3225 unsigned HOST_WIDE_INT access_size_a,
3226 unsigned HOST_WIDE_INT access_size_b)
3228 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3229 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3230 poly_uint64 const_length_a;
3231 poly_uint64 const_length_b;
3233 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3234 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3235 [a, a+12) */
3236 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3238 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3239 offset_a -= const_length_a;
3241 else
3242 const_length_a = tree_to_poly_uint64 (segment_length_a);
3243 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3245 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3246 offset_b -= const_length_b;
3248 else
3249 const_length_b = tree_to_poly_uint64 (segment_length_b);
3251 const_length_a += access_size_a;
3252 const_length_b += access_size_b;
3254 if (ranges_known_overlap_p (offset_a, const_length_a,
3255 offset_b, const_length_b))
3256 return 1;
3258 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3259 offset_b, const_length_b))
3260 return 0;
3262 return -1;
3265 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3266 in DDR is >= VF. */
3268 static bool
3269 dependence_distance_ge_vf (data_dependence_relation *ddr,
3270 unsigned int loop_depth, poly_uint64 vf)
3272 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3273 || DDR_NUM_DIST_VECTS (ddr) == 0)
3274 return false;
3276 /* If the dependence is exact, we should have limited the VF instead. */
3277 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3279 unsigned int i;
3280 lambda_vector dist_v;
3281 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3283 HOST_WIDE_INT dist = dist_v[loop_depth];
3284 if (dist != 0
3285 && !(dist > 0 && DDR_REVERSED_P (ddr))
3286 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3287 return false;
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE, vect_location,
3292 "dependence distance between %T and %T is >= VF\n",
3293 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3295 return true;
3298 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3300 static void
3301 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3303 dump_printf (dump_kind, "%s (%T) >= ",
3304 lower_bound.unsigned_p ? "unsigned" : "abs",
3305 lower_bound.expr);
3306 dump_dec (dump_kind, lower_bound.min_value);
3309 /* Record that the vectorized loop requires the vec_lower_bound described
3310 by EXPR, UNSIGNED_P and MIN_VALUE. */
3312 static void
3313 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3314 poly_uint64 min_value)
3316 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3317 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3318 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3320 unsigned_p &= lower_bounds[i].unsigned_p;
3321 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3322 if (lower_bounds[i].unsigned_p != unsigned_p
3323 || maybe_lt (lower_bounds[i].min_value, min_value))
3325 lower_bounds[i].unsigned_p = unsigned_p;
3326 lower_bounds[i].min_value = min_value;
3327 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_NOTE, vect_location,
3330 "updating run-time check to ");
3331 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3332 dump_printf (MSG_NOTE, "\n");
3335 return;
3338 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3339 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3342 dump_lower_bound (MSG_NOTE, lower_bound);
3343 dump_printf (MSG_NOTE, "\n");
3345 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3348 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3349 will span fewer than GAP bytes. */
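/* For example, with an estimated VF of 4 and an ungrouped 8-byte
   access, a gap of up to 4 * 8 == 32 bytes counts as small.  */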
3351 static bool
3352 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3353 poly_int64 gap)
3355 stmt_vec_info stmt_info = dr_info->stmt;
3356 HOST_WIDE_INT count
3357 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3358 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3359 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3360 return (estimated_poly_value (gap)
3361 <= count * vect_get_scalar_dr_size (dr_info));
3364 /* Return true if we know that there is no alias between DR_INFO_A and
3365 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3366 When returning true, set *LOWER_BOUND_OUT to this N. */
3368 static bool
3369 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3370 poly_uint64 *lower_bound_out)
3372 /* Check that there is a constant gap of known sign between DR_A
3373 and DR_B. */
3374 data_reference *dr_a = dr_info_a->dr;
3375 data_reference *dr_b = dr_info_b->dr;
3376 poly_int64 init_a, init_b;
3377 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3378 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3379 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3380 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3381 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3382 || !ordered_p (init_a, init_b))
3383 return false;
3385 /* Sort DR_A and DR_B by the address they access. */
3386 if (maybe_lt (init_b, init_a))
3388 std::swap (init_a, init_b);
3389 std::swap (dr_info_a, dr_info_b);
3390 std::swap (dr_a, dr_b);
3393 /* If the two accesses could be dependent within a scalar iteration,
3394 make sure that we'd retain their order. */
3395 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3396 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3397 return false;
3399 /* There is no alias if abs (DR_STEP) is greater than or equal to
3400 the bytes spanned by the combination of the two accesses. */
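/* E.g., two 4-byte accesses at byte offsets 0 and 4 give a lower
   bound of 4 + 4 - 0 == 8, so a step of at least 8 bytes rules out
   an alias between them.  */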
3401 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3402 return true;
3405 /* Function vect_prune_runtime_alias_test_list.
3407 Prune a list of ddrs to be tested at run-time by versioning for alias.
3408 Merge several alias checks into one if possible.
3409 Return FALSE if the resulting list of ddrs is longer than allowed by
3410 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3412 opt_result
3413 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3415 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3416 hash_set <tree_pair_hash> compared_objects;
3418 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3419 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3420 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3421 vec<vec_object_pair> &check_unequal_addrs
3422 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3423 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3424 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3426 ddr_p ddr;
3427 unsigned int i;
3428 tree length_factor;
3430 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3432 /* Step values are irrelevant for aliasing if the number of vector
3433 iterations is equal to the number of scalar iterations (which can
3434 happen for fully-SLP loops). */
3435 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3437 if (!ignore_step_p)
3439 /* Convert the checks for nonzero steps into bound tests. */
3440 tree value;
3441 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3442 vect_check_lower_bound (loop_vinfo, value, true, 1);
3445 if (may_alias_ddrs.is_empty ())
3446 return opt_result::success ();
3448 comp_alias_ddrs.create (may_alias_ddrs.length ());
3450 unsigned int loop_depth
3451 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3452 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3454 /* First, we collect all data ref pairs for aliasing checks. */
3455 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3457 poly_uint64 lower_bound;
3458 tree segment_length_a, segment_length_b;
3459 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3460 unsigned int align_a, align_b;
3462 /* Ignore the alias if the VF we chose ended up being no greater
3463 than the dependence distance. */
3464 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3465 continue;
3467 if (DDR_OBJECT_A (ddr))
3469 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3470 if (!compared_objects.add (new_pair))
3472 if (dump_enabled_p ())
3473 dump_printf_loc (MSG_NOTE, vect_location,
3474 "checking that %T and %T"
3475 " have different addresses\n",
3476 new_pair.first, new_pair.second);
3477 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3479 continue;
3482 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3483 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3485 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3486 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3488 bool preserves_scalar_order_p
3489 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3491 /* Skip the pair if inter-iteration dependencies are irrelevant
3492 and intra-iteration dependencies are guaranteed to be honored. */
3493 if (ignore_step_p
3494 && (preserves_scalar_order_p
3495 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3496 &lower_bound)))
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_NOTE, vect_location,
3500 "no need for alias check between "
3501 "%T and %T when VF is 1\n",
3502 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3503 continue;
3506 /* See whether we can handle the alias using a bounds check on
3507 the step, and whether that's likely to be the best approach.
3508 (It might not be, for example, if the minimum step is much larger
3509 than the number of bytes handled by one vector iteration.) */
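  /* The vect_small_gap_p tests below implement that heuristic: take this
     path only if, for at least one of the two references, the required
     lower bound fits within the bytes spanned by one vector iteration.  */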
3510 if (!ignore_step_p
3511 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3512 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3513 &lower_bound)
3514 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3515 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3517 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3518 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3521 "%T and %T when the step %T is outside ",
3522 DR_REF (dr_info_a->dr),
3523 DR_REF (dr_info_b->dr),
3524 DR_STEP (dr_info_a->dr));
3525 if (unsigned_p)
3526 dump_printf (MSG_NOTE, "[0");
3527 else
3529 dump_printf (MSG_NOTE, "(");
3530 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3532 dump_printf (MSG_NOTE, ", ");
3533 dump_dec (MSG_NOTE, lower_bound);
3534 dump_printf (MSG_NOTE, ")\n");
3536 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3537 unsigned_p, lower_bound);
3538 continue;
3541 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3542 if (dr_group_first_a)
3544 stmt_info_a = dr_group_first_a;
3545 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3548 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3549 if (dr_group_first_b)
3551 stmt_info_b = dr_group_first_b;
3552 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3555 if (ignore_step_p)
3557 segment_length_a = size_zero_node;
3558 segment_length_b = size_zero_node;
3560 else
3562 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3563 DR_STEP (dr_info_b->dr), 0))
3564 length_factor = scalar_loop_iters;
3565 else
3566 length_factor = size_int (vect_factor);
3567 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3568 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3570 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3571 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3572 align_a = vect_vfa_align (dr_info_a);
3573 align_b = vect_vfa_align (dr_info_b);
3575 /* See whether the alias is known at compilation time. */
3576 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3577 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3578 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3579 DR_OFFSET (dr_info_b->dr), 0)
3580 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3581 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3582 && poly_int_tree_p (segment_length_a)
3583 && poly_int_tree_p (segment_length_b))
3585 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3586 segment_length_a,
3587 segment_length_b,
3588 access_size_a,
3589 access_size_b);
3590 if (res >= 0 && dump_enabled_p ())
3592 dump_printf_loc (MSG_NOTE, vect_location,
3593 "can tell at compile time that %T and %T",
3594 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3595 if (res == 0)
3596 dump_printf (MSG_NOTE, " do not alias\n");
3597 else
3598 dump_printf (MSG_NOTE, " alias\n");
3601 if (res == 0)
3602 continue;
3604 if (res == 1)
3605 return opt_result::failure_at (stmt_info_b->stmt,
3606 "not vectorized:"
3607 " compilation time alias: %G%G",
3608 stmt_info_a->stmt,
3609 stmt_info_b->stmt);
3612 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3613 access_size_a, align_a);
3614 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3615 access_size_b, align_b);
3616 /* Canonicalize the order to be the one that's needed for accurate
3617 RAW, WAR and WAW flags, in cases where the data references are
3618 well-ordered. The order doesn't really matter otherwise,
3619 but we might as well be consistent. */
3620 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3621 std::swap (dr_a, dr_b);
3623 dr_with_seg_len_pair_t dr_with_seg_len_pair
3624 (dr_a, dr_b, (preserves_scalar_order_p
3625 ? dr_with_seg_len_pair_t::WELL_ORDERED
3626 : dr_with_seg_len_pair_t::REORDERED));
3628 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3631 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3633 unsigned int count = (comp_alias_ddrs.length ()
3634 + check_unequal_addrs.length ());
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "improved number of alias checks from %d to %d\n",
3639 may_alias_ddrs.length (), count);
3640 unsigned limit = param_vect_max_version_for_alias_checks;
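  /* With the cheap cost model, tolerate only 60% of the usual number of
     runtime alias checks before giving up on versioning.  */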
3641 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3642 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3643 if (count > limit)
3644 return opt_result::failure_at
3645 (vect_location,
3646 "number of versioning for alias run-time tests exceeds %d "
3647 "(--param vect-max-version-for-alias-checks)\n", limit);
3649 return opt_result::success ();
3652 /* Check whether we can use an internal function for a gather load
3653 or scatter store. READ_P is true for loads and false for stores.
3654 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3655 the type of the memory elements being loaded or stored. OFFSET_TYPE
3656 is the type of the offset that is being applied to the invariant
3657 base address. SCALE is the amount by which the offset should
3658 be multiplied *after* it has been converted to address width.
3660 Return true if the function is supported, storing the function id in
3661 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3663 bool
3664 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3665 tree vectype, tree memory_type, tree offset_type,
3666 int scale, internal_fn *ifn_out,
3667 tree *offset_vectype_out)
3669 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3670 unsigned int element_bits = vector_element_bits (vectype);
3671 if (element_bits != memory_bits)
3672 /* For now the vector elements must be the same width as the
3673 memory elements. */
3674 return false;
3676 /* Work out which function we need. */
3677 internal_fn ifn;
3678 if (read_p)
3679 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3680 else
3681 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
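  /* Try OFFSET_TYPE as given; if the target does not support the
     combination, retry with offset types of successively doubled precision
     until either a supported combination is found or the offset type is
     already at least pointer-sized and at least as wide as the vector
     elements.  */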
3683 for (;;)
3685 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3686 if (!offset_vectype)
3687 return false;
3689 /* Test whether the target supports this combination. */
3690 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3691 offset_vectype, scale))
3693 *ifn_out = ifn;
3694 *offset_vectype_out = offset_vectype;
3695 return true;
3698 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3699 && TYPE_PRECISION (offset_type) >= element_bits)
3700 return false;
3702 offset_type = build_nonstandard_integer_type
3703 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3707 /* STMT_INFO is a call to an internal gather load or scatter store function.
3708 Describe the operation in INFO. */
3710 static void
3711 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3712 gather_scatter_info *info)
3714 gcall *call = as_a <gcall *> (stmt_info->stmt);
3715 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3716 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3718 info->ifn = gimple_call_internal_fn (call);
3719 info->decl = NULL_TREE;
3720 info->base = gimple_call_arg (call, 0);
3721 info->offset = gimple_call_arg (call, 1);
3722 info->offset_dt = vect_unknown_def_type;
3723 info->offset_vectype = NULL_TREE;
3724 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3725 info->element_type = TREE_TYPE (vectype);
3726 info->memory_type = TREE_TYPE (DR_REF (dr));
3729 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3730 gather load or scatter store. Describe the operation in *INFO if so. */
3732 bool
3733 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3734 gather_scatter_info *info)
3736 HOST_WIDE_INT scale = 1;
3737 poly_int64 pbitpos, pbitsize;
3738 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3739 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3740 tree offtype = NULL_TREE;
3741 tree decl = NULL_TREE, base, off;
3742 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3743 tree memory_type = TREE_TYPE (DR_REF (dr));
3744 machine_mode pmode;
3745 int punsignedp, reversep, pvolatilep = 0;
3746 internal_fn ifn;
3747 tree offset_vectype;
3748 bool masked_p = false;
3750 /* See whether this is already a call to a gather/scatter internal function.
3751 If not, see whether it's a masked load or store. */
3752 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3753 if (call && gimple_call_internal_p (call))
3755 ifn = gimple_call_internal_fn (call);
3756 if (internal_gather_scatter_fn_p (ifn))
3758 vect_describe_gather_scatter_call (stmt_info, info);
3759 return true;
3761 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3764 /* True if we should aim to use internal functions rather than
3765 built-in functions. */
3766 bool use_ifn_p = (DR_IS_READ (dr)
3767 ? supports_vec_gather_load_p ()
3768 : supports_vec_scatter_store_p ());
3770 base = DR_REF (dr);
3771 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3772 see if we can use the def stmt of the address. */
3773 if (masked_p
3774 && TREE_CODE (base) == MEM_REF
3775 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3776 && integer_zerop (TREE_OPERAND (base, 1))
3777 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3779 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3780 if (is_gimple_assign (def_stmt)
3781 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3782 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3785 /* The gather and scatter builtins need an address of the form
3786 loop_invariant + vector * {1, 2, 4, 8}
3787 or
3788 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3789 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3790 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3791 multiplications and additions in it. To get a vector, we need
3792 a single SSA_NAME that will be defined in the loop and will
3793 contain everything that is not loop invariant and that can be
3794 vectorized. The following code attempts to find such a preexisting
3795 SSA_NAME OFF and put the loop invariants into a tree BASE
3796 that can be gimplified before the loop. */
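  /* For example (illustrative only): for an access like p[idx[i]] with
     loop-invariant P, the intent is that OFF ends up as the SSA_NAME holding
     the index, the element size is absorbed into SCALE, and P together with
     any constant byte offset is folded into BASE.  */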
3797 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3798 &punsignedp, &reversep, &pvolatilep);
3799 if (reversep)
3800 return false;
3802 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3804 if (TREE_CODE (base) == MEM_REF)
3806 if (!integer_zerop (TREE_OPERAND (base, 1)))
3808 if (off == NULL_TREE)
3809 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3810 else
3811 off = size_binop (PLUS_EXPR, off,
3812 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3814 base = TREE_OPERAND (base, 0);
3816 else
3817 base = build_fold_addr_expr (base);
3819 if (off == NULL_TREE)
3820 off = size_zero_node;
3822 /* If base is not loop invariant, then either off is 0 and we start with
3823 just the constant offset in the loop invariant BASE, continuing with
3824 base as OFF, or we give up.
3825 We could handle the latter case by gimplifying the addition of base + off
3826 into some SSA_NAME and using that as OFF, but for now punt. */
3827 if (!expr_invariant_in_loop_p (loop, base))
3829 if (!integer_zerop (off))
3830 return false;
3831 off = base;
3832 base = size_int (pbytepos);
3834 /* Otherwise put base + constant offset into the loop invariant BASE
3835 and continue with OFF. */
3836 else
3838 base = fold_convert (sizetype, base);
3839 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3842 /* OFF at this point may be either a SSA_NAME or some tree expression
3843 from get_inner_reference. Try to peel off loop invariants from it
3844 into BASE as long as possible. */
3845 STRIP_NOPS (off);
3846 while (offtype == NULL_TREE)
3848 enum tree_code code;
3849 tree op0, op1, add = NULL_TREE;
3851 if (TREE_CODE (off) == SSA_NAME)
3853 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3855 if (expr_invariant_in_loop_p (loop, off))
3856 return false;
3858 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3859 break;
3861 op0 = gimple_assign_rhs1 (def_stmt);
3862 code = gimple_assign_rhs_code (def_stmt);
3863 op1 = gimple_assign_rhs2 (def_stmt);
3865 else
3867 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3868 return false;
3869 code = TREE_CODE (off);
3870 extract_ops_from_tree (off, &code, &op0, &op1);
3872 switch (code)
3874 case POINTER_PLUS_EXPR:
3875 case PLUS_EXPR:
3876 if (expr_invariant_in_loop_p (loop, op0))
3878 add = op0;
3879 off = op1;
3880 do_add:
3881 add = fold_convert (sizetype, add);
3882 if (scale != 1)
3883 add = size_binop (MULT_EXPR, add, size_int (scale));
3884 base = size_binop (PLUS_EXPR, base, add);
3885 continue;
3887 if (expr_invariant_in_loop_p (loop, op1))
3889 add = op1;
3890 off = op0;
3891 goto do_add;
3893 break;
3894 case MINUS_EXPR:
3895 if (expr_invariant_in_loop_p (loop, op1))
3897 add = fold_convert (sizetype, op1);
3898 add = size_binop (MINUS_EXPR, size_zero_node, add);
3899 off = op0;
3900 goto do_add;
3902 break;
3903 case MULT_EXPR:
3904 if (scale == 1 && tree_fits_shwi_p (op1))
3906 int new_scale = tree_to_shwi (op1);
3907 /* Only treat this as a scaling operation if the target
3908 supports it for at least some offset type. */
3909 if (use_ifn_p
3910 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3911 masked_p, vectype, memory_type,
3912 signed_char_type_node,
3913 new_scale, &ifn,
3914 &offset_vectype)
3915 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3916 masked_p, vectype, memory_type,
3917 unsigned_char_type_node,
3918 new_scale, &ifn,
3919 &offset_vectype))
3920 break;
3921 scale = new_scale;
3922 off = op0;
3923 continue;
3925 break;
3926 case SSA_NAME:
3927 off = op0;
3928 continue;
3929 CASE_CONVERT:
3930 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3931 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3932 break;
3934 /* Don't include the conversion if the target is happy with
3935 the current offset type. */
3936 if (use_ifn_p
3937 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3938 masked_p, vectype, memory_type,
3939 TREE_TYPE (off), scale, &ifn,
3940 &offset_vectype))
3941 break;
3943 if (TYPE_PRECISION (TREE_TYPE (op0))
3944 == TYPE_PRECISION (TREE_TYPE (off)))
3946 off = op0;
3947 continue;
3950 if (TYPE_PRECISION (TREE_TYPE (op0))
3951 < TYPE_PRECISION (TREE_TYPE (off)))
3953 off = op0;
3954 offtype = TREE_TYPE (off);
3955 STRIP_NOPS (off);
3956 continue;
3958 break;
3959 default:
3960 break;
3962 break;
3965 /* If at the end OFF still isn't a SSA_NAME or isn't
3966 defined in the loop, punt. */
3967 if (TREE_CODE (off) != SSA_NAME
3968 || expr_invariant_in_loop_p (loop, off))
3969 return false;
3971 if (offtype == NULL_TREE)
3972 offtype = TREE_TYPE (off);
3974 if (use_ifn_p)
3976 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
3977 vectype, memory_type, offtype, scale,
3978 &ifn, &offset_vectype))
3979 return false;
3981 else
3983 if (DR_IS_READ (dr))
3985 if (targetm.vectorize.builtin_gather)
3986 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3988 else
3990 if (targetm.vectorize.builtin_scatter)
3991 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3994 if (!decl)
3995 return false;
3997 ifn = IFN_LAST;
3998 /* The offset vector type will be read from DECL when needed. */
3999 offset_vectype = NULL_TREE;
4002 info->ifn = ifn;
4003 info->decl = decl;
4004 info->base = base;
4005 info->offset = off;
4006 info->offset_dt = vect_unknown_def_type;
4007 info->offset_vectype = offset_vectype;
4008 info->scale = scale;
4009 info->element_type = TREE_TYPE (vectype);
4010 info->memory_type = memory_type;
4011 return true;
4014 /* Find the data references in STMT, analyze them with respect to LOOP and
4015 append them to DATAREFS. Return false if datarefs in this stmt cannot
4016 be handled. */
4018 opt_result
4019 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4020 vec<data_reference_p> *datarefs,
4021 vec<int> *dataref_groups, int group_id)
4023 /* We can ignore clobbers for dataref analysis - they are removed during
4024 loop vectorization and BB vectorization checks dependences with a
4025 stmt walk. */
4026 if (gimple_clobber_p (stmt))
4027 return opt_result::success ();
4029 if (gimple_has_volatile_ops (stmt))
4030 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4031 stmt);
4033 if (stmt_can_throw_internal (cfun, stmt))
4034 return opt_result::failure_at (stmt,
4035 "not vectorized:"
4036 " statement can throw an exception: %G",
4037 stmt);
4039 auto_vec<data_reference_p, 2> refs;
4040 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4041 if (!res)
4042 return res;
4044 if (refs.is_empty ())
4045 return opt_result::success ();
4047 if (refs.length () > 1)
4048 return opt_result::failure_at (stmt,
4049 "not vectorized:"
4050 " more than one data ref in stmt: %G", stmt);
4052 if (gcall *call = dyn_cast <gcall *> (stmt))
4053 if (!gimple_call_internal_p (call)
4054 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4055 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4056 return opt_result::failure_at (stmt,
4057 "not vectorized: dr in a call %G", stmt);
4059 data_reference_p dr = refs.pop ();
4060 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4061 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4062 return opt_result::failure_at (stmt,
4063 "not vectorized:"
4064 " statement is bitfield access %G", stmt);
4066 if (DR_BASE_ADDRESS (dr)
4067 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4068 return opt_result::failure_at (stmt,
4069 "not vectorized:"
4070 " base addr of dr is a constant\n");
4072 /* Check whether this may be a SIMD lane access and adjust the
4073 DR to make it easier for us to handle it. */
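  /* The access is expected to have the form
     *(BASE + GOMP_SIMD_LANE (SIMDUID) * STEP) with STEP equal to the element
     size; such a DR is rewritten below to have offset 0 and step equal to
     the element size, and is marked as a simd-lane access.  */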
4074 if (loop
4075 && loop->simduid
4076 && (!DR_BASE_ADDRESS (dr)
4077 || !DR_OFFSET (dr)
4078 || !DR_INIT (dr)
4079 || !DR_STEP (dr)))
4081 struct data_reference *newdr
4082 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4083 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4084 if (DR_BASE_ADDRESS (newdr)
4085 && DR_OFFSET (newdr)
4086 && DR_INIT (newdr)
4087 && DR_STEP (newdr)
4088 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4089 && integer_zerop (DR_STEP (newdr)))
4091 tree base_address = DR_BASE_ADDRESS (newdr);
4092 tree off = DR_OFFSET (newdr);
4093 tree step = ssize_int (1);
4094 if (integer_zerop (off)
4095 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4097 off = TREE_OPERAND (base_address, 1);
4098 base_address = TREE_OPERAND (base_address, 0);
4100 STRIP_NOPS (off);
4101 if (TREE_CODE (off) == MULT_EXPR
4102 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4104 step = TREE_OPERAND (off, 1);
4105 off = TREE_OPERAND (off, 0);
4106 STRIP_NOPS (off);
4108 if (CONVERT_EXPR_P (off)
4109 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4110 < TYPE_PRECISION (TREE_TYPE (off))))
4111 off = TREE_OPERAND (off, 0);
4112 if (TREE_CODE (off) == SSA_NAME)
4114 gimple *def = SSA_NAME_DEF_STMT (off);
4115 /* Look through widening conversion. */
4116 if (is_gimple_assign (def)
4117 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4119 tree rhs1 = gimple_assign_rhs1 (def);
4120 if (TREE_CODE (rhs1) == SSA_NAME
4121 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4122 && (TYPE_PRECISION (TREE_TYPE (off))
4123 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4124 def = SSA_NAME_DEF_STMT (rhs1);
4126 if (is_gimple_call (def)
4127 && gimple_call_internal_p (def)
4128 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4130 tree arg = gimple_call_arg (def, 0);
4131 tree reft = TREE_TYPE (DR_REF (newdr));
4132 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4133 arg = SSA_NAME_VAR (arg);
4134 if (arg == loop->simduid
4135 /* For now. */
4136 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4138 DR_BASE_ADDRESS (newdr) = base_address;
4139 DR_OFFSET (newdr) = ssize_int (0);
4140 DR_STEP (newdr) = step;
4141 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4142 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4143 /* Mark as simd-lane access. */
4144 tree arg2 = gimple_call_arg (def, 1);
4145 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4146 free_data_ref (dr);
4147 datarefs->safe_push (newdr);
4148 if (dataref_groups)
4149 dataref_groups->safe_push (group_id);
4150 return opt_result::success ();
4155 free_data_ref (newdr);
4158 datarefs->safe_push (dr);
4159 if (dataref_groups)
4160 dataref_groups->safe_push (group_id);
4161 return opt_result::success ();
4164 /* Function vect_analyze_data_refs.
4166 Find all the data references in the loop or basic block.
4168 The general structure of the analysis of data refs in the vectorizer is as
4169 follows:
4170 1- vect_analyze_data_refs(loop/bb): call
4171 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4172 in the loop/bb and their dependences.
4173 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4174 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4175 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4179 opt_result
4180 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4182 class loop *loop = NULL;
4183 unsigned int i;
4184 struct data_reference *dr;
4185 tree scalar_type;
4187 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4189 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4190 loop = LOOP_VINFO_LOOP (loop_vinfo);
4192 /* Go through the data-refs, check that the analysis succeeded. Update
4193 pointer from stmt_vec_info struct to DR and vectype. */
4195 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4196 FOR_EACH_VEC_ELT (datarefs, i, dr)
4198 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4199 poly_uint64 vf;
4201 gcc_assert (DR_REF (dr));
4202 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4203 gcc_assert (!stmt_info->dr_aux.dr);
4204 stmt_info->dr_aux.dr = dr;
4205 stmt_info->dr_aux.stmt = stmt_info;
4207 /* Check that analysis of the data-ref succeeded. */
4208 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4209 || !DR_STEP (dr))
4211 bool maybe_gather
4212 = DR_IS_READ (dr)
4213 && !TREE_THIS_VOLATILE (DR_REF (dr))
4214 && (targetm.vectorize.builtin_gather != NULL
4215 || supports_vec_gather_load_p ());
4216 bool maybe_scatter
4217 = DR_IS_WRITE (dr)
4218 && !TREE_THIS_VOLATILE (DR_REF (dr))
4219 && (targetm.vectorize.builtin_scatter != NULL
4220 || supports_vec_scatter_store_p ());
4222 /* If target supports vector gather loads or scatter stores,
4223 see if they can be used. */
4224 if (is_a <loop_vec_info> (vinfo)
4225 && !nested_in_vect_loop_p (loop, stmt_info))
4227 if (maybe_gather || maybe_scatter)
4229 if (maybe_gather)
4230 gatherscatter = GATHER;
4231 else
4232 gatherscatter = SCATTER;
4236 if (gatherscatter == SG_NONE)
4238 if (dump_enabled_p ())
4239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4240 "not vectorized: data ref analysis "
4241 "failed %G", stmt_info->stmt);
4242 if (is_a <bb_vec_info> (vinfo))
4244 /* In BB vectorization the ref can still participate
4245 in dependence analysis, we just can't vectorize it. */
4246 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4247 continue;
4249 return opt_result::failure_at (stmt_info->stmt,
4250 "not vectorized:"
4251 " data ref analysis failed: %G",
4252 stmt_info->stmt);
4256 /* See if this was detected as a SIMD lane access. */
4257 if (dr->aux == (void *)-1
4258 || dr->aux == (void *)-2
4259 || dr->aux == (void *)-3
4260 || dr->aux == (void *)-4)
4262 if (nested_in_vect_loop_p (loop, stmt_info))
4263 return opt_result::failure_at (stmt_info->stmt,
4264 "not vectorized:"
4265 " data ref analysis failed: %G",
4266 stmt_info->stmt);
4267 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4268 = -(uintptr_t) dr->aux;
4271 tree base = get_base_address (DR_REF (dr));
4272 if (base && VAR_P (base) && DECL_NONALIASED (base))
4274 if (dump_enabled_p ())
4275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4276 "not vectorized: base object not addressable "
4277 "for stmt: %G", stmt_info->stmt);
4278 if (is_a <bb_vec_info> (vinfo))
4280 /* In BB vectorization the ref can still participate
4281 in dependence analysis, we just can't vectorize it. */
4282 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4283 continue;
4285 return opt_result::failure_at (stmt_info->stmt,
4286 "not vectorized: base object not"
4287 " addressable for stmt: %G",
4288 stmt_info->stmt);
4291 if (is_a <loop_vec_info> (vinfo)
4292 && DR_STEP (dr)
4293 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4295 if (nested_in_vect_loop_p (loop, stmt_info))
4296 return opt_result::failure_at (stmt_info->stmt,
4297 "not vectorized: "
4298 "not suitable for strided load %G",
4299 stmt_info->stmt);
4300 STMT_VINFO_STRIDED_P (stmt_info) = true;
4303 /* Update DR field in stmt_vec_info struct. */
4305 /* If the dataref is in an inner-loop of the loop that is considered
4306 for vectorization, we also want to analyze the access relative to
4307 the outer-loop (DR contains information only relative to the
4308 inner-most enclosing loop). We do that by building a reference to the
4309 first location accessed by the inner-loop, and analyze it relative to
4310 the outer-loop. */
4311 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4313 /* Build a reference to the first location accessed by the
4314 inner loop: *(BASE + INIT + OFFSET). By construction,
4315 this address must be invariant in the inner loop, so we
4316 can consider it as being used in the outer loop. */
4317 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4318 tree offset = unshare_expr (DR_OFFSET (dr));
4319 tree init = unshare_expr (DR_INIT (dr));
4320 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4321 init, offset);
4322 tree init_addr = fold_build_pointer_plus (base, init_offset);
4323 tree init_ref = build_fold_indirect_ref (init_addr);
4325 if (dump_enabled_p ())
4326 dump_printf_loc (MSG_NOTE, vect_location,
4327 "analyze in outer loop: %T\n", init_ref);
4329 opt_result res
4330 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4331 init_ref, loop, stmt_info->stmt);
4332 if (!res)
4333 /* dr_analyze_innermost already explained the failure. */
4334 return res;
4336 if (dump_enabled_p ())
4337 dump_printf_loc (MSG_NOTE, vect_location,
4338 "\touter base_address: %T\n"
4339 "\touter offset from base address: %T\n"
4340 "\touter constant offset from base address: %T\n"
4341 "\touter step: %T\n"
4342 "\touter base alignment: %d\n\n"
4343 "\touter base misalignment: %d\n"
4344 "\touter offset alignment: %d\n"
4345 "\touter step alignment: %d\n",
4346 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4347 STMT_VINFO_DR_OFFSET (stmt_info),
4348 STMT_VINFO_DR_INIT (stmt_info),
4349 STMT_VINFO_DR_STEP (stmt_info),
4350 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4351 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4352 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4353 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4356 /* Set vectype for STMT. */
4357 scalar_type = TREE_TYPE (DR_REF (dr));
4358 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4359 if (!vectype)
4361 if (dump_enabled_p ())
4363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4364 "not vectorized: no vectype for stmt: %G",
4365 stmt_info->stmt);
4366 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4368 scalar_type);
4369 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4372 if (is_a <bb_vec_info> (vinfo))
4374 /* No vector type is fine, the ref can still participate
4375 in dependence analysis, we just can't vectorize it. */
4376 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4377 continue;
4379 if (fatal)
4380 *fatal = false;
4381 return opt_result::failure_at (stmt_info->stmt,
4382 "not vectorized:"
4383 " no vectype for stmt: %G"
4384 " scalar_type: %T\n",
4385 stmt_info->stmt, scalar_type);
4387 else
4389 if (dump_enabled_p ())
4390 dump_printf_loc (MSG_NOTE, vect_location,
4391 "got vectype for stmt: %G%T\n",
4392 stmt_info->stmt, vectype);
4395 /* Adjust the minimal vectorization factor according to the
4396 vector type. */
4397 vf = TYPE_VECTOR_SUBPARTS (vectype);
4398 *min_vf = upper_bound (*min_vf, vf);
4400 /* Leave the BB vectorizer to pick the vector type later, based on
4401 the final dataref group size and SLP node size. */
4402 if (is_a <loop_vec_info> (vinfo))
4403 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4405 if (gatherscatter != SG_NONE)
4407 gather_scatter_info gs_info;
4408 if (!vect_check_gather_scatter (stmt_info,
4409 as_a <loop_vec_info> (vinfo),
4410 &gs_info)
4411 || !get_vectype_for_scalar_type (vinfo,
4412 TREE_TYPE (gs_info.offset)))
4414 if (fatal)
4415 *fatal = false;
4416 return opt_result::failure_at
4417 (stmt_info->stmt,
4418 (gatherscatter == GATHER)
4419 ? "not vectorized: not suitable for gather load %G"
4420 : "not vectorized: not suitable for scatter store %G",
4421 stmt_info->stmt);
4423 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4427 /* We used to stop processing and prune the list here. Verify we no
4428 longer need to. */
4429 gcc_assert (i == datarefs.length ());
4431 return opt_result::success ();
4435 /* Function vect_get_new_vect_var.
4437 Returns a name for a new variable. The current naming scheme prepends a
4438 prefix ("vect_", "stmp_", "mask_" or "vectp_", depending on the value of
4439 VAR_KIND) to the name of vectorizer-generated variables, appending NAME
4440 if provided. */
4442 tree
4443 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4445 const char *prefix;
4446 tree new_vect_var;
4448 switch (var_kind)
4450 case vect_simple_var:
4451 prefix = "vect";
4452 break;
4453 case vect_scalar_var:
4454 prefix = "stmp";
4455 break;
4456 case vect_mask_var:
4457 prefix = "mask";
4458 break;
4459 case vect_pointer_var:
4460 prefix = "vectp";
4461 break;
4462 default:
4463 gcc_unreachable ();
4466 if (name)
4468 char* tmp = concat (prefix, "_", name, NULL);
4469 new_vect_var = create_tmp_reg (type, tmp);
4470 free (tmp);
4472 else
4473 new_vect_var = create_tmp_reg (type, prefix);
4475 return new_vect_var;
4478 /* Like vect_get_new_vect_var but return an SSA name. */
4480 tree
4481 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4483 const char *prefix;
4484 tree new_vect_var;
4486 switch (var_kind)
4488 case vect_simple_var:
4489 prefix = "vect";
4490 break;
4491 case vect_scalar_var:
4492 prefix = "stmp";
4493 break;
4494 case vect_pointer_var:
4495 prefix = "vectp";
4496 break;
4497 default:
4498 gcc_unreachable ();
4501 if (name)
4503 char* tmp = concat (prefix, "_", name, NULL);
4504 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4505 free (tmp);
4507 else
4508 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4510 return new_vect_var;
4513 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4515 static void
4516 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4518 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4519 int misalign = DR_MISALIGNMENT (dr_info);
4520 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4521 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4522 else
4523 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4524 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4525 misalign);
4528 /* Function vect_create_addr_base_for_vector_ref.
4530 Create an expression that computes the address of the first memory location
4531 that will be accessed for a data reference.
4533 Input:
4534 STMT_INFO: The statement containing the data reference.
4535 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4536 OFFSET: Optional. If supplied, it is added to the initial address.
4537 LOOP: Specifies the loop-nest relative to which the address should be computed.
4538 For example, when the dataref is in an inner-loop nested in an
4539 outer-loop that is now being vectorized, LOOP can be either the
4540 outer-loop, or the inner-loop. The first memory location accessed
4541 by the following dataref ('in' points to short):
4543 for (i=0; i<N; i++)
4544 for (j=0; j<M; j++)
4545 s += in[i+j]
4547 is as follows:
4548 if LOOP=i_loop: &in (relative to i_loop)
4549 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4550 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4551 initial address. Unlike OFFSET, which is a number of elements to
4552 be added, BYTE_OFFSET is measured in bytes.
4554 Output:
4555 1. Return an SSA_NAME whose value is the address of the memory location of
4556 the first vector of the data reference.
4557 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4558 these statement(s) which define the returned SSA_NAME.
4560 FORNOW: We are only handling array accesses with step 1. */
4562 tree
4563 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4564 gimple_seq *new_stmt_list,
4565 tree offset,
4566 tree byte_offset)
4568 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4569 struct data_reference *dr = dr_info->dr;
4570 const char *base_name;
4571 tree addr_base;
4572 tree dest;
4573 gimple_seq seq = NULL;
4574 tree vect_ptr_type;
4575 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4576 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4577 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4579 tree data_ref_base = unshare_expr (drb->base_address);
4580 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4581 tree init = unshare_expr (drb->init);
4583 if (loop_vinfo)
4584 base_name = get_name (data_ref_base);
4585 else
4587 base_offset = ssize_int (0);
4588 init = ssize_int (0);
4589 base_name = get_name (DR_REF (dr));
4592 /* Create base_offset */
4593 base_offset = size_binop (PLUS_EXPR,
4594 fold_convert (sizetype, base_offset),
4595 fold_convert (sizetype, init));
4597 if (offset)
4599 offset = fold_build2 (MULT_EXPR, sizetype,
4600 fold_convert (sizetype, offset), step);
4601 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4602 base_offset, offset);
4604 if (byte_offset)
4606 byte_offset = fold_convert (sizetype, byte_offset);
4607 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4608 base_offset, byte_offset);
4611 /* base + base_offset */
4612 if (loop_vinfo)
4613 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4614 else
4616 addr_base = build1 (ADDR_EXPR,
4617 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4618 unshare_expr (DR_REF (dr)));
4621 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4622 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4623 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4624 gimple_seq_add_seq (new_stmt_list, seq);
4626 if (DR_PTR_INFO (dr)
4627 && TREE_CODE (addr_base) == SSA_NAME
4628 && !SSA_NAME_PTR_INFO (addr_base))
4630 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4631 if (offset || byte_offset)
4632 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4635 if (dump_enabled_p ())
4636 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4638 return addr_base;
4642 /* Function vect_create_data_ref_ptr.
4644 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4645 location accessed in the loop by STMT_INFO, along with the def-use update
4646 chain to appropriately advance the pointer through the loop iterations.
4647 Also set aliasing information for the pointer. This pointer is used by
4648 the callers to this function to create a memory reference expression for
4649 vector load/store access.
4651 Input:
4652 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4653 GIMPLE_ASSIGN <name, data-ref> or
4654 GIMPLE_ASSIGN <data-ref, name>.
4655 2. AGGR_TYPE: the type of the reference, which should be either a vector
4656 or an array.
4657 3. AT_LOOP: the loop where the vector memref is to be created.
4658 4. OFFSET (optional): an offset to be added to the initial address accessed
4659 by the data-ref in STMT_INFO.
4660 5. GSI: location where the new stmts are to be placed if there is no loop
4661 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4662 pointing to the initial address.
4663 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4664 to the initial address accessed by the data-ref in STMT_INFO. This is
4665 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4666 in bytes.
4667 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4668 to the IV during each iteration of the loop. NULL says to move
4669 by one copy of AGGR_TYPE up or down, depending on the step of the
4670 data reference.
4672 Output:
4673 1. Declare a new ptr to vector_type, and have it point to the base of the
4674 data reference (the initial address accessed by the data reference).
4675 For example, for vector of type V8HI, the following code is generated:
4677 v8hi *ap;
4678 ap = (v8hi *)initial_address;
4680 if OFFSET is not supplied:
4681 initial_address = &a[init];
4682 if OFFSET is supplied:
4683 initial_address = &a[init + OFFSET];
4684 if BYTE_OFFSET is supplied:
4685 initial_address = &a[init] + BYTE_OFFSET;
4687 Return the initial_address in INITIAL_ADDRESS.
4689 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4690 update the pointer in each iteration of the loop.
4692 Return the increment stmt that updates the pointer in PTR_INCR.
4694 3. Return the pointer. */
4696 tree
4697 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4698 tree aggr_type, class loop *at_loop, tree offset,
4699 tree *initial_address, gimple_stmt_iterator *gsi,
4700 gimple **ptr_incr, bool only_init,
4701 tree byte_offset, tree iv_step)
4703 const char *base_name;
4704 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4705 class loop *loop = NULL;
4706 bool nested_in_vect_loop = false;
4707 class loop *containing_loop = NULL;
4708 tree aggr_ptr_type;
4709 tree aggr_ptr;
4710 tree new_temp;
4711 gimple_seq new_stmt_list = NULL;
4712 edge pe = NULL;
4713 basic_block new_bb;
4714 tree aggr_ptr_init;
4715 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4716 struct data_reference *dr = dr_info->dr;
4717 tree aptr;
4718 gimple_stmt_iterator incr_gsi;
4719 bool insert_after;
4720 tree indx_before_incr, indx_after_incr;
4721 gimple *incr;
4722 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4724 gcc_assert (iv_step != NULL_TREE
4725 || TREE_CODE (aggr_type) == ARRAY_TYPE
4726 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4728 if (loop_vinfo)
4730 loop = LOOP_VINFO_LOOP (loop_vinfo);
4731 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4732 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4733 pe = loop_preheader_edge (loop);
4735 else
4737 gcc_assert (bb_vinfo);
4738 only_init = true;
4739 *ptr_incr = NULL;
4742 /* Create an expression for the first address accessed by this load
4743 in LOOP. */
4744 base_name = get_name (DR_BASE_ADDRESS (dr));
4746 if (dump_enabled_p ())
4748 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4749 dump_printf_loc (MSG_NOTE, vect_location,
4750 "create %s-pointer variable to type: %T",
4751 get_tree_code_name (TREE_CODE (aggr_type)),
4752 aggr_type);
4753 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4754 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4755 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4756 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4757 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4758 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4759 else
4760 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4761 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4764 /* (1) Create the new aggregate-pointer variable.
4765 Vector and array types inherit the alias set of their component
4766 type by default so we need to use a ref-all pointer if the data
4767 reference does not conflict with the created aggregated data
4768 reference because it is not addressable. */
4769 bool need_ref_all = false;
4770 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4771 get_alias_set (DR_REF (dr))))
4772 need_ref_all = true;
4773 /* Likewise for any of the data references in the stmt group. */
4774 else if (DR_GROUP_SIZE (stmt_info) > 1)
4776 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4779 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4780 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4781 get_alias_set (DR_REF (sdr))))
4783 need_ref_all = true;
4784 break;
4786 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4788 while (sinfo);
4790 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4791 need_ref_all);
4792 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4795 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4796 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4797 def-use update cycles for the pointer: one relative to the outer-loop
4798 (LOOP), which is what steps (3) and (4) below do. The other is relative
4799 to the inner-loop (which is the inner-most loop containing the dataref),
4800 and this is done by step (5) below.
4802 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4803 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4804 redundant. Steps (3),(4) create the following:
4806 vp0 = &base_addr;
4807 LOOP: vp1 = phi(vp0,vp2)
4810 vp2 = vp1 + step
4811 goto LOOP
4813 If there is an inner-loop nested in loop, then step (5) will also be
4814 applied, and an additional update in the inner-loop will be created:
4816 vp0 = &base_addr;
4817 LOOP: vp1 = phi(vp0,vp2)
4819 inner: vp3 = phi(vp1,vp4)
4820 vp4 = vp3 + inner_step
4821 if () goto inner
4823 vp2 = vp1 + step
4824 if () goto LOOP */
4826 /* (2) Calculate the initial address of the aggregate-pointer, and set
4827 the aggregate-pointer to point to it before the loop. */
4829 /* Create &(base[init_val + offset] + byte_offset) in the loop preheader. */
4831 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4832 stmt_info, &new_stmt_list,
4833 offset, byte_offset);
4834 if (new_stmt_list)
4836 if (pe)
4838 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4839 gcc_assert (!new_bb);
4841 else
4842 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4845 *initial_address = new_temp;
4846 aggr_ptr_init = new_temp;
4848 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4849 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4850 inner-loop nested in LOOP (during outer-loop vectorization). */
4852 /* No update in loop is required. */
4853 if (only_init && (!loop_vinfo || at_loop == loop))
4854 aptr = aggr_ptr_init;
4855 else
4857 /* Accesses to invariant addresses should be handled specially
4858 by the caller. */
4859 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4860 gcc_assert (!integer_zerop (step));
4862 if (iv_step == NULL_TREE)
4864 /* The step of the aggregate pointer is the type size,
4865 negated for downward accesses. */
4866 iv_step = TYPE_SIZE_UNIT (aggr_type);
4867 if (tree_int_cst_sgn (step) == -1)
4868 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4871 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4873 create_iv (aggr_ptr_init,
4874 fold_convert (aggr_ptr_type, iv_step),
4875 aggr_ptr, loop, &incr_gsi, insert_after,
4876 &indx_before_incr, &indx_after_incr);
4877 incr = gsi_stmt (incr_gsi);
4879 /* Copy the points-to information if it exists. */
4880 if (DR_PTR_INFO (dr))
4882 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4883 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4885 if (ptr_incr)
4886 *ptr_incr = incr;
4888 aptr = indx_before_incr;
4891 if (!nested_in_vect_loop || only_init)
4892 return aptr;
4895 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4896 nested in LOOP, if exists. */
4898 gcc_assert (nested_in_vect_loop);
4899 if (!only_init)
4901 standard_iv_increment_position (containing_loop, &incr_gsi,
4902 &insert_after);
4903 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4904 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4905 &indx_after_incr);
4906 incr = gsi_stmt (incr_gsi);
4908 /* Copy the points-to information if it exists. */
4909 if (DR_PTR_INFO (dr))
4911 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4912 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4914 if (ptr_incr)
4915 *ptr_incr = incr;
4917 return indx_before_incr;
4919 else
4920 gcc_unreachable ();
4924 /* Function bump_vector_ptr
4926 Increment a pointer (to a vector type) by vector-size. If requested,
4927 i.e. if PTR_INCR is given, then also connect the new increment stmt
4928 to the existing def-use update-chain of the pointer, by modifying
4929 the PTR_INCR as illustrated below:
4931 The pointer def-use update-chain before this function:
4932 DATAREF_PTR = phi (p_0, p_2)
4933 ....
4934 PTR_INCR: p_2 = DATAREF_PTR + step
4936 The pointer def-use update-chain after this function:
4937 DATAREF_PTR = phi (p_0, p_2)
4938 ....
4939 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4940 ....
4941 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4943 Input:
4944 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4945 in the loop.
4946 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4947 the loop. The increment amount across iterations is expected
4948 to be vector_size.
4949 GSI - location where the new update stmt is to be placed.
4950 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4951 BUMP - optional. The offset by which to bump the pointer. If not given,
4952 the offset is assumed to be vector_size.
4954 Output: Return NEW_DATAREF_PTR as illustrated above.
4958 tree
4959 bump_vector_ptr (vec_info *vinfo,
4960 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4961 stmt_vec_info stmt_info, tree bump)
4963 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4964 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4965 tree update = TYPE_SIZE_UNIT (vectype);
4966 gassign *incr_stmt;
4967 ssa_op_iter iter;
4968 use_operand_p use_p;
4969 tree new_dataref_ptr;
4971 if (bump)
4972 update = bump;
4974 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4975 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4976 else
4977 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4978 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4979 dataref_ptr, update);
4980 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
4982 /* Copy the points-to information if it exists. */
4983 if (DR_PTR_INFO (dr))
4985 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4986 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4989 if (!ptr_incr)
4990 return new_dataref_ptr;
4992 /* Update the vector-pointer's cross-iteration increment. */
4993 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4995 tree use = USE_FROM_PTR (use_p);
4997 if (use == dataref_ptr)
4998 SET_USE (use_p, new_dataref_ptr);
4999 else
5000 gcc_assert (operand_equal_p (use, update, 0));
5003 return new_dataref_ptr;
5007 /* Copy memory reference info such as base/clique from the SRC reference
5008 to the DEST MEM_REF. */
5010 void
5011 vect_copy_ref_info (tree dest, tree src)
5013 if (TREE_CODE (dest) != MEM_REF)
5014 return;
5016 tree src_base = src;
5017 while (handled_component_p (src_base))
5018 src_base = TREE_OPERAND (src_base, 0);
5019 if (TREE_CODE (src_base) != MEM_REF
5020 && TREE_CODE (src_base) != TARGET_MEM_REF)
5021 return;
5023 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5024 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5028 /* Function vect_create_destination_var.
5030 Create a new temporary of type VECTYPE. */
5032 tree
5033 vect_create_destination_var (tree scalar_dest, tree vectype)
5035 tree vec_dest;
5036 const char *name;
5037 char *new_name;
5038 tree type;
5039 enum vect_var_kind kind;
5041 kind = vectype
5042 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5043 ? vect_mask_var
5044 : vect_simple_var
5045 : vect_scalar_var;
5046 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5048 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5050 name = get_name (scalar_dest);
5051 if (name)
5052 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5053 else
5054 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5055 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5056 free (new_name);
5058 return vec_dest;
5061 /* Function vect_grouped_store_supported.
5063 Returns TRUE if interleave high and interleave low permutations
5064 are supported, and FALSE otherwise. */
5066 bool
5067 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5069 machine_mode mode = TYPE_MODE (vectype);
5071 /* vect_permute_store_chain requires the group size to be equal to 3 or
5072 be a power of two. */
5073 if (count != 3 && exact_log2 (count) == -1)
5075 if (dump_enabled_p ())
5076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5077 "the size of the group of accesses"
5078 " is not a power of 2 or not eqaul to 3\n");
5079 return false;
5082 /* Check that the permutation is supported. */
5083 if (VECTOR_MODE_P (mode))
5085 unsigned int i;
5086 if (count == 3)
5088 unsigned int j0 = 0, j1 = 0, j2 = 0;
5089 unsigned int i, j;
5091 unsigned int nelt;
5092 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5094 if (dump_enabled_p ())
5095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5096 "cannot handle groups of 3 stores for"
5097 " variable-length vectors\n");
5098 return false;
5101 vec_perm_builder sel (nelt, nelt, 1);
5102 sel.quick_grow (nelt);
5103 vec_perm_indices indices;
5104 for (j = 0; j < 3; j++)
5106 int nelt0 = ((3 - j) * nelt) % 3;
5107 int nelt1 = ((3 - j) * nelt + 1) % 3;
5108 int nelt2 = ((3 - j) * nelt + 2) % 3;
5109 for (i = 0; i < nelt; i++)
5111 if (3 * i + nelt0 < nelt)
5112 sel[3 * i + nelt0] = j0++;
5113 if (3 * i + nelt1 < nelt)
5114 sel[3 * i + nelt1] = nelt + j1++;
5115 if (3 * i + nelt2 < nelt)
5116 sel[3 * i + nelt2] = 0;
5118 indices.new_vector (sel, 2, nelt);
5119 if (!can_vec_perm_const_p (mode, indices))
5121 if (dump_enabled_p ())
5122 dump_printf (MSG_MISSED_OPTIMIZATION,
5123 "permutation op not supported by target.\n");
5124 return false;
5127 for (i = 0; i < nelt; i++)
5129 if (3 * i + nelt0 < nelt)
5130 sel[3 * i + nelt0] = 3 * i + nelt0;
5131 if (3 * i + nelt1 < nelt)
5132 sel[3 * i + nelt1] = 3 * i + nelt1;
5133 if (3 * i + nelt2 < nelt)
5134 sel[3 * i + nelt2] = nelt + j2++;
5136 indices.new_vector (sel, 2, nelt);
5137 if (!can_vec_perm_const_p (mode, indices))
5139 if (dump_enabled_p ())
5140 dump_printf (MSG_MISSED_OPTIMIZATION,
5141 "permutation op not supported by target.\n");
5142 return false;
5145 return true;
5147 else
5149 /* If length is not equal to 3 then only power of 2 is supported. */
5150 gcc_assert (pow2p_hwi (count));
5151 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5153 /* The encoding has 2 interleaved stepped patterns. */
5154 vec_perm_builder sel (nelt, 2, 3);
5155 sel.quick_grow (6);
5156 for (i = 0; i < 3; i++)
5158 sel[i * 2] = i;
5159 sel[i * 2 + 1] = i + nelt;
5161 vec_perm_indices indices (sel, 2, nelt);
5162 if (can_vec_perm_const_p (mode, indices))
5164 for (i = 0; i < 6; i++)
5165 sel[i] += exact_div (nelt, 2);
5166 indices.new_vector (sel, 2, nelt);
5167 if (can_vec_perm_const_p (mode, indices))
5168 return true;
5173 if (dump_enabled_p ())
5174 dump_printf (MSG_MISSED_OPTIMIZATION,
5175 "permutation op not supported by target.\n");
5176 return false;
5180 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5181 type VECTYPE. MASKED_P says whether the masked form is needed. */
5183 bool
5184 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5185 bool masked_p)
5187 if (masked_p)
5188 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5189 vec_mask_store_lanes_optab,
5190 vectype, count);
5191 else
5192 return vect_lanes_optab_supported_p ("vec_store_lanes",
5193 vec_store_lanes_optab,
5194 vectype, count);
5198 /* Function vect_permute_store_chain.
5200 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5201 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5202 the data correctly for the stores. Return the final references for stores
5203 in RESULT_CHAIN.
5205 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5206 The input is 4 vectors each containing 8 elements. We assign a number to
5207 each element, the input sequence is:
5209 1st vec: 0 1 2 3 4 5 6 7
5210 2nd vec: 8 9 10 11 12 13 14 15
5211 3rd vec: 16 17 18 19 20 21 22 23
5212 4th vec: 24 25 26 27 28 29 30 31
5214 The output sequence should be:
5216 1st vec: 0 8 16 24 1 9 17 25
5217 2nd vec: 2 10 18 26 3 11 19 27
5218 3rd vec: 4 12 20 28 5 13 21 29
5219 4th vec: 6 14 22 30 7 15 23 31
5221 i.e., we interleave the contents of the four vectors in their order.
5223 We use interleave_high/low instructions to create such output. The input of
5224 each interleave_high/low operation is two vectors:
5225 1st vec 2nd vec
5226 0 1 2 3 4 5 6 7
5227 the even elements of the result vector are obtained left-to-right from the
5228 high/low elements of the first vector. The odd elements of the result are
5229 obtained left-to-right from the high/low elements of the second vector.
5230 The output of interleave_high will be: 0 4 1 5
5231 and of interleave_low: 2 6 3 7
5234 The permutation is done in log LENGTH stages. In each stage interleave_high
5235 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5236 where the first argument is taken from the first half of DR_CHAIN and the
5237 second argument from its second half.
5238 In our example,
5240 I1: interleave_high (1st vec, 3rd vec)
5241 I2: interleave_low (1st vec, 3rd vec)
5242 I3: interleave_high (2nd vec, 4th vec)
5243 I4: interleave_low (2nd vec, 4th vec)
5245 The output for the first stage is:
5247 I1: 0 16 1 17 2 18 3 19
5248 I2: 4 20 5 21 6 22 7 23
5249 I3: 8 24 9 25 10 26 11 27
5250 I4: 12 28 13 29 14 30 15 31
5252 The output of the second stage, i.e. the final result is:
5254 I1: 0 8 16 24 1 9 17 25
5255 I2: 2 10 18 26 3 11 19 27
5256 I3: 4 12 20 28 5 13 21 29
5257 I4: 6 14 22 30 7 15 23 31. */
5259 void
5260 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5261 unsigned int length,
5262 stmt_vec_info stmt_info,
5263 gimple_stmt_iterator *gsi,
5264 vec<tree> *result_chain)
5266 tree vect1, vect2, high, low;
5267 gimple *perm_stmt;
5268 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5269 tree perm_mask_low, perm_mask_high;
5270 tree data_ref;
5271 tree perm3_mask_low, perm3_mask_high;
5272 unsigned int i, j, n, log_length = exact_log2 (length);
5274 result_chain->quick_grow (length);
5275 memcpy (result_chain->address (), dr_chain.address (),
5276 length * sizeof (tree));
5278 if (length == 3)
5280 /* vect_grouped_store_supported ensures that this is constant. */
5281 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5282 unsigned int j0 = 0, j1 = 0, j2 = 0;
5284 vec_perm_builder sel (nelt, nelt, 1);
5285 sel.quick_grow (nelt);
5286 vec_perm_indices indices;
5287 for (j = 0; j < 3; j++)
5289 int nelt0 = ((3 - j) * nelt) % 3;
5290 int nelt1 = ((3 - j) * nelt + 1) % 3;
5291 int nelt2 = ((3 - j) * nelt + 2) % 3;
5293 for (i = 0; i < nelt; i++)
5295 if (3 * i + nelt0 < nelt)
5296 sel[3 * i + nelt0] = j0++;
5297 if (3 * i + nelt1 < nelt)
5298 sel[3 * i + nelt1] = nelt + j1++;
5299 if (3 * i + nelt2 < nelt)
5300 sel[3 * i + nelt2] = 0;
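	      /* For nelt == 8 and j == 0 this gives {0 8 0 1 9 0 2 10}; the
		 positions written as 0 here are don't-cares that the second
		 permutation below (using perm3_mask_high) overwrites with
		 elements of the third vector.  */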
5302 indices.new_vector (sel, 2, nelt);
5303 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5305 for (i = 0; i < nelt; i++)
5307 if (3 * i + nelt0 < nelt)
5308 sel[3 * i + nelt0] = 3 * i + nelt0;
5309 if (3 * i + nelt1 < nelt)
5310 sel[3 * i + nelt1] = 3 * i + nelt1;
5311 if (3 * i + nelt2 < nelt)
5312 sel[3 * i + nelt2] = nelt + j2++;
5314 indices.new_vector (sel, 2, nelt);
5315 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5317 vect1 = dr_chain[0];
5318 vect2 = dr_chain[1];
5320 /* Create interleaving stmt:
5321 low = VEC_PERM_EXPR <vect1, vect2,
5322 {j, nelt, *, j + 1, nelt + j + 1, *,
5323 j + 2, nelt + j + 2, *, ...}> */
5324 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5325 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5326 vect2, perm3_mask_low);
5327 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5329 vect1 = data_ref;
5330 vect2 = dr_chain[2];
5331 /* Create interleaving stmt:
5332 low = VEC_PERM_EXPR <vect1, vect2,
5333 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5334 6, 7, nelt + j + 2, ...}> */
5335 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5336 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5337 vect2, perm3_mask_high);
5338 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5339 (*result_chain)[j] = data_ref;
5342 else
5344 /* If length is not equal to 3 then only power of 2 is supported. */
5345 gcc_assert (pow2p_hwi (length));
5347 /* The encoding has 2 interleaved stepped patterns. */
5348 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5349 vec_perm_builder sel (nelt, 2, 3);
5350 sel.quick_grow (6);
5351 for (i = 0; i < 3; i++)
5353 sel[i * 2] = i;
5354 sel[i * 2 + 1] = i + nelt;
5356 vec_perm_indices indices (sel, 2, nelt);
5357 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5359 for (i = 0; i < 6; i++)
5360 sel[i] += exact_div (nelt, 2);
5361 indices.new_vector (sel, 2, nelt);
5362 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5364 for (i = 0, n = log_length; i < n; i++)
5366 for (j = 0; j < length/2; j++)
5368 vect1 = dr_chain[j];
5369 vect2 = dr_chain[j+length/2];
5371 /* Create interleaving stmt:
5372 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5373 ...}> */
5374 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5375 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5376 vect2, perm_mask_high);
5377 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5378 (*result_chain)[2*j] = high;
5380 /* Create interleaving stmt:
5381 low = VEC_PERM_EXPR <vect1, vect2,
5382 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5383 ...}> */
5384 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5385 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5386 vect2, perm_mask_low);
5387 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5388 (*result_chain)[2*j+1] = low;
5390 memcpy (dr_chain.address (), result_chain->address (),
5391 length * sizeof (tree));
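
/* Illustrative sketch only, not part of GCC: a scalar model of one
   interleave stage of vect_permute_store_chain for the power-of-2 case.
   The function name is hypothetical; the real transform emits
   VEC_PERM_EXPRs using perm_mask_high ({0, nelt, 1, nelt + 1, ...}) and
   perm_mask_low (the same pattern offset by nelt / 2).  With
   vect1 = {0..7} and vect2 = {16..23} this reproduces I1 and I2 from the
   first-stage example in the comment above.  */

static void
model_interleave_high_low (const int *vect1, const int *vect2,
			   unsigned int nelt, int *high, int *low)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      /* HIGH interleaves the first halves of the two inputs ...  */
      high[2 * i] = vect1[i];
      high[2 * i + 1] = vect2[i];
      /* ... and LOW interleaves their second halves.  */
      low[2 * i] = vect1[nelt / 2 + i];
      low[2 * i + 1] = vect2[nelt / 2 + i];
    }
}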
5396 /* Function vect_setup_realignment
5398 This function is called when vectorizing an unaligned load using
5399 the dr_explicit_realign[_optimized] scheme.
5400 This function generates the following code at the loop prolog:
5402 p = initial_addr;
5403 x msq_init = *(floor(p)); # prolog load
5404 realignment_token = call target_builtin;
5405 loop:
5406 x msq = phi (msq_init, ---)
5408 The stmts marked with x are generated only for the case of
5409 dr_explicit_realign_optimized.
5411 The code above sets up a new (vector) pointer, pointing to the first
5412 location accessed by STMT_INFO, and a "floor-aligned" load using that
5413 pointer. It also generates code to compute the "realignment-token"
5414 (if the relevant target hook was defined), and creates a phi-node at the
5415 loop-header bb whose arguments are the result of the prolog-load (created
5416 by this function) and the result of a load that takes place in the loop
5417 (to be created by the caller to this function).
5419 For the case of dr_explicit_realign_optimized:
5420 The caller to this function uses the phi-result (msq) to create the
5421 realignment code inside the loop, and sets up the missing phi argument,
5422 as follows:
5423 loop:
5424 msq = phi (msq_init, lsq)
5425 lsq = *(floor(p')); # load in loop
5426 result = realign_load (msq, lsq, realignment_token);
5428 For the case of dr_explicit_realign:
5429 loop:
5430 msq = *(floor(p)); # load in loop
5431 p' = p + (VS-1);
5432 lsq = *(floor(p')); # load in loop
5433 result = realign_load (msq, lsq, realignment_token);
5435 Input:
5436 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5437 a memory location that may be unaligned.
5438 GSI - place where new code is to be inserted.
5439 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5440 is used.
5442 Output:
5443 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5444 target hook, if defined.
5445 Return value - the result of the loop-header phi node. */
5447 tree
5448 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5449 gimple_stmt_iterator *gsi, tree *realignment_token,
5450 enum dr_alignment_support alignment_support_scheme,
5451 tree init_addr,
5452 class loop **at_loop)
5454 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5455 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5456 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5457 struct data_reference *dr = dr_info->dr;
5458 class loop *loop = NULL;
5459 edge pe = NULL;
5460 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5461 tree vec_dest;
5462 gimple *inc;
5463 tree ptr;
5464 tree data_ref;
5465 basic_block new_bb;
5466 tree msq_init = NULL_TREE;
5467 tree new_temp;
5468 gphi *phi_stmt;
5469 tree msq = NULL_TREE;
5470 gimple_seq stmts = NULL;
5471 bool compute_in_loop = false;
5472 bool nested_in_vect_loop = false;
5473 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5474 class loop *loop_for_initial_load = NULL;
5476 if (loop_vinfo)
5478 loop = LOOP_VINFO_LOOP (loop_vinfo);
5479 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5482 gcc_assert (alignment_support_scheme == dr_explicit_realign
5483 || alignment_support_scheme == dr_explicit_realign_optimized);
5485 /* We need to generate three things:
5486 1. the misalignment computation
5487 2. the extra vector load (for the optimized realignment scheme).
5488 3. the phi node for the two vectors from which the realignment is
5489 done (for the optimized realignment scheme). */
5491 /* 1. Determine where to generate the misalignment computation.
5493 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5494 calculation will be generated by this function, outside the loop (in the
5495 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5496 caller, inside the loop.
5498 Background: If the misalignment remains fixed throughout the iterations of
5499 the loop, then both realignment schemes are applicable, and also the
5500 misalignment computation can be done outside LOOP. This is because we are
5501 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5502 are a multiple of VS (the Vector Size), and therefore the misalignment in
5503 different vectorized LOOP iterations is always the same.
5504 The problem arises only if the memory access is in an inner-loop nested
5505 inside LOOP, which is now being vectorized using outer-loop vectorization.
5506 This is the only case when the misalignment of the memory access may not
5507 remain fixed throughout the iterations of the inner-loop (as explained in
5508 detail in vect_supportable_dr_alignment). In this case, not only is the
5509 optimized realignment scheme not applicable, but also the misalignment
5510 computation (and generation of the realignment token that is passed to
5511 REALIGN_LOAD) have to be done inside the loop.
5513 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5514 or not, which in turn determines if the misalignment is computed inside
5515 the inner-loop, or outside LOOP. */
5517 if (init_addr != NULL_TREE || !loop_vinfo)
5519 compute_in_loop = true;
5520 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5524 /* 2. Determine where to generate the extra vector load.
5526 For the optimized realignment scheme, instead of generating two vector
5527 loads in each iteration, we generate a single extra vector load in the
5528 preheader of the loop, and in each iteration reuse the result of the
5529 vector load from the previous iteration. In case the memory access is in
5530 an inner-loop nested inside LOOP, which is now being vectorized using
5531 outer-loop vectorization, we need to determine whether this initial vector
5532 load should be generated at the preheader of the inner-loop, or can be
5533 generated at the preheader of LOOP. If the memory access has no evolution
5534 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5535 to be generated inside LOOP (in the preheader of the inner-loop). */
5537 if (nested_in_vect_loop)
5539 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5540 bool invariant_in_outerloop =
5541 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5542 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5544 else
5545 loop_for_initial_load = loop;
5546 if (at_loop)
5547 *at_loop = loop_for_initial_load;
5549 if (loop_for_initial_load)
5550 pe = loop_preheader_edge (loop_for_initial_load);
5552 /* 3. For the case of the optimized realignment, create the first vector
5553 load at the loop preheader. */
5555 if (alignment_support_scheme == dr_explicit_realign_optimized)
5557 /* Create msq_init = *(floor(p1)) in the loop preheader */
5558 gassign *new_stmt;
5560 gcc_assert (!compute_in_loop);
5561 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5562 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5563 loop_for_initial_load, NULL_TREE,
5564 &init_addr, NULL, &inc, true);
5565 if (TREE_CODE (ptr) == SSA_NAME)
5566 new_temp = copy_ssa_name (ptr);
5567 else
5568 new_temp = make_ssa_name (TREE_TYPE (ptr));
5569 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5570 tree type = TREE_TYPE (ptr);
5571 new_stmt = gimple_build_assign
5572 (new_temp, BIT_AND_EXPR, ptr,
5573 fold_build2 (MINUS_EXPR, type,
5574 build_int_cst (type, 0),
5575 build_int_cst (type, align)));
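      /* I.e. new_temp = ptr & -align, clearing the low-order address bits so
	 that the MEM_REF built below loads from the floor-aligned address
	 floor (p).  */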
5576 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5577 gcc_assert (!new_bb);
5578 data_ref
5579 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5580 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5581 vect_copy_ref_info (data_ref, DR_REF (dr));
5582 new_stmt = gimple_build_assign (vec_dest, data_ref);
5583 new_temp = make_ssa_name (vec_dest, new_stmt);
5584 gimple_assign_set_lhs (new_stmt, new_temp);
5585 if (pe)
5587 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5588 gcc_assert (!new_bb);
5590 else
5591 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5593 msq_init = gimple_assign_lhs (new_stmt);
5596 /* 4. Create realignment token using a target builtin, if available.
5597 It is done either inside the containing loop, or before LOOP (as
5598 determined above). */
5600 if (targetm.vectorize.builtin_mask_for_load)
5602 gcall *new_stmt;
5603 tree builtin_decl;
5605 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5606 if (!init_addr)
5608 /* Generate the INIT_ADDR computation outside LOOP. */
5609 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5610 stmt_info, &stmts,
5611 NULL_TREE);
5612 if (loop)
5614 pe = loop_preheader_edge (loop);
5615 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5616 gcc_assert (!new_bb);
5618 else
5619 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5622 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5623 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5624 vec_dest =
5625 vect_create_destination_var (scalar_dest,
5626 gimple_call_return_type (new_stmt));
5627 new_temp = make_ssa_name (vec_dest, new_stmt);
5628 gimple_call_set_lhs (new_stmt, new_temp);
5630 if (compute_in_loop)
5631 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5632 else
5634 /* Generate the misalignment computation outside LOOP. */
5635 pe = loop_preheader_edge (loop);
5636 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5637 gcc_assert (!new_bb);
5640 *realignment_token = gimple_call_lhs (new_stmt);
5642 /* The result of the CALL_EXPR to this builtin is determined from
5643 the value of the parameter and no global variables are touched
5644 which makes the builtin a "const" function. Requiring the
5645 builtin to have the "const" attribute makes it unnecessary
5646 to call mark_call_clobbered. */
5647 gcc_assert (TREE_READONLY (builtin_decl));
5650 if (alignment_support_scheme == dr_explicit_realign)
5651 return msq;
5653 gcc_assert (!compute_in_loop);
5654 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5657 /* 5. Create msq = phi <msq_init, lsq> in loop */
5659 pe = loop_preheader_edge (containing_loop);
5660 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5661 msq = make_ssa_name (vec_dest);
5662 phi_stmt = create_phi_node (msq, containing_loop->header);
5663 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5665 return msq;
5669 /* Function vect_grouped_load_supported.
5671 COUNT is the size of the load group (the number of statements plus the
5672 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5673 only one statement, with a gap of COUNT - 1.
5675 Returns true if a suitable permute exists. */
5677 bool
5678 vect_grouped_load_supported (tree vectype, bool single_element_p,
5679 unsigned HOST_WIDE_INT count)
5681 machine_mode mode = TYPE_MODE (vectype);
5683 /* If this is single-element interleaving with an element distance
5684 that leaves unused vector loads around, punt - we would at least create
5685 very sub-optimal code in that case (and blow up memory,
5686 see PR65518). */
5687 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5689 if (dump_enabled_p ())
5690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691 "single-element interleaving not supported "
5692 "for not adjacent vector loads\n");
5693 return false;
5696 /* vect_permute_load_chain requires the group size to be equal to 3 or
5697 be a power of two. */
5698 if (count != 3 && exact_log2 (count) == -1)
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "the size of the group of accesses"
5703 " is not a power of 2 or not equal to 3\n");
5704 return false;
5707 /* Check that the permutation is supported. */
5708 if (VECTOR_MODE_P (mode))
5710 unsigned int i, j;
5711 if (count == 3)
5713 unsigned int nelt;
5714 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5716 if (dump_enabled_p ())
5717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5718 "cannot handle groups of 3 loads for"
5719 " variable-length vectors\n");
5720 return false;
5723 vec_perm_builder sel (nelt, nelt, 1);
5724 sel.quick_grow (nelt);
5725 vec_perm_indices indices;
5726 unsigned int k;
5727 for (k = 0; k < 3; k++)
5729 for (i = 0; i < nelt; i++)
5730 if (3 * i + k < 2 * nelt)
5731 sel[i] = 3 * i + k;
5732 else
5733 sel[i] = 0;
5734 indices.new_vector (sel, 2, nelt);
5735 if (!can_vec_perm_const_p (mode, indices))
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5739 "shuffle of 3 loads is not supported by"
5740 " target\n");
5741 return false;
5743 for (i = 0, j = 0; i < nelt; i++)
5744 if (3 * i + k < 2 * nelt)
5745 sel[i] = i;
5746 else
5747 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5748 indices.new_vector (sel, 2, nelt);
5749 if (!can_vec_perm_const_p (mode, indices))
5751 if (dump_enabled_p ())
5752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5753 "shuffle of 3 loads is not supported by"
5754 " target\n");
5755 return false;
5758 return true;
5760 else
5762 /* If length is not equal to 3 then only power of 2 is supported. */
5763 gcc_assert (pow2p_hwi (count));
5764 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5766 /* The encoding has a single stepped pattern. */
5767 vec_perm_builder sel (nelt, 1, 3);
5768 sel.quick_grow (3);
5769 for (i = 0; i < 3; i++)
5770 sel[i] = i * 2;
5771 vec_perm_indices indices (sel, 2, nelt);
5772 if (can_vec_perm_const_p (mode, indices))
5774 for (i = 0; i < 3; i++)
5775 sel[i] = i * 2 + 1;
5776 indices.new_vector (sel, 2, nelt);
5777 if (can_vec_perm_const_p (mode, indices))
5778 return true;
5783 if (dump_enabled_p ())
5784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5785 "extract even/odd not supported by target\n");
5786 return false;
5789 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5790 type VECTYPE. MASKED_P says whether the masked form is needed. */
5792 bool
5793 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5794 bool masked_p)
5796 if (masked_p)
5797 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5798 vec_mask_load_lanes_optab,
5799 vectype, count);
5800 else
5801 return vect_lanes_optab_supported_p ("vec_load_lanes",
5802 vec_load_lanes_optab,
5803 vectype, count);
5806 /* Function vect_permute_load_chain.
5808 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5809 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5810 the input data correctly. Return the final references for loads in
5811 RESULT_CHAIN.
5813 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5814 The input is 4 vectors each containing 8 elements. We assign a number to each
5815 element, the input sequence is:
5817 1st vec: 0 1 2 3 4 5 6 7
5818 2nd vec: 8 9 10 11 12 13 14 15
5819 3rd vec: 16 17 18 19 20 21 22 23
5820 4th vec: 24 25 26 27 28 29 30 31
5822 The output sequence should be:
5824 1st vec: 0 4 8 12 16 20 24 28
5825 2nd vec: 1 5 9 13 17 21 25 29
5826 3rd vec: 2 6 10 14 18 22 26 30
5827 4th vec: 3 7 11 15 19 23 27 31
5829 i.e., the first output vector should contain the first elements of each
5830 interleaving group, etc.
5832 We use extract_even/odd instructions to create such output. The input of
5833 each extract_even/odd operation is two vectors
5834 1st vec: 0 1 2 3
5835 2nd vec: 4 5 6 7
5837 and the output is the vector of extracted even/odd elements. The output of
5838 extract_even will be: 0 2 4 6
5839 and of extract_odd: 1 3 5 7
5842 The permutation is done in log LENGTH stages. In each stage extract_even
5843 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5844 their order. In our example,
5846 E1: extract_even (1st vec, 2nd vec)
5847 E2: extract_odd (1st vec, 2nd vec)
5848 E3: extract_even (3rd vec, 4th vec)
5849 E4: extract_odd (3rd vec, 4th vec)
5851 The output for the first stage will be:
5853 E1: 0 2 4 6 8 10 12 14
5854 E2: 1 3 5 7 9 11 13 15
5855 E3: 16 18 20 22 24 26 28 30
5856 E4: 17 19 21 23 25 27 29 31
5858 In order to proceed and create the correct sequence for the next stage (or
5859 for the correct output, if the second stage is the last one, as in our
5860 example), we first put the output of extract_even operation and then the
5861 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5862 The input for the second stage is:
5864 1st vec (E1): 0 2 4 6 8 10 12 14
5865 2nd vec (E3): 16 18 20 22 24 26 28 30
5866 3rd vec (E2): 1 3 5 7 9 11 13 15
5867 4th vec (E4): 17 19 21 23 25 27 29 31
5869 The output of the second stage:
5871 E1: 0 4 8 12 16 20 24 28
5872 E2: 2 6 10 14 18 22 26 30
5873 E3: 1 5 9 13 17 21 25 29
5874 E4: 3 7 11 15 19 23 27 31
5876 And RESULT_CHAIN after reordering:
5878 1st vec (E1): 0 4 8 12 16 20 24 28
5879 2nd vec (E3): 1 5 9 13 17 21 25 29
5880 3rd vec (E2): 2 6 10 14 18 22 26 30
5881 4th vec (E4): 3 7 11 15 19 23 27 31. */
5883 static void
5884 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5885 unsigned int length,
5886 stmt_vec_info stmt_info,
5887 gimple_stmt_iterator *gsi,
5888 vec<tree> *result_chain)
5890 tree data_ref, first_vect, second_vect;
5891 tree perm_mask_even, perm_mask_odd;
5892 tree perm3_mask_low, perm3_mask_high;
5893 gimple *perm_stmt;
5894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5895 unsigned int i, j, log_length = exact_log2 (length);
5897 result_chain->quick_grow (length);
5898 memcpy (result_chain->address (), dr_chain.address (),
5899 length * sizeof (tree));
5901 if (length == 3)
5903 /* vect_grouped_load_supported ensures that this is constant. */
5904 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5905 unsigned int k;
5907 vec_perm_builder sel (nelt, nelt, 1);
5908 sel.quick_grow (nelt);
5909 vec_perm_indices indices;
5910 for (k = 0; k < 3; k++)
5912 for (i = 0; i < nelt; i++)
5913 if (3 * i + k < 2 * nelt)
5914 sel[i] = 3 * i + k;
5915 else
5916 sel[i] = 0;
5917 indices.new_vector (sel, 2, nelt);
5918 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5920 for (i = 0, j = 0; i < nelt; i++)
5921 if (3 * i + k < 2 * nelt)
5922 sel[i] = i;
5923 else
5924 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5925 indices.new_vector (sel, 2, nelt);
5926 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5928 first_vect = dr_chain[0];
5929 second_vect = dr_chain[1];
5931 /* Create interleaving stmt (low part of):
5932 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5933 ...}> */
5934 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5935 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5936 second_vect, perm3_mask_low);
5937 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5939 /* Create interleaving stmt (high part of):
5940 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5941 ...}> */
5942 first_vect = data_ref;
5943 second_vect = dr_chain[2];
5944 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5945 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5946 second_vect, perm3_mask_high);
5947 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5948 (*result_chain)[k] = data_ref;
5951 else
5953 /* If length is not equal to 3 then only power of 2 is supported. */
5954 gcc_assert (pow2p_hwi (length));
5956 /* The encoding has a single stepped pattern. */
5957 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5958 vec_perm_builder sel (nelt, 1, 3);
5959 sel.quick_grow (3);
5960 for (i = 0; i < 3; ++i)
5961 sel[i] = i * 2;
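      /* A single stepped pattern {0, 2, 4} that the builder extends to
	 {0, 2, 4, ..., 2 * nelt - 2}, i.e. the even elements of the two
	 concatenated inputs; the {1, 3, 5} pattern built just below selects
	 the odd elements.  */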
5962 vec_perm_indices indices (sel, 2, nelt);
5963 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5965 for (i = 0; i < 3; ++i)
5966 sel[i] = i * 2 + 1;
5967 indices.new_vector (sel, 2, nelt);
5968 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5970 for (i = 0; i < log_length; i++)
5972 for (j = 0; j < length; j += 2)
5974 first_vect = dr_chain[j];
5975 second_vect = dr_chain[j+1];
5977 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5978 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5979 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5980 first_vect, second_vect,
5981 perm_mask_even);
5982 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5983 (*result_chain)[j/2] = data_ref;
5985 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5986 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5987 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5988 first_vect, second_vect,
5989 perm_mask_odd);
5990 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5991 (*result_chain)[j/2+length/2] = data_ref;
5993 memcpy (dr_chain.address (), result_chain->address (),
5994 length * sizeof (tree));
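
/* Illustrative sketch only, not part of GCC: a scalar model of one
   extract_even/extract_odd stage of vect_permute_load_chain for the
   power-of-2 case.  The function name is hypothetical; the real transform
   emits VEC_PERM_EXPRs with perm_mask_even = {0, 2, 4, ...} and
   perm_mask_odd = {1, 3, 5, ...} applied to the concatenation of the two
   input vectors.  With first = {0..7} and second = {8..15} this reproduces
   E1 and E2 from the first-stage example in the comment above.  */

static void
model_extract_even_odd (const int *first, const int *second,
			unsigned int nelt, int *even, int *odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Selector value 2 * i indexes the 2 * nelt element concatenation.  */
      even[i] = 2 * i < nelt ? first[2 * i] : second[2 * i - nelt];
      /* Selector value 2 * i + 1 likewise.  */
      odd[i] = (2 * i + 1 < nelt
		? first[2 * i + 1] : second[2 * i + 1 - nelt]);
    }
}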
5999 /* Function vect_shift_permute_load_chain.
6001 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6002 a sequence of stmts to reorder the input data accordingly.
6003 Return the final references for loads in RESULT_CHAIN.
6004 Return true if successful, false otherwise.
6006 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6007 The input is 3 vectors each containing 8 elements. We assign a
6008 number to each element, the input sequence is:
6010 1st vec: 0 1 2 3 4 5 6 7
6011 2nd vec: 8 9 10 11 12 13 14 15
6012 3rd vec: 16 17 18 19 20 21 22 23
6014 The output sequence should be:
6016 1st vec: 0 3 6 9 12 15 18 21
6017 2nd vec: 1 4 7 10 13 16 19 22
6018 3rd vec: 2 5 8 11 14 17 20 23
6020 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6022 First we shuffle all 3 vectors to get correct elements order:
6024 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6025 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6026 3rd vec: (16 19 22) (17 20 23) (18 21)
6028 Next we unite and shift vector 3 times:
6030 1st step:
6031 shift right by 6 the concatenation of:
6032 "1st vec" and "2nd vec"
6033 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6034 "2nd vec" and "3rd vec"
6035 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6036 "3rd vec" and "1st vec"
6037 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6038 | New vectors |
6040 So that now new vectors are:
6042 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6043 2nd vec: (10 13) (16 19 22) (17 20 23)
6044 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6046 2nd step:
6047 shift right by 5 the concatenation of:
6048 "1st vec" and "3rd vec"
6049 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6050 "2nd vec" and "1st vec"
6051 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6052 "3rd vec" and "2nd vec"
6053 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6054 | New vectors |
6056 So that now new vectors are:
6058 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6059 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6060 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6062 3rd step:
6063 shift right by 5 the concatenation of:
6064 "1st vec" and "1st vec"
6065 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6066 shift right by 3 the concatenation of:
6067 "2nd vec" and "2nd vec"
6068 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6069 | New vectors |
6071 So that now all vectors are READY:
6072 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6073 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6074 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6076 This algorithm is faster than one in vect_permute_load_chain if:
6077 1. "shift of a concatination" is faster than general permutation.
6078 This is usually so.
6079 2. The TARGET machine can't execute vector instructions in parallel.
6080 This is because each step of the algorithm depends on the previous one.
6081 The algorithm in vect_permute_load_chain is much more parallel.
6083 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6086 static bool
6087 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6088 unsigned int length,
6089 stmt_vec_info stmt_info,
6090 gimple_stmt_iterator *gsi,
6091 vec<tree> *result_chain)
6093 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6094 tree perm2_mask1, perm2_mask2, perm3_mask;
6095 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6096 gimple *perm_stmt;
6098 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6099 unsigned int i;
6100 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6102 unsigned HOST_WIDE_INT nelt, vf;
6103 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6104 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6105 /* Not supported for variable-length vectors. */
6106 return false;
6108 vec_perm_builder sel (nelt, nelt, 1);
6109 sel.quick_grow (nelt);
6111 result_chain->quick_grow (length);
6112 memcpy (result_chain->address (), dr_chain.address (),
6113 length * sizeof (tree));
6115 if (pow2p_hwi (length) && vf > 4)
6117 unsigned int j, log_length = exact_log2 (length);
6118 for (i = 0; i < nelt / 2; ++i)
6119 sel[i] = i * 2;
6120 for (i = 0; i < nelt / 2; ++i)
6121 sel[nelt / 2 + i] = i * 2 + 1;
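      /* For nelt == 8 this is {0 2 4 6 1 3 5 7}: the even elements of a
	 single vector followed by its odd elements.  */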
6122 vec_perm_indices indices (sel, 2, nelt);
6123 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127 "shuffle of 2 fields structure is not \
6128 supported by target\n");
6129 return false;
6131 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6133 for (i = 0; i < nelt / 2; ++i)
6134 sel[i] = i * 2 + 1;
6135 for (i = 0; i < nelt / 2; ++i)
6136 sel[nelt / 2 + i] = i * 2;
6137 indices.new_vector (sel, 2, nelt);
6138 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6140 if (dump_enabled_p ())
6141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6142 "shuffle of 2 fields structure is not \
6143 supported by target\n");
6144 return false;
6146 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6148 /* Generating permutation constant to shift all elements.
6149 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6150 for (i = 0; i < nelt; i++)
6151 sel[i] = nelt / 2 + i;
6152 indices.new_vector (sel, 2, nelt);
6153 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6155 if (dump_enabled_p ())
6156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6157 "shift permutation is not supported by target\n");
6158 return false;
6160 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6162 /* Generating permutation constant to select vector from 2.
6163 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6164 for (i = 0; i < nelt / 2; i++)
6165 sel[i] = i;
6166 for (i = nelt / 2; i < nelt; i++)
6167 sel[i] = nelt + i;
6168 indices.new_vector (sel, 2, nelt);
6169 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6171 if (dump_enabled_p ())
6172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6173 "select is not supported by target\n");
6174 return false;
6176 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6178 for (i = 0; i < log_length; i++)
6180 for (j = 0; j < length; j += 2)
6182 first_vect = dr_chain[j];
6183 second_vect = dr_chain[j + 1];
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187 first_vect, first_vect,
6188 perm2_mask1);
6189 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6190 vect[0] = data_ref;
6192 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6193 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6194 second_vect, second_vect,
6195 perm2_mask2);
6196 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6197 vect[1] = data_ref;
6199 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6200 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6201 vect[0], vect[1], shift1_mask);
6202 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6203 (*result_chain)[j/2 + length/2] = data_ref;
6205 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6206 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6207 vect[0], vect[1], select_mask);
6208 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6209 (*result_chain)[j/2] = data_ref;
6211 memcpy (dr_chain.address (), result_chain->address (),
6212 length * sizeof (tree));
6214 return true;
6216 if (length == 3 && vf > 2)
6218 unsigned int k = 0, l = 0;
6220 /* Generating permutation constant to get all elements in the right order.
6221 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6222 for (i = 0; i < nelt; i++)
6224 if (3 * k + (l % 3) >= nelt)
6226 k = 0;
6227 l += (3 - (nelt % 3));
6229 sel[i] = 3 * k + (l % 3);
6230 k++;
6232 vec_perm_indices indices (sel, 2, nelt);
6233 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6235 if (dump_enabled_p ())
6236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6237 "shuffle of 3 fields structure is not \
6238 supported by target\n");
6239 return false;
6241 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6243 /* Generating permutation constant to shift all elements.
6244 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6245 for (i = 0; i < nelt; i++)
6246 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6247 indices.new_vector (sel, 2, nelt);
6248 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6250 if (dump_enabled_p ())
6251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6252 "shift permutation is not supported by target\n");
6253 return false;
6255 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6257 /* Generating permutation constant to shift all elements.
6258 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6259 for (i = 0; i < nelt; i++)
6260 sel[i] = 2 * (nelt / 3) + 1 + i;
6261 indices.new_vector (sel, 2, nelt);
6262 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6264 if (dump_enabled_p ())
6265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6266 "shift permutation is not supported by target\n");
6267 return false;
6269 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6271 /* Generating permutation constant to shift all elements.
6272 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6273 for (i = 0; i < nelt; i++)
6274 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6275 indices.new_vector (sel, 2, nelt);
6276 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6278 if (dump_enabled_p ())
6279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6280 "shift permutation is not supported by target\n");
6281 return false;
6283 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6285 /* Generating permutation constant to shift all elements.
6286 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6287 for (i = 0; i < nelt; i++)
6288 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6289 indices.new_vector (sel, 2, nelt);
6290 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6292 if (dump_enabled_p ())
6293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6294 "shift permutation is not supported by target\n");
6295 return false;
6297 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6299 for (k = 0; k < 3; k++)
6301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303 dr_chain[k], dr_chain[k],
6304 perm3_mask);
6305 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6306 vect[k] = data_ref;
6309 for (k = 0; k < 3; k++)
6311 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6312 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6313 vect[k % 3], vect[(k + 1) % 3],
6314 shift1_mask);
6315 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6316 vect_shift[k] = data_ref;
6319 for (k = 0; k < 3; k++)
6321 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6322 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6323 vect_shift[(4 - k) % 3],
6324 vect_shift[(3 - k) % 3],
6325 shift2_mask);
6326 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6327 vect[k] = data_ref;
6330 (*result_chain)[3 - (nelt % 3)] = vect[2];
6332 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6333 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6334 vect[0], shift3_mask);
6335 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6336 (*result_chain)[nelt % 3] = data_ref;
6338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6340 vect[1], shift4_mask);
6341 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6342 (*result_chain)[0] = data_ref;
6343 return true;
6345 return false;
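
/* Illustrative sketch only, not part of GCC: the "shift of a concatenation"
   primitive used by vect_shift_permute_load_chain, modelled on scalars.
   The function name is hypothetical; each shiftN_mask above selects indices
   {k, k + 1, ..., k + nelt - 1} from the 2 * nelt element concatenation of
   the two operands of a VEC_PERM_EXPR, which is exactly what this loop
   computes.  */

static void
model_shift_concat (const int *v1, const int *v2, unsigned int nelt,
		    unsigned int k, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int idx = k + i;
      /* Elements 0 .. nelt-1 of the concatenation come from V1,
	 elements nelt .. 2*nelt-1 from V2.  */
      out[i] = idx < nelt ? v1[idx] : v2[idx - nelt];
    }
}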
6348 /* Function vect_transform_grouped_load.
6350 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6351 to perform their permutation and assign the resulting vectorized statements to
6352 the scalar statements.
6355 void
6356 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6357 vec<tree> dr_chain,
6358 int size, gimple_stmt_iterator *gsi)
6360 machine_mode mode;
6361 vec<tree> result_chain = vNULL;
6363 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6364 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6365 vectors, that are ready for vector computation. */
6366 result_chain.create (size);
6368 /* If the reassociation width for the vector type is 2 or greater, the target
6369 machine can execute 2 or more vector instructions in parallel. Otherwise
6370 try to get the chain for the load group using vect_shift_permute_load_chain. */
6371 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6372 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6373 || pow2p_hwi (size)
6374 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6375 gsi, &result_chain))
6376 vect_permute_load_chain (vinfo, dr_chain,
6377 size, stmt_info, gsi, &result_chain);
6378 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6379 result_chain.release ();
6382 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6383 generated as part of the vectorization of STMT_INFO. Assign the statement
6384 for each vector to the associated scalar statement. */
6386 void
6387 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6388 vec<tree> result_chain)
6390 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6391 unsigned int i, gap_count;
6392 tree tmp_data_ref;
6394 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6395 Since we scan the chain starting from its first node, their order
6396 corresponds to the order of data-refs in RESULT_CHAIN. */
6397 stmt_vec_info next_stmt_info = first_stmt_info;
6398 gap_count = 1;
6399 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6401 if (!next_stmt_info)
6402 break;
6404 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6405 code elimination pass later. No need to check for the first stmt in
6406 the group, since it always exists.
6407 DR_GROUP_GAP is the number of steps in elements from the previous
6408 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6409 correspond to the gaps. */
6410 if (next_stmt_info != first_stmt_info
6411 && gap_count < DR_GROUP_GAP (next_stmt_info))
6413 gap_count++;
6414 continue;
6417 /* ??? The following needs cleanup after the removal of
6418 DR_GROUP_SAME_DR_STMT. */
6419 if (next_stmt_info)
6421 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6422 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6423 copies, and we put the new vector statement last. */
6424 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6426 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6427 gap_count = 1;
6432 /* Function vect_can_force_dr_alignment_p.
6434 Returns whether the alignment of a DECL can be forced to be aligned
6435 on an ALIGNMENT-bit boundary. */
6437 bool
6438 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6440 if (!VAR_P (decl))
6441 return false;
6443 if (decl_in_symtab_p (decl)
6444 && !symtab_node::get (decl)->can_increase_alignment_p ())
6445 return false;
6447 if (TREE_STATIC (decl))
6448 return (known_le (alignment,
6449 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6450 else
6451 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6455 /* Return whether the data reference DR_INFO is supported with respect to its
6456 alignment.
6457 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6458 if it is aligned, i.e., check if it is possible to vectorize it with different
6459 alignment. */
6461 enum dr_alignment_support
6462 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6463 bool check_aligned_accesses)
6465 data_reference *dr = dr_info->dr;
6466 stmt_vec_info stmt_info = dr_info->stmt;
6467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6468 machine_mode mode = TYPE_MODE (vectype);
6469 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6470 class loop *vect_loop = NULL;
6471 bool nested_in_vect_loop = false;
6473 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6474 return dr_aligned;
6476 /* For now assume all conditional loads/stores support unaligned
6477 access without any special code. */
6478 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6479 if (gimple_call_internal_p (stmt)
6480 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6481 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6482 return dr_unaligned_supported;
6484 if (loop_vinfo)
6486 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6487 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6490 /* Possibly unaligned access. */
6492 /* We can choose between using the implicit realignment scheme (generating
6493 a misaligned_move stmt) and the explicit realignment scheme (generating
6494 aligned loads with a REALIGN_LOAD). There are two variants to the
6495 explicit realignment scheme: optimized, and unoptimized.
6496 We can optimize the realignment only if the step between consecutive
6497 vector loads is equal to the vector size. Since the vector memory
6498 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6499 is guaranteed that the misalignment amount remains the same throughout the
6500 execution of the vectorized loop. Therefore, we can create the
6501 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6502 at the loop preheader.
6504 However, in the case of outer-loop vectorization, when vectorizing a
6505 memory access in the inner-loop nested within the LOOP that is now being
6506 vectorized, while it is guaranteed that the misalignment of the
6507 vectorized memory access will remain the same in different outer-loop
6508 iterations, it is *not* guaranteed that it will remain the same throughout
6509 the execution of the inner-loop. This is because the inner-loop advances
6510 with the original scalar step (and not in steps of VS). If the inner-loop
6511 step happens to be a multiple of VS, then the misalignment remains fixed
6512 and we can use the optimized realignment scheme. For example:
6514 for (i=0; i<N; i++)
6515 for (j=0; j<M; j++)
6516 s += a[i+j];
6518 When vectorizing the i-loop in the above example, the step between
6519 consecutive vector loads is 1, and so the misalignment does not remain
6520 fixed across the execution of the inner-loop, and the realignment cannot
6521 be optimized (as illustrated in the following pseudo vectorized loop):
6523 for (i=0; i<N; i+=4)
6524 for (j=0; j<M; j++){
6525 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6526 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6527 // (assuming that we start from an aligned address).
6530 We therefore have to use the unoptimized realignment scheme:
6532 for (i=0; i<N; i+=4)
6533 for (j=k; j<M; j+=4)
6534 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6535 // that the misalignment of the initial address is
6536 // 0).
6538 The loop can then be vectorized as follows:
6540 for (k=0; k<4; k++){
6541 rt = get_realignment_token (&vp[k]);
6542 for (i=0; i<N; i+=4){
6543 v1 = vp[i+k];
6544 for (j=k; j<M; j+=4){
6545 v2 = vp[i+j+VS-1];
6546 va = REALIGN_LOAD <v1,v2,rt>;
6547 vs += va;
6548 v1 = v2;
6551 } */
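  /* Only as an illustration (the token is really a target-specific,
     typically byte-level, permutation mask): at element granularity the
     REALIGN_LOAD above behaves like selecting a window of VS elements from
     the concatenation of the two aligned loads, starting at the
     misalignment offset k encoded by the realignment token:

       for (i=0; i<VS; i++)
	 va[i] = i + k < VS ? v1[i + k] : v2[i + k - VS];  */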
6553 if (DR_IS_READ (dr))
6555 bool is_packed = false;
6556 tree type = (TREE_TYPE (DR_REF (dr)));
6558 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6559 && (!targetm.vectorize.builtin_mask_for_load
6560 || targetm.vectorize.builtin_mask_for_load ()))
6562 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6564 /* If we are doing SLP then the accesses need not have the
6565 same alignment, instead it depends on the SLP group size. */
6566 if (loop_vinfo
6567 && STMT_SLP_TYPE (stmt_info)
6568 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6569 * (DR_GROUP_SIZE
6570 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6571 TYPE_VECTOR_SUBPARTS (vectype)))
6573 else if (!loop_vinfo
6574 || (nested_in_vect_loop
6575 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6576 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6577 return dr_explicit_realign;
6578 else
6579 return dr_explicit_realign_optimized;
6581 if (!known_alignment_for_access_p (dr_info))
6582 is_packed = not_size_aligned (DR_REF (dr));
6584 if (targetm.vectorize.support_vector_misalignment
6585 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6586 /* Can't software pipeline the loads, but can at least do them. */
6587 return dr_unaligned_supported;
6589 else
6591 bool is_packed = false;
6592 tree type = (TREE_TYPE (DR_REF (dr)));
6594 if (!known_alignment_for_access_p (dr_info))
6595 is_packed = not_size_aligned (DR_REF (dr));
6597 if (targetm.vectorize.support_vector_misalignment
6598 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6599 return dr_unaligned_supported;
6602 /* Unsupported. */
6603 return dr_unaligned_unsupported;