Fix PR ada/97504 on hppa*-*-hpux*.
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob18e36c89d1475dbe1fe92ff757f21839e5db9d59
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode, array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
96 return true;
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
160 if (i < gimple_call_num_args (call))
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
235 /* STMT_A and STMT_B belong to overlapping groups. All loads are
236 emitted at the position of the first scalar load.
237 Stores in a group are emitted at the position of the last scalar store.
238 Compute that position and check whether the resulting order matches
239 the current one. */
240 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
241 if (il_a)
243 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
244 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
245 s = DR_GROUP_NEXT_ELEMENT (s))
246 il_a = get_later_stmt (il_a, s);
247 else /* DR_IS_READ */
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 if (get_later_stmt (il_a, s) == il_a)
251 il_a = s;
253 else
254 il_a = stmtinfo_a;
255 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
256 if (il_b)
258 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
259 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
260 s = DR_GROUP_NEXT_ELEMENT (s))
261 il_b = get_later_stmt (il_b, s);
262 else /* DR_IS_READ */
263 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
264 s = DR_GROUP_NEXT_ELEMENT (s))
265 if (get_later_stmt (il_b, s) == il_b)
266 il_b = s;
268 else
269 il_b = stmtinfo_b;
270 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
271 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
274 /* A subroutine of vect_analyze_data_ref_dependence. Handle
275 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
276 distances. These distances are conservatively correct but they don't
277 reflect a guaranteed dependence.
279 Return true if this function does all the work necessary to avoid
280 an alias or false if the caller should use the dependence distances
281 to limit the vectorization factor in the usual way. LOOP_DEPTH is
282 the depth of the loop described by LOOP_VINFO and the other arguments
283 are as for vect_analyze_data_ref_dependence. */
285 static bool
286 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
287 loop_vec_info loop_vinfo,
288 int loop_depth, unsigned int *max_vf)
290 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
291 lambda_vector dist_v;
292 unsigned int i;
293 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
295 int dist = dist_v[loop_depth];
296 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
298 /* If the user asserted safelen >= DIST consecutive iterations
299 can be executed concurrently, assume independence.
301 ??? An alternative would be to add the alias check even
302 in this case, and vectorize the fallback loop with the
303 maximum VF set to safelen. However, if the user has
304 explicitly given a length, it's less likely that that
305 would be a win. */
306 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
308 if ((unsigned int) loop->safelen < *max_vf)
309 *max_vf = loop->safelen;
310 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
311 continue;
314 /* For dependence distances of 2 or more, we have the option
315 of limiting VF or checking for an alias at runtime.
316 Prefer to check at runtime if we can, to avoid limiting
317 the VF unnecessarily when the bases are in fact independent.
319 Note that the alias checks will be removed if the VF ends up
320 being small enough. */
321 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
322 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
323 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
324 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
325 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
328 return true;
332 /* Function vect_analyze_data_ref_dependence.
334 FIXME: I needed to change the sense of the returned flag.
336 Return FALSE if there (might) exist a dependence between a memory-reference
337 DRA and a memory-reference DRB. When versioning for alias may check a
338 dependence at run-time, return TRUE. Adjust *MAX_VF according to
339 the data dependence. */
341 static opt_result
342 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 unsigned int *max_vf)
346 unsigned int i;
347 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
348 struct data_reference *dra = DDR_A (ddr);
349 struct data_reference *drb = DDR_B (ddr);
350 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
351 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
352 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
353 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
354 lambda_vector dist_v;
355 unsigned int loop_depth;
357 /* In loop analysis all data references should be vectorizable. */
358 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
359 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
360 gcc_unreachable ();
362 /* Independent data accesses. */
363 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
364 return opt_result::success ();
366 if (dra == drb
367 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
368 return opt_result::success ();
370 /* We do not have to consider dependences between accesses that belong
371 to the same group, unless the stride could be smaller than the
372 group size. */
373 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
374 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
375 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
376 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
377 return opt_result::success ();
379 /* Even if we have an anti-dependence then, as the vectorized loop covers at
380 least two scalar iterations, there is always also a true dependence.
381 As the vectorizer does not re-order loads and stores we can ignore
382 the anti-dependence if TBAA can disambiguate both DRs similar to the
383 case with known negative distance anti-dependences (positive
384 distance anti-dependences would violate TBAA constraints). */
385 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
386 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
387 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
388 get_alias_set (DR_REF (drb))))
389 return opt_result::success ();
391 /* Unknown data dependence. */
392 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
394 /* If user asserted safelen consecutive iterations can be
395 executed concurrently, assume independence. */
396 if (loop->safelen >= 2)
398 if ((unsigned int) loop->safelen < *max_vf)
399 *max_vf = loop->safelen;
400 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
401 return opt_result::success ();
404 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
405 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
406 return opt_result::failure_at
407 (stmtinfo_a->stmt,
408 "versioning for alias not supported for: "
409 "can't determine dependence between %T and %T\n",
410 DR_REF (dra), DR_REF (drb));
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
414 "versioning for alias required: "
415 "can't determine dependence between %T and %T\n",
416 DR_REF (dra), DR_REF (drb));
418 /* Add to list of ddrs that need to be tested at run-time. */
419 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
422 /* Known data dependence. */
423 if (DDR_NUM_DIST_VECTS (ddr) == 0)
425 /* If user asserted safelen consecutive iterations can be
426 executed concurrently, assume independence. */
427 if (loop->safelen >= 2)
429 if ((unsigned int) loop->safelen < *max_vf)
430 *max_vf = loop->safelen;
431 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
432 return opt_result::success ();
435 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
436 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
437 return opt_result::failure_at
438 (stmtinfo_a->stmt,
439 "versioning for alias not supported for: "
440 "bad dist vector for %T and %T\n",
441 DR_REF (dra), DR_REF (drb));
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
445 "versioning for alias required: "
446 "bad dist vector for %T and %T\n",
447 DR_REF (dra), DR_REF (drb));
448 /* Add to list of ddrs that need to be tested at run-time. */
449 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
452 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
455 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
456 loop_depth, max_vf))
457 return opt_result::success ();
459 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461 int dist = dist_v[loop_depth];
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "dependence distance = %d.\n", dist);
467 if (dist == 0)
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "dependence distance == 0 between %T and %T\n",
472 DR_REF (dra), DR_REF (drb));
474 /* When we perform grouped accesses and perform implicit CSE
475 by detecting equal accesses and doing disambiguation with
476 runtime alias tests like for
477 .. = a[i];
478 .. = a[i+1];
479 a[i] = ..;
480 a[i+1] = ..;
481 *p = ..;
482 .. = a[i];
483 .. = a[i+1];
484 where we will end up loading { a[i], a[i+1] } once, make
485 sure that inserting group loads before the first load and
486 stores after the last store will do the right thing.
487 Similar for groups like
488 a[i] = ...;
489 ... = a[i];
490 a[i+1] = ...;
491 where loads from the group interleave with the store. */
492 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
493 return opt_result::failure_at (stmtinfo_a->stmt,
494 "READ_WRITE dependence"
495 " in interleaving.\n");
497 if (loop->safelen < 2)
499 tree indicator = dr_zero_step_indicator (dra);
500 if (!indicator || integer_zerop (indicator))
501 return opt_result::failure_at (stmtinfo_a->stmt,
502 "access also has a zero step\n");
503 else if (TREE_CODE (indicator) != INTEGER_CST)
504 vect_check_nonzero_value (loop_vinfo, indicator);
506 continue;
509 if (dist > 0 && DDR_REVERSED_P (ddr))
511 /* If DDR_REVERSED_P the order of the data-refs in DDR was
512 reversed (to make distance vector positive), and the actual
513 distance is negative. */
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "dependence distance negative.\n");
517 /* When doing outer loop vectorization, we need to check if there is
518 a backward dependence at the inner loop level if the dependence
519 at the outer loop is reversed. See PR81740. */
520 if (nested_in_vect_loop_p (loop, stmtinfo_a)
521 || nested_in_vect_loop_p (loop, stmtinfo_b))
523 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
524 DDR_LOOP_NEST (ddr));
525 if (dist_v[inner_depth] < 0)
526 return opt_result::failure_at (stmtinfo_a->stmt,
527 "not vectorized, dependence "
528 "between data-refs %T and %T\n",
529 DR_REF (dra), DR_REF (drb));
531 /* Record a negative dependence distance to later limit the
532 amount of stmt copying / unrolling we can perform.
533 Only need to handle read-after-write dependence. */
534 if (DR_IS_READ (drb)
535 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
536 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
537 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
538 continue;
541 unsigned int abs_dist = abs (dist);
542 if (abs_dist >= 2 && abs_dist < *max_vf)
544 /* The dependence distance requires reduction of the maximal
545 vectorization factor. */
546 *max_vf = abs_dist;
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location,
549 "adjusting maximal vectorization factor to %i\n",
550 *max_vf);
553 if (abs_dist >= *max_vf)
555 /* Dependence distance does not create dependence, as far as
556 vectorization is concerned, in this case. */
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "dependence distance >= VF.\n");
560 continue;
563 return opt_result::failure_at (stmtinfo_a->stmt,
564 "not vectorized, possible dependence "
565 "between data-refs %T and %T\n",
566 DR_REF (dra), DR_REF (drb));
569 return opt_result::success ();
572 /* Function vect_analyze_data_ref_dependences.
574 Examine all the data references in the loop, and make sure there do not
575 exist any data dependences between them. Set *MAX_VF according to
576 the maximum vectorization factor the data dependences allow. */
578 opt_result
579 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
580 unsigned int *max_vf)
582 unsigned int i;
583 struct data_dependence_relation *ddr;
585 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589 LOOP_VINFO_DDRS (loop_vinfo)
590 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
591 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
592 /* We do not need read-read dependences. */
593 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
594 &LOOP_VINFO_DDRS (loop_vinfo),
595 LOOP_VINFO_LOOP_NEST (loop_vinfo),
596 false);
597 gcc_assert (res);
600 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
602 /* For epilogues we either have no aliases or alias versioning
603 was applied to original loop. Therefore we may just get max_vf
604 using VF of original loop. */
605 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
606 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
607 else
608 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
610 opt_result res
611 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
612 if (!res)
613 return res;
616 return opt_result::success ();
620 /* Function vect_slp_analyze_data_ref_dependence.
622 Return TRUE if there (might) exist a dependence between a memory-reference
623 DRA and a memory-reference DRB for VINFO. When versioning for alias
624 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
625 according to the data dependence. */
627 static bool
628 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
629 struct data_dependence_relation *ddr)
631 struct data_reference *dra = DDR_A (ddr);
632 struct data_reference *drb = DDR_B (ddr);
633 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
634 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
636 /* We need to check dependences of statements marked as unvectorizable
637 as well, they still can prohibit vectorization. */
639 /* Independent data accesses. */
640 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
641 return false;
643 if (dra == drb)
644 return false;
646 /* Read-read is OK. */
647 if (DR_IS_READ (dra) && DR_IS_READ (drb))
648 return false;
650 /* If dra and drb are part of the same interleaving chain consider
651 them independent. */
652 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
653 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
654 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
655 return false;
657 /* Unknown data dependence. */
658 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
662 "can't determine dependence between %T and %T\n",
663 DR_REF (dra), DR_REF (drb));
665 else if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE, vect_location,
667 "determined dependence between %T and %T\n",
668 DR_REF (dra), DR_REF (drb));
670 return true;
674 /* Analyze dependences involved in the transform of SLP NODE. STORES
675 contain the vector of scalar stores of this instance if we are
676 disambiguating the loads. */
678 static bool
679 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
680 vec<stmt_vec_info> stores,
681 stmt_vec_info last_store_info)
683 /* This walks over all stmts involved in the SLP load/store done
684 in NODE verifying we can sink them up to the last stmt in the
685 group. */
686 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
688 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
689 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
691 stmt_vec_info access_info
692 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
693 if (access_info == last_access_info)
694 continue;
695 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
696 ao_ref ref;
697 bool ref_initialized_p = false;
698 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
699 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
701 gimple *stmt = gsi_stmt (gsi);
702 if (! gimple_vuse (stmt))
703 continue;
705 /* If we couldn't record a (single) data reference for this
706 stmt we have to resort to the alias oracle. */
707 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
708 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
709 if (!dr_b)
711 /* We are moving a store - this means
712 we cannot use TBAA for disambiguation. */
713 if (!ref_initialized_p)
714 ao_ref_init (&ref, DR_REF (dr_a));
715 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
716 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
717 return false;
718 continue;
721 bool dependent = false;
722 /* If we run into a store of this same instance (we've just
723 marked those) then delay dependence checking until we run
724 into the last store because this is where it will have
725 been sunk to (and we verify if we can do that as well). */
726 if (gimple_visited_p (stmt))
728 if (stmt_info != last_store_info)
729 continue;
730 unsigned i;
731 stmt_vec_info store_info;
732 FOR_EACH_VEC_ELT (stores, i, store_info)
734 data_reference *store_dr
735 = STMT_VINFO_DATA_REF (store_info);
736 ddr_p ddr = initialize_data_dependence_relation
737 (dr_a, store_dr, vNULL);
738 dependent
739 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
740 free_dependence_relation (ddr);
741 if (dependent)
742 break;
745 else
747 ddr_p ddr = initialize_data_dependence_relation (dr_a,
748 dr_b, vNULL);
749 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
750 free_dependence_relation (ddr);
752 if (dependent)
753 return false;
757 else /* DR_IS_READ */
759 stmt_vec_info first_access_info
760 = vect_find_first_scalar_stmt_in_slp (node);
761 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
763 stmt_vec_info access_info
764 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
765 if (access_info == first_access_info)
766 continue;
767 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
768 ao_ref ref;
769 bool ref_initialized_p = false;
770 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
771 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
773 gimple *stmt = gsi_stmt (gsi);
774 if (! gimple_vdef (stmt))
775 continue;
777 /* If we couldn't record a (single) data reference for this
778 stmt we have to resort to the alias oracle. */
779 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
780 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
781 if (!dr_b)
783 /* We are hoisting a load - this means we can use
784 TBAA for disambiguation. */
785 if (!ref_initialized_p)
786 ao_ref_init (&ref, DR_REF (dr_a));
787 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
788 return false;
789 continue;
792 bool dependent = false;
793 /* If we run into a store of this same instance (we've just
794 marked those) then delay dependence checking until we run
795 into the last store because this is where it will have
796 been sunk to (and we verify if we can do that as well). */
797 if (gimple_visited_p (stmt))
799 if (stmt_info != last_store_info)
800 continue;
801 unsigned i;
802 stmt_vec_info store_info;
803 FOR_EACH_VEC_ELT (stores, i, store_info)
805 data_reference *store_dr
806 = STMT_VINFO_DATA_REF (store_info);
807 ddr_p ddr = initialize_data_dependence_relation
808 (dr_a, store_dr, vNULL);
809 dependent
810 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
811 free_dependence_relation (ddr);
812 if (dependent)
813 break;
816 else
818 ddr_p ddr = initialize_data_dependence_relation (dr_a,
819 dr_b, vNULL);
820 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
821 free_dependence_relation (ddr);
823 if (dependent)
824 return false;
828 return true;
832 /* Function vect_analyze_data_ref_dependences.
834 Examine all the data references in the basic-block, and make sure there
835 do not exist any data dependences between them. Set *MAX_VF according to
836 the maximum vectorization factor the data dependences allow. */
838 bool
839 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
841 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
843 /* The stores of this instance are at the root of the SLP tree. */
844 slp_tree store = SLP_INSTANCE_TREE (instance);
845 if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store)))
846 store = NULL;
848 /* Verify we can sink stores to the vectorized stmt insert location. */
849 stmt_vec_info last_store_info = NULL;
850 if (store)
852 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
853 return false;
855 /* Mark stores in this instance and remember the last one. */
856 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
857 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
858 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
861 bool res = true;
863 /* Verify we can sink loads to the vectorized stmt insert location,
864 special-casing stores of this instance. */
865 slp_tree load;
866 unsigned int i;
867 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
868 if (! vect_slp_analyze_node_dependences (vinfo, load,
869 store
870 ? SLP_TREE_SCALAR_STMTS (store)
871 : vNULL, last_store_info))
873 res = false;
874 break;
877 /* Unset the visited flag. */
878 if (store)
879 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
880 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
882 return res;
885 /* Record the base alignment guarantee given by DRB, which occurs
886 in STMT_INFO. */
888 static void
889 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
890 innermost_loop_behavior *drb)
892 bool existed;
893 innermost_loop_behavior *&entry
894 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
895 if (!existed || entry->base_alignment < drb->base_alignment)
897 entry = drb;
898 if (dump_enabled_p ())
899 dump_printf_loc (MSG_NOTE, vect_location,
900 "recording new base alignment for %T\n"
901 " alignment: %d\n"
902 " misalignment: %d\n"
903 " based on: %G",
904 drb->base_address,
905 drb->base_alignment,
906 drb->base_misalignment,
907 stmt_info->stmt);
911 /* If the region we're going to vectorize is reached, all unconditional
912 data references occur at least once. We can therefore pool the base
913 alignment guarantees from each unconditional reference. Do this by
914 going through all the data references in VINFO and checking whether
915 the containing statement makes the reference unconditionally. If so,
916 record the alignment of the base address in VINFO so that it can be
917 used for all other references with the same base. */
919 void
920 vect_record_base_alignments (vec_info *vinfo)
922 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
923 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
924 data_reference *dr;
925 unsigned int i;
926 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
928 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
929 stmt_vec_info stmt_info = dr_info->stmt;
930 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
931 && STMT_VINFO_VECTORIZABLE (stmt_info)
932 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
934 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
936 /* If DR is nested in the loop that is being vectorized, we can also
937 record the alignment of the base wrt the outer loop. */
938 if (loop && nested_in_vect_loop_p (loop, stmt_info))
939 vect_record_base_alignment
940 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
945 /* Return the target alignment for the vectorized form of DR_INFO. */
947 static poly_uint64
948 vect_calculate_target_alignment (dr_vec_info *dr_info)
950 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
951 return targetm.vectorize.preferred_vector_alignment (vectype);
954 /* Function vect_compute_data_ref_alignment
956 Compute the misalignment of the data reference DR_INFO.
958 Output:
959 1. DR_MISALIGNMENT (DR_INFO) is defined.
961 FOR NOW: No analysis is actually performed. Misalignment is calculated
962 only for trivial cases. TODO. */
964 static void
965 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
967 stmt_vec_info stmt_info = dr_info->stmt;
968 vec_base_alignments *base_alignments = &vinfo->base_alignments;
969 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
970 class loop *loop = NULL;
971 tree ref = DR_REF (dr_info->dr);
972 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
974 if (dump_enabled_p ())
975 dump_printf_loc (MSG_NOTE, vect_location,
976 "vect_compute_data_ref_alignment:\n");
978 if (loop_vinfo)
979 loop = LOOP_VINFO_LOOP (loop_vinfo);
981 /* Initialize misalignment to unknown. */
982 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
984 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
985 return;
987 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
988 bool step_preserves_misalignment_p;
990 poly_uint64 vector_alignment
991 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
992 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
994 /* If the main loop has peeled for alignment we have no way of knowing
995 whether the data accesses in the epilogues are aligned. We can't at
996 compile time answer the question whether we have entered the main loop or
997 not. Fixes PR 92351. */
998 if (loop_vinfo)
1000 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1001 if (orig_loop_vinfo
1002 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1003 return;
1006 unsigned HOST_WIDE_INT vect_align_c;
1007 if (!vector_alignment.is_constant (&vect_align_c))
1008 return;
1010 /* No step for BB vectorization. */
1011 if (!loop)
1013 gcc_assert (integer_zerop (drb->step));
1014 step_preserves_misalignment_p = true;
1017 /* In case the dataref is in an inner-loop of the loop that is being
1018 vectorized (LOOP), we use the base and misalignment information
1019 relative to the outer-loop (LOOP). This is ok only if the misalignment
1020 stays the same throughout the execution of the inner-loop, which is why
1021 we have to check that the stride of the dataref in the inner-loop evenly
1022 divides by the vector alignment. */
1023 else if (nested_in_vect_loop_p (loop, stmt_info))
1025 step_preserves_misalignment_p
1026 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1028 if (dump_enabled_p ())
1030 if (step_preserves_misalignment_p)
1031 dump_printf_loc (MSG_NOTE, vect_location,
1032 "inner step divides the vector alignment.\n");
1033 else
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "inner step doesn't divide the vector"
1036 " alignment.\n");
1040 /* Similarly we can only use base and misalignment information relative to
1041 an innermost loop if the misalignment stays the same throughout the
1042 execution of the loop. As above, this is the case if the stride of
1043 the dataref evenly divides by the alignment. */
1044 else
1046 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1047 step_preserves_misalignment_p
1048 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1050 if (!step_preserves_misalignment_p && dump_enabled_p ())
1051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1052 "step doesn't divide the vector alignment.\n");
1055 unsigned int base_alignment = drb->base_alignment;
1056 unsigned int base_misalignment = drb->base_misalignment;
1058 /* Calculate the maximum of the pooled base address alignment and the
1059 alignment that we can compute for DR itself. */
1060 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1061 if (entry && base_alignment < (*entry)->base_alignment)
1063 base_alignment = (*entry)->base_alignment;
1064 base_misalignment = (*entry)->base_misalignment;
1067 if (drb->offset_alignment < vect_align_c
1068 || !step_preserves_misalignment_p
1069 /* We need to know whether the step wrt the vectorized loop is
1070 negative when computing the starting misalignment below. */
1071 || TREE_CODE (drb->step) != INTEGER_CST)
1073 if (dump_enabled_p ())
1074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1075 "Unknown alignment for access: %T\n", ref);
1076 return;
1079 if (base_alignment < vect_align_c)
1081 unsigned int max_alignment;
1082 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1083 if (max_alignment < vect_align_c
1084 || !vect_can_force_dr_alignment_p (base,
1085 vect_align_c * BITS_PER_UNIT))
1087 if (dump_enabled_p ())
1088 dump_printf_loc (MSG_NOTE, vect_location,
1089 "can't force alignment of ref: %T\n", ref);
1090 return;
1093 /* Force the alignment of the decl.
1094 NOTE: This is the only change to the code we make during
1095 the analysis phase, before deciding to vectorize the loop. */
1096 if (dump_enabled_p ())
1097 dump_printf_loc (MSG_NOTE, vect_location,
1098 "force alignment of %T\n", ref);
1100 dr_info->base_decl = base;
1101 dr_info->base_misaligned = true;
1102 base_misalignment = 0;
1104 poly_int64 misalignment
1105 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1107 /* If this is a backward running DR then first access in the larger
1108 vectype actually is N-1 elements before the address in the DR.
1109 Adjust misalign accordingly. */
1110 if (tree_int_cst_sgn (drb->step) < 0)
1111 /* PLUS because STEP is negative. */
1112 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1113 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1115 unsigned int const_misalignment;
1116 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120 "Non-constant misalignment for access: %T\n", ref);
1121 return;
1124 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "misalign = %d bytes of ref %T\n",
1129 DR_MISALIGNMENT (dr_info), ref);
1131 return;
1134 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1135 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1136 is made aligned via peeling. */
1138 static bool
1139 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1140 dr_vec_info *dr_peel_info)
1142 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1143 DR_TARGET_ALIGNMENT (dr_info)))
1145 poly_offset_int diff
1146 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1147 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1148 if (known_eq (diff, 0)
1149 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1150 return true;
1152 return false;
1155 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1156 aligned via peeling. */
1158 static bool
1159 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1160 dr_vec_info *dr_peel_info)
1162 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1163 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1164 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1165 DR_OFFSET (dr_peel_info->dr), 0)
1166 || !operand_equal_p (DR_STEP (dr_info->dr),
1167 DR_STEP (dr_peel_info->dr), 0))
1168 return false;
1170 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1173 /* Function vect_update_misalignment_for_peel.
1174 Sets DR_INFO's misalignment
1175 - to 0 if it has the same alignment as DR_PEEL_INFO,
1176 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1177 - to -1 (unknown) otherwise.
1179 DR_INFO - the data reference whose misalignment is to be adjusted.
1180 DR_PEEL_INFO - the data reference whose misalignment is being made
1181 zero in the vector loop by the peel.
1182 NPEEL - the number of iterations in the peel loop if the misalignment
1183 of DR_PEEL_INFO is known at compile time. */
1185 static void
1186 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1187 dr_vec_info *dr_peel_info, int npeel)
1189 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1190 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1192 SET_DR_MISALIGNMENT (dr_info, 0);
1193 return;
1196 unsigned HOST_WIDE_INT alignment;
1197 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1198 && known_alignment_for_access_p (dr_info)
1199 && known_alignment_for_access_p (dr_peel_info))
1201 int misal = DR_MISALIGNMENT (dr_info);
1202 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1203 misal &= alignment - 1;
1204 SET_DR_MISALIGNMENT (dr_info, misal);
1205 return;
1208 if (dump_enabled_p ())
1209 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1210 "to unknown (-1).\n");
1211 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1214 /* Return true if alignment is relevant for DR_INFO. */
1216 static bool
1217 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1219 stmt_vec_info stmt_info = dr_info->stmt;
1221 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1222 return false;
1224 /* For interleaving, only the alignment of the first access matters. */
1225 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1226 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1227 return false;
1229 /* Scatter-gather and invariant accesses continue to address individual
1230 scalars, so vector-level alignment is irrelevant. */
1231 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1232 || integer_zerop (DR_STEP (dr_info->dr)))
1233 return false;
1235 /* Strided accesses perform only component accesses, alignment is
1236 irrelevant for them. */
1237 if (STMT_VINFO_STRIDED_P (stmt_info)
1238 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1239 return false;
1241 return true;
1244 /* Given an memory reference EXP return whether its alignment is less
1245 than its size. */
1247 static bool
1248 not_size_aligned (tree exp)
1250 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1251 return true;
1253 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1254 > get_object_alignment (exp));
1257 /* Function vector_alignment_reachable_p
1259 Return true if vector alignment for DR_INFO is reachable by peeling
1260 a few loop iterations. Return false otherwise. */
1262 static bool
1263 vector_alignment_reachable_p (dr_vec_info *dr_info)
1265 stmt_vec_info stmt_info = dr_info->stmt;
1266 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1268 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1270 /* For interleaved access we peel only if number of iterations in
1271 the prolog loop ({VF - misalignment}), is a multiple of the
1272 number of the interleaved accesses. */
1273 int elem_size, mis_in_elements;
1275 /* FORNOW: handle only known alignment. */
1276 if (!known_alignment_for_access_p (dr_info))
1277 return false;
1279 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1280 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1281 elem_size = vector_element_size (vector_size, nelements);
1282 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1284 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1285 return false;
1288 /* If misalignment is known at the compile time then allow peeling
1289 only if natural alignment is reachable through peeling. */
1290 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1292 HOST_WIDE_INT elmsize =
1293 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1294 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "data size = %wd. misalignment = %d.\n", elmsize,
1298 DR_MISALIGNMENT (dr_info));
1300 if (DR_MISALIGNMENT (dr_info) % elmsize)
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1304 "data size does not divide the misalignment.\n");
1305 return false;
1309 if (!known_alignment_for_access_p (dr_info))
1311 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1312 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "Unknown misalignment, %snaturally aligned\n",
1316 is_packed ? "not " : "");
1317 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1320 return true;
1324 /* Calculate the cost of the memory access represented by DR_INFO. */
1326 static void
1327 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1328 unsigned int *inside_cost,
1329 unsigned int *outside_cost,
1330 stmt_vector_for_cost *body_cost_vec,
1331 stmt_vector_for_cost *prologue_cost_vec)
1333 stmt_vec_info stmt_info = dr_info->stmt;
1334 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1335 int ncopies;
1337 if (PURE_SLP_STMT (stmt_info))
1338 ncopies = 1;
1339 else
1340 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1342 if (DR_IS_READ (dr_info->dr))
1343 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1344 outside_cost, prologue_cost_vec, body_cost_vec, false);
1345 else
1346 vect_get_store_cost (vinfo,stmt_info, ncopies, inside_cost, body_cost_vec);
1348 if (dump_enabled_p ())
1349 dump_printf_loc (MSG_NOTE, vect_location,
1350 "vect_get_data_access_cost: inside_cost = %d, "
1351 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1355 typedef struct _vect_peel_info
1357 dr_vec_info *dr_info;
1358 int npeel;
1359 unsigned int count;
1360 } *vect_peel_info;
1362 typedef struct _vect_peel_extended_info
1364 vec_info *vinfo;
1365 struct _vect_peel_info peel_info;
1366 unsigned int inside_cost;
1367 unsigned int outside_cost;
1368 } *vect_peel_extended_info;
1371 /* Peeling hashtable helpers. */
1373 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1375 static inline hashval_t hash (const _vect_peel_info *);
1376 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1379 inline hashval_t
1380 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1382 return (hashval_t) peel_info->npeel;
1385 inline bool
1386 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1388 return (a->npeel == b->npeel);
1392 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1394 static void
1395 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1396 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1397 int npeel)
1399 struct _vect_peel_info elem, *slot;
1400 _vect_peel_info **new_slot;
1401 bool supportable_dr_alignment
1402 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1404 elem.npeel = npeel;
1405 slot = peeling_htab->find (&elem);
1406 if (slot)
1407 slot->count++;
1408 else
1410 slot = XNEW (struct _vect_peel_info);
1411 slot->npeel = npeel;
1412 slot->dr_info = dr_info;
1413 slot->count = 1;
1414 new_slot = peeling_htab->find_slot (slot, INSERT);
1415 *new_slot = slot;
1418 if (!supportable_dr_alignment
1419 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1420 slot->count += VECT_MAX_COST;
1424 /* Traverse peeling hash table to find peeling option that aligns maximum
1425 number of data accesses. */
1428 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1429 _vect_peel_extended_info *max)
1431 vect_peel_info elem = *slot;
1433 if (elem->count > max->peel_info.count
1434 || (elem->count == max->peel_info.count
1435 && max->peel_info.npeel > elem->npeel))
1437 max->peel_info.npeel = elem->npeel;
1438 max->peel_info.count = elem->count;
1439 max->peel_info.dr_info = elem->dr_info;
1442 return 1;
1445 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1446 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1447 we assume DR0_INFO's misalignment will be zero after peeling. */
1449 static void
1450 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1451 dr_vec_info *dr0_info,
1452 unsigned int *inside_cost,
1453 unsigned int *outside_cost,
1454 stmt_vector_for_cost *body_cost_vec,
1455 stmt_vector_for_cost *prologue_cost_vec,
1456 unsigned int npeel,
1457 bool unknown_misalignment)
1459 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1460 unsigned i;
1461 data_reference *dr;
1463 FOR_EACH_VEC_ELT (datarefs, i, dr)
1465 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1466 if (!vect_relevant_for_alignment_p (dr_info))
1467 continue;
1469 int save_misalignment;
1470 save_misalignment = DR_MISALIGNMENT (dr_info);
1471 if (npeel == 0)
1473 else if (unknown_misalignment && dr_info == dr0_info)
1474 SET_DR_MISALIGNMENT (dr_info, 0);
1475 else
1476 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1477 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1478 body_cost_vec, prologue_cost_vec);
1479 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1483 /* Traverse peeling hash table and calculate cost for each peeling option.
1484 Find the one with the lowest cost. */
1487 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1488 _vect_peel_extended_info *min)
1490 vect_peel_info elem = *slot;
1491 int dummy;
1492 unsigned int inside_cost = 0, outside_cost = 0;
1493 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1494 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1495 epilogue_cost_vec;
1497 prologue_cost_vec.create (2);
1498 body_cost_vec.create (2);
1499 epilogue_cost_vec.create (2);
1501 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1502 &outside_cost, &body_cost_vec,
1503 &prologue_cost_vec, elem->npeel, false);
1505 body_cost_vec.release ();
1507 outside_cost += vect_get_known_peeling_cost
1508 (loop_vinfo, elem->npeel, &dummy,
1509 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1510 &prologue_cost_vec, &epilogue_cost_vec);
1512 /* Prologue and epilogue costs are added to the target model later.
1513 These costs depend only on the scalar iteration cost, the
1514 number of peeling iterations finally chosen, and the number of
1515 misaligned statements. So discard the information found here. */
1516 prologue_cost_vec.release ();
1517 epilogue_cost_vec.release ();
1519 if (inside_cost < min->inside_cost
1520 || (inside_cost == min->inside_cost
1521 && outside_cost < min->outside_cost))
1523 min->inside_cost = inside_cost;
1524 min->outside_cost = outside_cost;
1525 min->peel_info.dr_info = elem->dr_info;
1526 min->peel_info.npeel = elem->npeel;
1527 min->peel_info.count = elem->count;
1530 return 1;
1534 /* Choose best peeling option by traversing peeling hash table and either
1535 choosing an option with the lowest cost (if cost model is enabled) or the
1536 option that aligns as many accesses as possible. */
1538 static struct _vect_peel_extended_info
1539 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1540 loop_vec_info loop_vinfo)
1542 struct _vect_peel_extended_info res;
1544 res.peel_info.dr_info = NULL;
1545 res.vinfo = loop_vinfo;
1547 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1549 res.inside_cost = INT_MAX;
1550 res.outside_cost = INT_MAX;
1551 peeling_htab->traverse <_vect_peel_extended_info *,
1552 vect_peeling_hash_get_lowest_cost> (&res);
1554 else
1556 res.peel_info.count = 0;
1557 peeling_htab->traverse <_vect_peel_extended_info *,
1558 vect_peeling_hash_get_most_frequent> (&res);
1559 res.inside_cost = 0;
1560 res.outside_cost = 0;
1563 return res;
1566 /* Return true if the new peeling NPEEL is supported. */
1568 static bool
1569 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1570 unsigned npeel)
1572 unsigned i;
1573 struct data_reference *dr = NULL;
1574 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1575 enum dr_alignment_support supportable_dr_alignment;
1577 /* Ensure that all data refs can be vectorized after the peel. */
1578 FOR_EACH_VEC_ELT (datarefs, i, dr)
1580 int save_misalignment;
1582 if (dr == dr0_info->dr)
1583 continue;
1585 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1586 if (!vect_relevant_for_alignment_p (dr_info))
1587 continue;
1589 save_misalignment = DR_MISALIGNMENT (dr_info);
1590 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1591 supportable_dr_alignment
1592 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1593 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1595 if (!supportable_dr_alignment)
1596 return false;
1599 return true;
1602 /* Compare two data-references DRA and DRB to group them into chunks
1603 with related alignment. */
1605 static int
1606 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1608 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1609 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1610 int cmp;
1612 /* Stabilize sort. */
1613 if (dra == drb)
1614 return 0;
1616 /* Ordering of DRs according to base. */
1617 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1618 DR_BASE_ADDRESS (drb));
1619 if (cmp != 0)
1620 return cmp;
1622 /* And according to DR_OFFSET. */
1623 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1624 if (cmp != 0)
1625 return cmp;
1627 /* And after step. */
1628 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1629 if (cmp != 0)
1630 return cmp;
1632 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1633 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1634 if (cmp == 0)
1635 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1636 return cmp;
1639 /* Function vect_enhance_data_refs_alignment
1641 This pass will use loop versioning and loop peeling in order to enhance
1642 the alignment of data references in the loop.
1644 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1645 original loop is to be vectorized. Any other loops that are created by
1646 the transformations performed in this pass - are not supposed to be
1647 vectorized. This restriction will be relaxed.
1649 This pass will require a cost model to guide it whether to apply peeling
1650 or versioning or a combination of the two. For example, the scheme that
1651 Intel uses when given a loop with several memory accesses, is as follows:
1652 choose one memory access ('p') which alignment you want to force by doing
1653 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1654 other accesses are not necessarily aligned, or (2) use loop versioning to
1655 generate one loop in which all accesses are aligned, and another loop in
1656 which only 'p' is necessarily aligned.
1658 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1659 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1660 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1662 Devising a cost model is the most critical aspect of this work. It will
1663 guide us on which access to peel for, whether to use loop versioning, how
1664 many versions to create, etc. The cost model will probably consist of
1665 generic considerations as well as target specific considerations (on
1666 powerpc for example, misaligned stores are more painful than misaligned
1667 loads).
1669 Here are the general steps involved in alignment enhancements:
1671 -- original loop, before alignment analysis:
1672 for (i=0; i<N; i++){
1673 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1674 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1677 -- After vect_compute_data_refs_alignment:
1678 for (i=0; i<N; i++){
1679 x = q[i]; # DR_MISALIGNMENT(q) = 3
1680 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1683 -- Possibility 1: we do loop versioning:
1684 if (p is aligned) {
1685 for (i=0; i<N; i++){ # loop 1A
1686 x = q[i]; # DR_MISALIGNMENT(q) = 3
1687 p[i] = y; # DR_MISALIGNMENT(p) = 0
1690 else {
1691 for (i=0; i<N; i++){ # loop 1B
1692 x = q[i]; # DR_MISALIGNMENT(q) = 3
1693 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1697 -- Possibility 2: we do loop peeling:
1698 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1699 x = q[i];
1700 p[i] = y;
1702 for (i = 3; i < N; i++){ # loop 2A
1703 x = q[i]; # DR_MISALIGNMENT(q) = 0
1704 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1707 -- Possibility 3: combination of loop peeling and versioning:
1708 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1709 x = q[i];
1710 p[i] = y;
1712 if (p is aligned) {
1713 for (i = 3; i<N; i++){ # loop 3A
1714 x = q[i]; # DR_MISALIGNMENT(q) = 0
1715 p[i] = y; # DR_MISALIGNMENT(p) = 0
1718 else {
1719 for (i = 3; i<N; i++){ # loop 3B
1720 x = q[i]; # DR_MISALIGNMENT(q) = 0
1721 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1725 These loops are later passed to loop_transform to be vectorized. The
1726 vectorizer will use the alignment information to guide the transformation
1727 (whether to generate regular loads/stores, or with special handling for
1728 misalignment). */
1730 opt_result
1731 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1733 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1734 enum dr_alignment_support supportable_dr_alignment;
1735 dr_vec_info *first_store = NULL;
1736 dr_vec_info *dr0_info = NULL;
1737 struct data_reference *dr;
1738 unsigned int i;
1739 bool do_peeling = false;
1740 bool do_versioning = false;
1741 unsigned int npeel = 0;
1742 bool one_misalignment_known = false;
1743 bool one_misalignment_unknown = false;
1744 bool one_dr_unsupportable = false;
1745 dr_vec_info *unsupportable_dr_info = NULL;
1746 unsigned int mis, dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1747 hash_table<peel_info_hasher> peeling_htab (1);
1749 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1751 /* Reset data so we can safely be called multiple times. */
1752 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1753 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1755 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1756 return opt_result::success ();
1758 /* Sort the vector of datarefs so DRs that have the same or dependent
1759 alignment are next to each other. */
1760 auto_vec<data_reference_p> datarefs
1761 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1762 datarefs.qsort (dr_align_group_sort_cmp);
1764 /* Compute the number of DRs that become aligned when we peel
1765 a dataref so it becomes aligned. */
1766 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1767 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1768 unsigned i0;
1769 for (i0 = 0; i0 < datarefs.length (); ++i0)
1770 if (DR_BASE_ADDRESS (datarefs[i0]))
1771 break;
1772 for (i = i0 + 1; i <= datarefs.length (); ++i)
1774 if (i == datarefs.length ()
1775 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1776 DR_BASE_ADDRESS (datarefs[i]), 0)
1777 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1778 DR_OFFSET (datarefs[i]), 0)
1779 || !operand_equal_p (DR_STEP (datarefs[i0]),
1780 DR_STEP (datarefs[i]), 0))
1782 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1783 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1784 will get known misalignment if we align one of the refs
1785 with the largest DR_TARGET_ALIGNMENT. */
1786 for (unsigned j = i0; j < i; ++j)
1788 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1789 for (unsigned k = i0; k < i; ++k)
1791 if (k == j)
1792 continue;
1793 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1794 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1795 dr_infoj))
1796 n_same_align_refs[j]++;
1799 i0 = i;
1803 /* While cost model enhancements are expected in the future, the high level
1804 view of the code at this time is as follows:
1806 A) If there is a misaligned access then see if peeling to align
1807 this access can make all data references satisfy
1808 vect_supportable_dr_alignment. If so, update data structures
1809 as needed and return true.
1811 B) If peeling wasn't possible and there is a data reference with an
1812 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1813 then see if loop versioning checks can be used to make all data
1814 references satisfy vect_supportable_dr_alignment. If so, update
1815 data structures as needed and return true.
1817 C) If neither peeling nor versioning were successful then return false if
1818 any data reference does not satisfy vect_supportable_dr_alignment.
1820 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1822 Note, Possibility 3 above (which is peeling and versioning together) is not
1823 being done at this time. */
1825 /* (1) Peeling to force alignment. */
1827 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1828 Considerations:
1829 + How many accesses will become aligned due to the peeling
1830 - How many accesses will become unaligned due to the peeling,
1831 and the cost of misaligned accesses.
1832 - The cost of peeling (the extra runtime checks, the increase
1833 in code size). */
1835 FOR_EACH_VEC_ELT (datarefs, i, dr)
1837 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1838 if (!vect_relevant_for_alignment_p (dr_info))
1839 continue;
1841 stmt_vec_info stmt_info = dr_info->stmt;
1842 supportable_dr_alignment
1843 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1844 do_peeling = vector_alignment_reachable_p (dr_info);
1845 if (do_peeling)
1847 if (known_alignment_for_access_p (dr_info))
1849 unsigned int npeel_tmp = 0;
1850 bool negative = tree_int_cst_compare (DR_STEP (dr),
1851 size_zero_node) < 0;
1853 /* If known_alignment_for_access_p then we have set
1854 DR_MISALIGNMENT which is only done if we know it at compiler
1855 time, so it is safe to assume target alignment is constant.
1857 unsigned int target_align =
1858 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1859 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1860 mis = (negative
1861 ? DR_MISALIGNMENT (dr_info)
1862 : -DR_MISALIGNMENT (dr_info));
1863 if (DR_MISALIGNMENT (dr_info) != 0)
1864 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1866 /* For multiple types, it is possible that the bigger type access
1867 will have more than one peeling option. E.g., a loop with two
1868 types: one of size (vector size / 4), and the other one of
1869 size (vector size / 8). Vectorization factor will be 8. If both
1870 accesses are misaligned by 3, the first one needs one scalar
1871 iteration to be aligned, and the second one needs 5. But the
1872 first one will be aligned also by peeling 5 scalar
1873 iterations, and in that case both accesses will be aligned.
1874 Hence, except for the immediate peeling amount, we also want
1875 to try to add full vector size, while we don't exceed
1876 vectorization factor.
1877 We do this automatically for cost model, since we calculate
1878 cost for every peeling option. */
1879 poly_uint64 nscalars = npeel_tmp;
1880 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1882 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1883 nscalars = (STMT_SLP_TYPE (stmt_info)
1884 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1887 /* Save info about DR in the hash table. Also include peeling
1888 amounts according to the explanation above. */
1889 while (known_le (npeel_tmp, nscalars))
1891 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1892 dr_info, npeel_tmp);
1893 npeel_tmp += MAX (1, target_align / dr_size);
1896 one_misalignment_known = true;
1898 else
1900 /* If we don't know any misalignment values, we prefer
1901 peeling for data-ref that has the maximum number of data-refs
1902 with the same alignment, unless the target prefers to align
1903 stores over load. */
1904 unsigned same_align_drs = n_same_align_refs[i];
1905 if (!dr0_info
1906 || dr0_same_align_drs < same_align_drs)
1908 dr0_same_align_drs = same_align_drs;
1909 dr0_info = dr_info;
1911 /* For data-refs with the same number of related
1912 accesses prefer the one where the misalign
1913 computation will be invariant in the outermost loop. */
1914 else if (dr0_same_align_drs == same_align_drs)
1916 class loop *ivloop0, *ivloop;
1917 ivloop0 = outermost_invariant_loop_for_expr
1918 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1919 ivloop = outermost_invariant_loop_for_expr
1920 (loop, DR_BASE_ADDRESS (dr));
1921 if ((ivloop && !ivloop0)
1922 || (ivloop && ivloop0
1923 && flow_loop_nested_p (ivloop, ivloop0)))
1924 dr0_info = dr_info;
1927 one_misalignment_unknown = true;
1929 /* Check for data refs with unsupportable alignment that
1930 can be peeled. */
1931 if (!supportable_dr_alignment)
1933 one_dr_unsupportable = true;
1934 unsupportable_dr_info = dr_info;
1937 if (!first_store && DR_IS_WRITE (dr))
1939 first_store = dr_info;
1940 first_store_same_align_drs = same_align_drs;
1944 else
1946 if (!aligned_access_p (dr_info))
1948 if (dump_enabled_p ())
1949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1950 "vector alignment may not be reachable\n");
1951 break;
1956 /* Check if we can possibly peel the loop. */
1957 if (!vect_can_advance_ivs_p (loop_vinfo)
1958 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1959 || loop->inner)
1960 do_peeling = false;
1962 struct _vect_peel_extended_info peel_for_known_alignment;
1963 struct _vect_peel_extended_info peel_for_unknown_alignment;
1964 struct _vect_peel_extended_info best_peel;
1966 peel_for_unknown_alignment.inside_cost = INT_MAX;
1967 peel_for_unknown_alignment.outside_cost = INT_MAX;
1968 peel_for_unknown_alignment.peel_info.count = 0;
1970 if (do_peeling
1971 && one_misalignment_unknown)
1973 /* Check if the target requires to prefer stores over loads, i.e., if
1974 misaligned stores are more expensive than misaligned loads (taking
1975 drs with same alignment into account). */
1976 unsigned int load_inside_cost = 0;
1977 unsigned int load_outside_cost = 0;
1978 unsigned int store_inside_cost = 0;
1979 unsigned int store_outside_cost = 0;
1980 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1982 stmt_vector_for_cost dummy;
1983 dummy.create (2);
1984 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1985 &load_inside_cost,
1986 &load_outside_cost,
1987 &dummy, &dummy, estimated_npeels, true);
1988 dummy.release ();
1990 if (first_store)
1992 dummy.create (2);
1993 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1994 &store_inside_cost,
1995 &store_outside_cost,
1996 &dummy, &dummy,
1997 estimated_npeels, true);
1998 dummy.release ();
2000 else
2002 store_inside_cost = INT_MAX;
2003 store_outside_cost = INT_MAX;
2006 if (load_inside_cost > store_inside_cost
2007 || (load_inside_cost == store_inside_cost
2008 && load_outside_cost > store_outside_cost))
2010 dr0_info = first_store;
2011 dr0_same_align_drs = first_store_same_align_drs;
2012 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2013 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2015 else
2017 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2018 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2021 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2022 prologue_cost_vec.create (2);
2023 epilogue_cost_vec.create (2);
2025 int dummy2;
2026 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2027 (loop_vinfo, estimated_npeels, &dummy2,
2028 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2029 &prologue_cost_vec, &epilogue_cost_vec);
2031 prologue_cost_vec.release ();
2032 epilogue_cost_vec.release ();
2034 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2037 peel_for_unknown_alignment.peel_info.npeel = 0;
2038 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2040 best_peel = peel_for_unknown_alignment;
2042 peel_for_known_alignment.inside_cost = INT_MAX;
2043 peel_for_known_alignment.outside_cost = INT_MAX;
2044 peel_for_known_alignment.peel_info.count = 0;
2045 peel_for_known_alignment.peel_info.dr_info = NULL;
2047 if (do_peeling && one_misalignment_known)
2049 /* Peeling is possible, but there is no data access that is not supported
2050 unless aligned. So we try to choose the best possible peeling from
2051 the hash table. */
2052 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2053 (&peeling_htab, loop_vinfo);
2056 /* Compare costs of peeling for known and unknown alignment. */
2057 if (peel_for_known_alignment.peel_info.dr_info != NULL
2058 && peel_for_unknown_alignment.inside_cost
2059 >= peel_for_known_alignment.inside_cost)
2061 best_peel = peel_for_known_alignment;
2063 /* If the best peeling for known alignment has NPEEL == 0, perform no
2064 peeling at all except if there is an unsupportable dr that we can
2065 align. */
2066 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2067 do_peeling = false;
2070 /* If there is an unsupportable data ref, prefer this over all choices so far
2071 since we'd have to discard a chosen peeling except when it accidentally
2072 aligned the unsupportable data ref. */
2073 if (one_dr_unsupportable)
2074 dr0_info = unsupportable_dr_info;
2075 else if (do_peeling)
2077 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2078 TODO: Use nopeel_outside_cost or get rid of it? */
2079 unsigned nopeel_inside_cost = 0;
2080 unsigned nopeel_outside_cost = 0;
2082 stmt_vector_for_cost dummy;
2083 dummy.create (2);
2084 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2085 &nopeel_outside_cost, &dummy, &dummy,
2086 0, false);
2087 dummy.release ();
2089 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2090 costs will be recorded. */
2091 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2092 prologue_cost_vec.create (2);
2093 epilogue_cost_vec.create (2);
2095 int dummy2;
2096 nopeel_outside_cost += vect_get_known_peeling_cost
2097 (loop_vinfo, 0, &dummy2,
2098 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2099 &prologue_cost_vec, &epilogue_cost_vec);
2101 prologue_cost_vec.release ();
2102 epilogue_cost_vec.release ();
2104 npeel = best_peel.peel_info.npeel;
2105 dr0_info = best_peel.peel_info.dr_info;
2107 /* If no peeling is not more expensive than the best peeling we
2108 have so far, don't perform any peeling. */
2109 if (nopeel_inside_cost <= best_peel.inside_cost)
2110 do_peeling = false;
2113 if (do_peeling)
2115 stmt_vec_info stmt_info = dr0_info->stmt;
2116 if (known_alignment_for_access_p (dr0_info))
2118 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2119 size_zero_node) < 0;
2120 if (!npeel)
2122 /* Since it's known at compile time, compute the number of
2123 iterations in the peeled loop (the peeling factor) for use in
2124 updating DR_MISALIGNMENT values. The peeling factor is the
2125 vectorization factor minus the misalignment as an element
2126 count. */
2127 mis = (negative
2128 ? DR_MISALIGNMENT (dr0_info)
2129 : -DR_MISALIGNMENT (dr0_info));
2130 /* If known_alignment_for_access_p then we have set
2131 DR_MISALIGNMENT which is only done if we know it at compile
2132 time, so it is safe to assume target alignment is constant.
2134 unsigned int target_align =
2135 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2136 npeel = ((mis & (target_align - 1))
2137 / vect_get_scalar_dr_size (dr0_info));
2140 /* For interleaved data access every iteration accesses all the
2141 members of the group, therefore we divide the number of iterations
2142 by the group size. */
2143 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2144 npeel /= DR_GROUP_SIZE (stmt_info);
2146 if (dump_enabled_p ())
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "Try peeling by %d\n", npeel);
2151 /* Ensure that all datarefs can be vectorized after the peel. */
2152 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2153 do_peeling = false;
2155 /* Check if all datarefs are supportable and log. */
2156 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2157 return opt_result::success ();
2159 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2160 if (do_peeling)
2162 unsigned max_allowed_peel
2163 = param_vect_max_peeling_for_alignment;
2164 if (flag_vect_cost_model <= VECT_COST_MODEL_CHEAP)
2165 max_allowed_peel = 0;
2166 if (max_allowed_peel != (unsigned)-1)
2168 unsigned max_peel = npeel;
2169 if (max_peel == 0)
2171 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2172 unsigned HOST_WIDE_INT target_align_c;
2173 if (target_align.is_constant (&target_align_c))
2174 max_peel =
2175 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2176 else
2178 do_peeling = false;
2179 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_NOTE, vect_location,
2181 "Disable peeling, max peels set and vector"
2182 " alignment unknown\n");
2185 if (max_peel > max_allowed_peel)
2187 do_peeling = false;
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "Disable peeling, max peels reached: %d\n", max_peel);
2195 /* Cost model #2 - if peeling may result in a remaining loop not
2196 iterating enough to be vectorized then do not peel. Since this
2197 is a cost heuristic rather than a correctness decision, use the
2198 most likely runtime value for variable vectorization factors. */
2199 if (do_peeling
2200 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2202 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2203 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2204 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2205 < assumed_vf + max_peel)
2206 do_peeling = false;
2209 if (do_peeling)
2211 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2212 If the misalignment of DR_i is identical to that of dr0 then set
2213 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2214 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2215 by the peeling factor times the element size of DR_i (MOD the
2216 vectorization factor times the size). Otherwise, the
2217 misalignment of DR_i must be set to unknown. */
2218 FOR_EACH_VEC_ELT (datarefs, i, dr)
2219 if (dr != dr0_info->dr)
2221 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2222 if (!vect_relevant_for_alignment_p (dr_info))
2223 continue;
2225 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2228 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2229 if (npeel)
2230 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2231 else
2232 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2233 = DR_MISALIGNMENT (dr0_info);
2234 SET_DR_MISALIGNMENT (dr0_info, 0);
2235 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_NOTE, vect_location,
2238 "Alignment of access forced using peeling.\n");
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "Peeling for alignment will be applied.\n");
2243 /* The inside-loop cost will be accounted for in vectorizable_load
2244 and vectorizable_store correctly with adjusted alignments.
2245 Drop the body_cst_vec on the floor here. */
2246 return opt_result::success ();
2250 /* (2) Versioning to force alignment. */
2252 /* Try versioning if:
2253 1) optimize loop for speed and the cost-model is not cheap
2254 2) there is at least one unsupported misaligned data ref with an unknown
2255 misalignment, and
2256 3) all misaligned data refs with a known misalignment are supported, and
2257 4) the number of runtime alignment checks is within reason. */
2259 do_versioning
2260 = (optimize_loop_nest_for_speed_p (loop)
2261 && !loop->inner /* FORNOW */
2262 && flag_vect_cost_model > VECT_COST_MODEL_CHEAP);
2264 if (do_versioning)
2266 FOR_EACH_VEC_ELT (datarefs, i, dr)
2268 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2269 if (aligned_access_p (dr_info)
2270 || !vect_relevant_for_alignment_p (dr_info))
2271 continue;
2273 stmt_vec_info stmt_info = dr_info->stmt;
2274 if (STMT_VINFO_STRIDED_P (stmt_info))
2276 do_versioning = false;
2277 break;
2280 supportable_dr_alignment
2281 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2283 if (!supportable_dr_alignment)
2285 int mask;
2286 tree vectype;
2288 if (known_alignment_for_access_p (dr_info)
2289 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2290 >= (unsigned) param_vect_max_version_for_alignment_checks)
2292 do_versioning = false;
2293 break;
2296 vectype = STMT_VINFO_VECTYPE (stmt_info);
2297 gcc_assert (vectype);
2299 /* At present we don't support versioning for alignment
2300 with variable VF, since there's no guarantee that the
2301 VF is a power of two. We could relax this if we added
2302 a way of enforcing a power-of-two size. */
2303 unsigned HOST_WIDE_INT size;
2304 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2306 do_versioning = false;
2307 break;
2310 /* Forcing alignment in the first iteration is no good if
2311 we don't keep it across iterations. For now, just disable
2312 versioning in this case.
2313 ?? We could actually unroll the loop to achieve the required
2314 overall step alignment, and forcing the alignment could be
2315 done by doing some iterations of the non-vectorized loop. */
2316 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2317 * DR_STEP_ALIGNMENT (dr),
2318 DR_TARGET_ALIGNMENT (dr_info)))
2320 do_versioning = false;
2321 break;
2324 /* The rightmost bits of an aligned address must be zeros.
2325 Construct the mask needed for this test. For example,
2326 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2327 mask must be 15 = 0xf. */
2328 mask = size - 1;
2330 /* FORNOW: use the same mask to test all potentially unaligned
2331 references in the loop. */
2332 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2333 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2335 do_versioning = false;
2336 break;
2339 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2340 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2344 /* Versioning requires at least one misaligned data reference. */
2345 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2346 do_versioning = false;
2347 else if (!do_versioning)
2348 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2351 if (do_versioning)
2353 vec<stmt_vec_info> may_misalign_stmts
2354 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2355 stmt_vec_info stmt_info;
2357 /* It can now be assumed that the data references in the statements
2358 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2359 of the loop being vectorized. */
2360 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2362 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2363 SET_DR_MISALIGNMENT (dr_info, 0);
2364 if (dump_enabled_p ())
2365 dump_printf_loc (MSG_NOTE, vect_location,
2366 "Alignment of access forced using versioning.\n");
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "Versioning for alignment will be applied.\n");
2373 /* Peeling and versioning can't be done together at this time. */
2374 gcc_assert (! (do_peeling && do_versioning));
2376 return opt_result::success ();
2379 /* This point is reached if neither peeling nor versioning is being done. */
2380 gcc_assert (! (do_peeling || do_versioning));
2382 return opt_result::success ();
2386 /* Function vect_analyze_data_refs_alignment
2388 Analyze the alignment of the data-references in the loop.
2389 Return FALSE if a data reference is found that cannot be vectorized. */
2391 opt_result
2392 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2394 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2396 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2397 struct data_reference *dr;
2398 unsigned int i;
2400 vect_record_base_alignments (loop_vinfo);
2401 FOR_EACH_VEC_ELT (datarefs, i, dr)
2403 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2404 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2405 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2408 return opt_result::success ();
2412 /* Analyze alignment of DRs of stmts in NODE. */
2414 static bool
2415 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2417 /* We vectorize from the first scalar stmt in the node unless
2418 the node is permuted in which case we start from the first
2419 element in the group. */
2420 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2421 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2422 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2423 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2425 /* We need to commit to a vector type for the group now. */
2426 if (is_a <bb_vec_info> (vinfo)
2427 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2431 "desired vector type conflicts with earlier one "
2432 "for %G", first_stmt_info->stmt);
2433 return false;
2436 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2437 vect_compute_data_ref_alignment (vinfo, dr_info);
2438 /* In several places we need alignment of the first element anyway. */
2439 if (dr_info != first_dr_info)
2440 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2442 /* For creating the data-ref pointer we need alignment of the
2443 first element as well. */
2444 first_stmt_info
2445 = vect_stmt_to_vectorize (vect_find_first_scalar_stmt_in_slp (node));
2446 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2448 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2449 if (dr_info != first_dr_info)
2450 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2453 return true;
2456 /* Function vect_slp_analyze_instance_alignment
2458 Analyze the alignment of the data-references in the SLP instance.
2459 Return FALSE if a data reference is found that cannot be vectorized. */
2461 bool
2462 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2463 slp_instance instance)
2465 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2467 slp_tree node;
2468 unsigned i;
2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2470 if (! vect_slp_analyze_node_alignment (vinfo, node))
2471 return false;
2473 node = SLP_INSTANCE_TREE (instance);
2474 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2475 && ! vect_slp_analyze_node_alignment
2476 (vinfo, SLP_INSTANCE_TREE (instance)))
2477 return false;
2479 return true;
2483 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2484 accesses of legal size, step, etc. Detect gaps, single element
2485 interleaving, and other special cases. Set grouped access info.
2486 Collect groups of strided stores for further use in SLP analysis.
2487 Worker for vect_analyze_group_access. */
2489 static bool
2490 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2492 data_reference *dr = dr_info->dr;
2493 tree step = DR_STEP (dr);
2494 tree scalar_type = TREE_TYPE (DR_REF (dr));
2495 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2496 stmt_vec_info stmt_info = dr_info->stmt;
2497 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2498 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2499 HOST_WIDE_INT dr_step = -1;
2500 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2501 bool slp_impossible = false;
2503 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2504 size of the interleaving group (including gaps). */
2505 if (tree_fits_shwi_p (step))
2507 dr_step = tree_to_shwi (step);
2508 /* Check that STEP is a multiple of type size. Otherwise there is
2509 a non-element-sized gap at the end of the group which we
2510 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2511 ??? As we can handle non-constant step fine here we should
2512 simply remove uses of DR_GROUP_GAP between the last and first
2513 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2514 simply not include that gap. */
2515 if ((dr_step % type_size) != 0)
2517 if (dump_enabled_p ())
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "Step %T is not a multiple of the element size"
2520 " for %T\n",
2521 step, DR_REF (dr));
2522 return false;
2524 groupsize = absu_hwi (dr_step) / type_size;
2526 else
2527 groupsize = 0;
2529 /* Not consecutive access is possible only if it is a part of interleaving. */
2530 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2532 /* Check if it this DR is a part of interleaving, and is a single
2533 element of the group that is accessed in the loop. */
2535 /* Gaps are supported only for loads. STEP must be a multiple of the type
2536 size. */
2537 if (DR_IS_READ (dr)
2538 && (dr_step % type_size) == 0
2539 && groupsize > 0)
2541 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2542 DR_GROUP_SIZE (stmt_info) = groupsize;
2543 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location,
2546 "Detected single element interleaving %T"
2547 " step %T\n",
2548 DR_REF (dr), step);
2550 return true;
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "not consecutive access %G", stmt_info->stmt);
2557 if (bb_vinfo)
2559 /* Mark the statement as unvectorizable. */
2560 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2561 return true;
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2566 STMT_VINFO_STRIDED_P (stmt_info) = true;
2567 return true;
2570 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2572 /* First stmt in the interleaving chain. Check the chain. */
2573 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2574 struct data_reference *data_ref = dr;
2575 unsigned int count = 1;
2576 tree prev_init = DR_INIT (data_ref);
2577 HOST_WIDE_INT diff, gaps = 0;
2579 /* By construction, all group members have INTEGER_CST DR_INITs. */
2580 while (next)
2582 /* We never have the same DR multiple times. */
2583 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2584 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2586 data_ref = STMT_VINFO_DATA_REF (next);
2588 /* All group members have the same STEP by construction. */
2589 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2591 /* Check that the distance between two accesses is equal to the type
2592 size. Otherwise, we have gaps. */
2593 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2594 - TREE_INT_CST_LOW (prev_init)) / type_size;
2595 if (diff != 1)
2597 /* FORNOW: SLP of accesses with gaps is not supported. */
2598 slp_impossible = true;
2599 if (DR_IS_WRITE (data_ref))
2601 if (dump_enabled_p ())
2602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603 "interleaved store with gaps\n");
2604 return false;
2607 gaps += diff - 1;
2610 last_accessed_element += diff;
2612 /* Store the gap from the previous member of the group. If there is no
2613 gap in the access, DR_GROUP_GAP is always 1. */
2614 DR_GROUP_GAP (next) = diff;
2616 prev_init = DR_INIT (data_ref);
2617 next = DR_GROUP_NEXT_ELEMENT (next);
2618 /* Count the number of data-refs in the chain. */
2619 count++;
2622 if (groupsize == 0)
2623 groupsize = count + gaps;
2625 /* This could be UINT_MAX but as we are generating code in a very
2626 inefficient way we have to cap earlier. See PR78699 for example. */
2627 if (groupsize > 4096)
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "group is too large\n");
2632 return false;
2635 /* Check that the size of the interleaving is equal to count for stores,
2636 i.e., that there are no gaps. */
2637 if (groupsize != count
2638 && !DR_IS_READ (dr))
2640 groupsize = count;
2641 STMT_VINFO_STRIDED_P (stmt_info) = true;
2644 /* If there is a gap after the last load in the group it is the
2645 difference between the groupsize and the last accessed
2646 element.
2647 When there is no gap, this difference should be 0. */
2648 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2650 DR_GROUP_SIZE (stmt_info) = groupsize;
2651 if (dump_enabled_p ())
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Detected interleaving ");
2655 if (DR_IS_READ (dr))
2656 dump_printf (MSG_NOTE, "load ");
2657 else if (STMT_VINFO_STRIDED_P (stmt_info))
2658 dump_printf (MSG_NOTE, "strided store ");
2659 else
2660 dump_printf (MSG_NOTE, "store ");
2661 dump_printf (MSG_NOTE, "of size %u\n",
2662 (unsigned)groupsize);
2663 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2664 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2665 while (next)
2667 if (DR_GROUP_GAP (next) != 1)
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "\t<gap of %d elements>\n",
2670 DR_GROUP_GAP (next) - 1);
2671 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2672 next = DR_GROUP_NEXT_ELEMENT (next);
2674 if (DR_GROUP_GAP (stmt_info) != 0)
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "\t<gap of %d elements>\n",
2677 DR_GROUP_GAP (stmt_info));
2680 /* SLP: create an SLP data structure for every interleaving group of
2681 stores for further analysis in vect_analyse_slp. */
2682 if (DR_IS_WRITE (dr) && !slp_impossible)
2684 if (loop_vinfo)
2685 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2686 if (bb_vinfo)
2687 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2691 return true;
2694 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2695 accesses of legal size, step, etc. Detect gaps, single element
2696 interleaving, and other special cases. Set grouped access info.
2697 Collect groups of strided stores for further use in SLP analysis. */
2699 static bool
2700 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2702 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2704 /* Dissolve the group if present. */
2705 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2706 while (stmt_info)
2708 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2709 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2710 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2711 stmt_info = next;
2713 return false;
2715 return true;
2718 /* Analyze the access pattern of the data-reference DR_INFO.
2719 In case of non-consecutive accesses call vect_analyze_group_access() to
2720 analyze groups of accesses. */
2722 static bool
2723 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2725 data_reference *dr = dr_info->dr;
2726 tree step = DR_STEP (dr);
2727 tree scalar_type = TREE_TYPE (DR_REF (dr));
2728 stmt_vec_info stmt_info = dr_info->stmt;
2729 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2730 class loop *loop = NULL;
2732 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2733 return true;
2735 if (loop_vinfo)
2736 loop = LOOP_VINFO_LOOP (loop_vinfo);
2738 if (loop_vinfo && !step)
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742 "bad data-ref access in loop\n");
2743 return false;
2746 /* Allow loads with zero step in inner-loop vectorization. */
2747 if (loop_vinfo && integer_zerop (step))
2749 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2750 if (!nested_in_vect_loop_p (loop, stmt_info))
2751 return DR_IS_READ (dr);
2752 /* Allow references with zero step for outer loops marked
2753 with pragma omp simd only - it guarantees absence of
2754 loop-carried dependencies between inner loop iterations. */
2755 if (loop->safelen < 2)
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "zero step in inner loop of nest\n");
2760 return false;
2764 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2766 /* Interleaved accesses are not yet supported within outer-loop
2767 vectorization for references in the inner-loop. */
2768 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2770 /* For the rest of the analysis we use the outer-loop step. */
2771 step = STMT_VINFO_DR_STEP (stmt_info);
2772 if (integer_zerop (step))
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_NOTE, vect_location,
2776 "zero step in outer loop.\n");
2777 return DR_IS_READ (dr);
2781 /* Consecutive? */
2782 if (TREE_CODE (step) == INTEGER_CST)
2784 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2785 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2786 || (dr_step < 0
2787 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2789 /* Mark that it is not interleaving. */
2790 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2791 return true;
2795 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2797 if (dump_enabled_p ())
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "grouped access in outer loop.\n");
2800 return false;
2804 /* Assume this is a DR handled by non-constant strided load case. */
2805 if (TREE_CODE (step) != INTEGER_CST)
2806 return (STMT_VINFO_STRIDED_P (stmt_info)
2807 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2808 || vect_analyze_group_access (vinfo, dr_info)));
2810 /* Not consecutive access - check if it's a part of interleaving group. */
2811 return vect_analyze_group_access (vinfo, dr_info);
2814 typedef std::pair<data_reference_p, int> data_ref_pair;
2816 /* Compare two data-references DRA and DRB to group them into chunks
2817 suitable for grouping. */
2819 static int
2820 dr_group_sort_cmp (const void *dra_, const void *drb_)
2822 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2823 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2824 data_reference_p dra = dra_pair.first;
2825 data_reference_p drb = drb_pair.first;
2826 int cmp;
2828 /* Stabilize sort. */
2829 if (dra == drb)
2830 return 0;
2832 /* DRs in different basic-blocks never belong to the same group. */
2833 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2834 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2835 if (bb_index1 != bb_index2)
2836 return bb_index1 < bb_index2 ? -1 : 1;
2838 /* Different group IDs lead never belong to the same group. */
2839 if (dra_pair.second != drb_pair.second)
2840 return dra_pair.second < drb_pair.second ? -1 : 1;
2842 /* Ordering of DRs according to base. */
2843 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2844 DR_BASE_ADDRESS (drb));
2845 if (cmp != 0)
2846 return cmp;
2848 /* And according to DR_OFFSET. */
2849 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2850 if (cmp != 0)
2851 return cmp;
2853 /* Put reads before writes. */
2854 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2855 return DR_IS_READ (dra) ? -1 : 1;
2857 /* Then sort after access size. */
2858 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2859 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2860 if (cmp != 0)
2861 return cmp;
2863 /* And after step. */
2864 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2865 if (cmp != 0)
2866 return cmp;
2868 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2869 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2870 if (cmp == 0)
2871 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2872 return cmp;
2875 /* If OP is the result of a conversion, return the unconverted value,
2876 otherwise return null. */
2878 static tree
2879 strip_conversion (tree op)
2881 if (TREE_CODE (op) != SSA_NAME)
2882 return NULL_TREE;
2883 gimple *stmt = SSA_NAME_DEF_STMT (op);
2884 if (!is_gimple_assign (stmt)
2885 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2886 return NULL_TREE;
2887 return gimple_assign_rhs1 (stmt);
2890 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2891 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2892 be grouped in SLP mode. */
2894 static bool
2895 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2896 bool allow_slp_p)
2898 if (gimple_assign_single_p (stmt1_info->stmt))
2899 return gimple_assign_single_p (stmt2_info->stmt);
2901 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2902 if (call1 && gimple_call_internal_p (call1))
2904 /* Check for two masked loads or two masked stores. */
2905 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2906 if (!call2 || !gimple_call_internal_p (call2))
2907 return false;
2908 internal_fn ifn = gimple_call_internal_fn (call1);
2909 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2910 return false;
2911 if (ifn != gimple_call_internal_fn (call2))
2912 return false;
2914 /* Check that the masks are the same. Cope with casts of masks,
2915 like those created by build_mask_conversion. */
2916 tree mask1 = gimple_call_arg (call1, 2);
2917 tree mask2 = gimple_call_arg (call2, 2);
2918 if (!operand_equal_p (mask1, mask2, 0)
2919 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2921 mask1 = strip_conversion (mask1);
2922 if (!mask1)
2923 return false;
2924 mask2 = strip_conversion (mask2);
2925 if (!mask2)
2926 return false;
2927 if (!operand_equal_p (mask1, mask2, 0))
2928 return false;
2930 return true;
2933 return false;
2936 /* Function vect_analyze_data_ref_accesses.
2938 Analyze the access pattern of all the data references in the loop.
2940 FORNOW: the only access pattern that is considered vectorizable is a
2941 simple step 1 (consecutive) access.
2943 FORNOW: handle only arrays and pointer accesses. */
2945 opt_result
2946 vect_analyze_data_ref_accesses (vec_info *vinfo,
2947 vec<int> *dataref_groups)
2949 unsigned int i;
2950 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2952 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2954 if (datarefs.is_empty ())
2955 return opt_result::success ();
2957 /* Sort the array of datarefs to make building the interleaving chains
2958 linear. Don't modify the original vector's order, it is needed for
2959 determining what dependencies are reversed. */
2960 vec<data_ref_pair> datarefs_copy;
2961 datarefs_copy.create (datarefs.length ());
2962 for (unsigned i = 0; i < datarefs.length (); i++)
2964 int group_id = dataref_groups ? (*dataref_groups)[i] : 0;
2965 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2967 datarefs_copy.qsort (dr_group_sort_cmp);
2968 hash_set<stmt_vec_info> to_fixup;
2970 /* Build the interleaving chains. */
2971 for (i = 0; i < datarefs_copy.length () - 1;)
2973 data_reference_p dra = datarefs_copy[i].first;
2974 int dra_group_id = datarefs_copy[i].second;
2975 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2976 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2977 stmt_vec_info lastinfo = NULL;
2978 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2979 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2981 ++i;
2982 continue;
2984 for (i = i + 1; i < datarefs_copy.length (); ++i)
2986 data_reference_p drb = datarefs_copy[i].first;
2987 int drb_group_id = datarefs_copy[i].second;
2988 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2989 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2990 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2991 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2992 break;
2994 /* ??? Imperfect sorting (non-compatible types, non-modulo
2995 accesses, same accesses) can lead to a group to be artificially
2996 split here as we don't just skip over those. If it really
2997 matters we can push those to a worklist and re-iterate
2998 over them. The we can just skip ahead to the next DR here. */
3000 /* DRs in a different BBs should not be put into the same
3001 interleaving group. */
3002 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
3003 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
3004 if (bb_index1 != bb_index2)
3005 break;
3007 if (dra_group_id != drb_group_id)
3008 break;
3010 /* Check that the data-refs have same first location (except init)
3011 and they are both either store or load (not load and store,
3012 not masked loads or stores). */
3013 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3014 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3015 DR_BASE_ADDRESS (drb)) != 0
3016 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3017 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3018 break;
3020 /* Check that the data-refs have the same constant size. */
3021 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3022 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3023 if (!tree_fits_uhwi_p (sza)
3024 || !tree_fits_uhwi_p (szb)
3025 || !tree_int_cst_equal (sza, szb))
3026 break;
3028 /* Check that the data-refs have the same step. */
3029 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3030 break;
3032 /* Check the types are compatible.
3033 ??? We don't distinguish this during sorting. */
3034 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3035 TREE_TYPE (DR_REF (drb))))
3036 break;
3038 /* Check that the DR_INITs are compile-time constants. */
3039 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3040 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3041 break;
3043 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3044 just hold extra information. */
3045 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3046 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3047 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3048 break;
3050 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3051 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3052 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3053 HOST_WIDE_INT init_prev
3054 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3055 gcc_assert (init_a <= init_b
3056 && init_a <= init_prev
3057 && init_prev <= init_b);
3059 /* Do not place the same access in the interleaving chain twice. */
3060 if (init_b == init_prev)
3062 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3063 < gimple_uid (DR_STMT (drb)));
3064 /* Simply link in duplicates and fix up the chain below. */
3066 else
3068 /* If init_b == init_a + the size of the type * k, we have an
3069 interleaving, and DRA is accessed before DRB. */
3070 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3071 if (type_size_a == 0
3072 || (init_b - init_a) % type_size_a != 0)
3073 break;
3075 /* If we have a store, the accesses are adjacent. This splits
3076 groups into chunks we support (we don't support vectorization
3077 of stores with gaps). */
3078 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3079 break;
3081 /* If the step (if not zero or non-constant) is smaller than the
3082 difference between data-refs' inits this splits groups into
3083 suitable sizes. */
3084 if (tree_fits_shwi_p (DR_STEP (dra)))
3086 unsigned HOST_WIDE_INT step
3087 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3088 if (step != 0
3089 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3090 break;
3094 if (dump_enabled_p ())
3095 dump_printf_loc (MSG_NOTE, vect_location,
3096 DR_IS_READ (dra)
3097 ? "Detected interleaving load %T and %T\n"
3098 : "Detected interleaving store %T and %T\n",
3099 DR_REF (dra), DR_REF (drb));
3101 /* Link the found element into the group list. */
3102 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3104 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3105 lastinfo = stmtinfo_a;
3107 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3108 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3109 lastinfo = stmtinfo_b;
3111 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3112 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3114 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3115 dump_printf_loc (MSG_NOTE, vect_location,
3116 "Load suitable for SLP vectorization only.\n");
3118 if (init_b == init_prev
3119 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3120 && dump_enabled_p ())
3121 dump_printf_loc (MSG_NOTE, vect_location,
3122 "Queuing group with duplicate access for fixup\n");
3126 /* Fixup groups with duplicate entries by splitting it. */
3127 while (1)
3129 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3130 if (!(it != to_fixup.end ()))
3131 break;
3132 stmt_vec_info grp = *it;
3133 to_fixup.remove (grp);
3135 /* Find the earliest duplicate group member. */
3136 unsigned first_duplicate = -1u;
3137 stmt_vec_info next, g = grp;
3138 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3140 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3141 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3142 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3143 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3144 g = next;
3146 if (first_duplicate == -1U)
3147 continue;
3149 /* Then move all stmts after the first duplicate to a new group.
3150 Note this is a heuristic but one with the property that *it
3151 is fixed up completely. */
3152 g = grp;
3153 stmt_vec_info newgroup = NULL, ng = grp;
3154 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3156 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3158 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3159 if (!newgroup)
3160 newgroup = next;
3161 else
3162 DR_GROUP_NEXT_ELEMENT (ng) = next;
3163 ng = next;
3164 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3166 else
3167 g = DR_GROUP_NEXT_ELEMENT (g);
3169 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3171 /* Fixup the new group which still may contain duplicates. */
3172 to_fixup.add (newgroup);
3175 data_ref_pair *dr_pair;
3176 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3178 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3179 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3180 && !vect_analyze_data_ref_access (vinfo, dr_info))
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3184 "not vectorized: complicated access pattern.\n");
3186 if (is_a <bb_vec_info> (vinfo))
3188 /* Mark the statement as not vectorizable. */
3189 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3190 continue;
3192 else
3194 datarefs_copy.release ();
3195 return opt_result::failure_at (dr_info->stmt->stmt,
3196 "not vectorized:"
3197 " complicated access pattern.\n");
3202 datarefs_copy.release ();
3203 return opt_result::success ();
3206 /* Function vect_vfa_segment_size.
3208 Input:
3209 DR_INFO: The data reference.
3210 LENGTH_FACTOR: segment length to consider.
3212 Return a value suitable for the dr_with_seg_len::seg_len field.
3213 This is the "distance travelled" by the pointer from the first
3214 iteration in the segment to the last. Note that it does not include
3215 the size of the access; in effect it only describes the first byte. */
3217 static tree
3218 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3220 length_factor = size_binop (MINUS_EXPR,
3221 fold_convert (sizetype, length_factor),
3222 size_one_node);
3223 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3224 length_factor);
3227 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3228 gives the worst-case number of bytes covered by the segment. */
3230 static unsigned HOST_WIDE_INT
3231 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3233 stmt_vec_info stmt_vinfo = dr_info->stmt;
3234 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3235 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3236 unsigned HOST_WIDE_INT access_size = ref_size;
3237 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3239 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3240 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3242 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3243 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3244 == dr_explicit_realign_optimized))
3246 /* We might access a full vector's worth. */
3247 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3248 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3250 return access_size;
3253 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3254 describes. */
3256 static unsigned int
3257 vect_vfa_align (dr_vec_info *dr_info)
3259 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3262 /* Function vect_no_alias_p.
3264 Given data references A and B with equal base and offset, see whether
3265 the alias relation can be decided at compilation time. Return 1 if
3266 it can and the references alias, 0 if it can and the references do
3267 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3268 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3269 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3271 static int
3272 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3273 tree segment_length_a, tree segment_length_b,
3274 unsigned HOST_WIDE_INT access_size_a,
3275 unsigned HOST_WIDE_INT access_size_b)
3277 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3278 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3279 poly_uint64 const_length_a;
3280 poly_uint64 const_length_b;
3282 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3283 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3284 [a, a+12) */
3285 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3287 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3288 offset_a -= const_length_a;
3290 else
3291 const_length_a = tree_to_poly_uint64 (segment_length_a);
3292 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3294 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3295 offset_b -= const_length_b;
3297 else
3298 const_length_b = tree_to_poly_uint64 (segment_length_b);
3300 const_length_a += access_size_a;
3301 const_length_b += access_size_b;
3303 if (ranges_known_overlap_p (offset_a, const_length_a,
3304 offset_b, const_length_b))
3305 return 1;
3307 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3308 offset_b, const_length_b))
3309 return 0;
3311 return -1;
3314 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3315 in DDR is >= VF. */
3317 static bool
3318 dependence_distance_ge_vf (data_dependence_relation *ddr,
3319 unsigned int loop_depth, poly_uint64 vf)
3321 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3322 || DDR_NUM_DIST_VECTS (ddr) == 0)
3323 return false;
3325 /* If the dependence is exact, we should have limited the VF instead. */
3326 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3328 unsigned int i;
3329 lambda_vector dist_v;
3330 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3332 HOST_WIDE_INT dist = dist_v[loop_depth];
3333 if (dist != 0
3334 && !(dist > 0 && DDR_REVERSED_P (ddr))
3335 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3336 return false;
3339 if (dump_enabled_p ())
3340 dump_printf_loc (MSG_NOTE, vect_location,
3341 "dependence distance between %T and %T is >= VF\n",
3342 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3344 return true;
3347 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3349 static void
3350 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3352 dump_printf (dump_kind, "%s (%T) >= ",
3353 lower_bound.unsigned_p ? "unsigned" : "abs",
3354 lower_bound.expr);
3355 dump_dec (dump_kind, lower_bound.min_value);
3358 /* Record that the vectorized loop requires the vec_lower_bound described
3359 by EXPR, UNSIGNED_P and MIN_VALUE. */
3361 static void
3362 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3363 poly_uint64 min_value)
3365 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3366 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3367 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3369 unsigned_p &= lower_bounds[i].unsigned_p;
3370 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3371 if (lower_bounds[i].unsigned_p != unsigned_p
3372 || maybe_lt (lower_bounds[i].min_value, min_value))
3374 lower_bounds[i].unsigned_p = unsigned_p;
3375 lower_bounds[i].min_value = min_value;
3376 if (dump_enabled_p ())
3378 dump_printf_loc (MSG_NOTE, vect_location,
3379 "updating run-time check to ");
3380 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3381 dump_printf (MSG_NOTE, "\n");
3384 return;
3387 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3388 if (dump_enabled_p ())
3390 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3391 dump_lower_bound (MSG_NOTE, lower_bound);
3392 dump_printf (MSG_NOTE, "\n");
3394 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3397 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3398 will span fewer than GAP bytes. */
3400 static bool
3401 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3402 poly_int64 gap)
3404 stmt_vec_info stmt_info = dr_info->stmt;
3405 HOST_WIDE_INT count
3406 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3407 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3408 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3409 return (estimated_poly_value (gap)
3410 <= count * vect_get_scalar_dr_size (dr_info));
3413 /* Return true if we know that there is no alias between DR_INFO_A and
3414 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3415 When returning true, set *LOWER_BOUND_OUT to this N. */
3417 static bool
3418 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3419 poly_uint64 *lower_bound_out)
3421 /* Check that there is a constant gap of known sign between DR_A
3422 and DR_B. */
3423 data_reference *dr_a = dr_info_a->dr;
3424 data_reference *dr_b = dr_info_b->dr;
3425 poly_int64 init_a, init_b;
3426 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3427 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3428 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3429 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3430 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3431 || !ordered_p (init_a, init_b))
3432 return false;
3434 /* Sort DR_A and DR_B by the address they access. */
3435 if (maybe_lt (init_b, init_a))
3437 std::swap (init_a, init_b);
3438 std::swap (dr_info_a, dr_info_b);
3439 std::swap (dr_a, dr_b);
3442 /* If the two accesses could be dependent within a scalar iteration,
3443 make sure that we'd retain their order. */
3444 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3445 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3446 return false;
3448 /* There is no alias if abs (DR_STEP) is greater than or equal to
3449 the bytes spanned by the combination of the two accesses. */
3450 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3451 return true;
3454 /* Function vect_prune_runtime_alias_test_list.
3456 Prune a list of ddrs to be tested at run-time by versioning for alias.
3457 Merge several alias checks into one if possible.
   Return a failure result if two references are found to alias at
   compile time, if any run-time check would remain under the very-cheap
   cost model, or if the number of remaining checks is larger than the
   limit derived from --param vect-max-version-for-alias-checks;
   otherwise return success.  */
3461 opt_result
3462 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3464 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3465 hash_set <tree_pair_hash> compared_objects;
3467 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3468 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3469 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3470 vec<vec_object_pair> &check_unequal_addrs
3471 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3472 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3473 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3475 ddr_p ddr;
3476 unsigned int i;
3477 tree length_factor;
3479 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3481 /* Step values are irrelevant for aliasing if the number of vector
3482 iterations is equal to the number of scalar iterations (which can
3483 happen for fully-SLP loops). */
3484 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3486 if (!ignore_step_p)
3488 /* Convert the checks for nonzero steps into bound tests. */
3489 tree value;
3490 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3491 vect_check_lower_bound (loop_vinfo, value, true, 1);
3494 if (may_alias_ddrs.is_empty ())
3495 return opt_result::success ();
3497 comp_alias_ddrs.create (may_alias_ddrs.length ());
3499 unsigned int loop_depth
3500 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3501 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3503 /* First, we collect all data ref pairs for aliasing checks. */
3504 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3506 poly_uint64 lower_bound;
3507 tree segment_length_a, segment_length_b;
3508 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3509 unsigned int align_a, align_b;
3511 /* Ignore the alias if the VF we chose ended up being no greater
3512 than the dependence distance. */
3513 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3514 continue;
/* A DDR that carries object operands is handled by an address
   inequality test on the two objects rather than by a segment overlap
   test; COMPARED_OBJECTS makes sure each object pair is queued once.  */
3516 if (DDR_OBJECT_A (ddr))
3518 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3519 if (!compared_objects.add (new_pair))
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 "checking that %T and %T"
3524 " have different addresses\n",
3525 new_pair.first, new_pair.second);
3526 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3528 continue;
3531 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3532 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3534 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3535 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3537 bool preserves_scalar_order_p
3538 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3540 /* Skip the pair if inter-iteration dependencies are irrelevant
3541 and intra-iteration dependencies are guaranteed to be honored. */
3542 if (ignore_step_p
3543 && (preserves_scalar_order_p
3544 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3545 &lower_bound)))
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "no need for alias check between "
3550 "%T and %T when VF is 1\n",
3551 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3552 continue;
3555 /* See whether we can handle the alias using a bounds check on
3556 the step, and whether that's likely to be the best approach.
3557 (It might not be, for example, if the minimum step is much larger
3558 than the number of bytes handled by one vector iteration.) */
3559 if (!ignore_step_p
3560 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3561 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3562 &lower_bound)
3563 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3564 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3566 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3567 if (dump_enabled_p ())
3569 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3570 "%T and %T when the step %T is outside ",
3571 DR_REF (dr_info_a->dr),
3572 DR_REF (dr_info_b->dr),
3573 DR_STEP (dr_info_a->dr));
3574 if (unsigned_p)
3575 dump_printf (MSG_NOTE, "[0");
3576 else
3578 dump_printf (MSG_NOTE, "(");
3579 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3581 dump_printf (MSG_NOTE, ", ");
3582 dump_dec (MSG_NOTE, lower_bound);
3583 dump_printf (MSG_NOTE, ")\n");
3585 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3586 unsigned_p, lower_bound);
3587 continue;
/* From here on describe a grouped access by the first statement of
   its group.  */
3590 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3591 if (dr_group_first_a)
3593 stmt_info_a = dr_group_first_a;
3594 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3597 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3598 if (dr_group_first_b)
3600 stmt_info_b = dr_group_first_b;
3601 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
/* When steps are ignored the segments have zero length.  Otherwise
   the segment covers VECT_FACTOR iterations if both steps are equal
   and all scalar iterations if they differ.  */
3604 if (ignore_step_p)
3606 segment_length_a = size_zero_node;
3607 segment_length_b = size_zero_node;
3609 else
3611 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3612 DR_STEP (dr_info_b->dr), 0))
3613 length_factor = scalar_loop_iters;
3614 else
3615 length_factor = size_int (vect_factor);
3616 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3617 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3619 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3620 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3621 align_a = vect_vfa_align (dr_info_a);
3622 align_b = vect_vfa_align (dr_info_b);
3624 /* See whether the alias is known at compilation time. */
3625 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3626 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3627 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3628 DR_OFFSET (dr_info_b->dr), 0)
3629 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3630 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3631 && poly_int_tree_p (segment_length_a)
3632 && poly_int_tree_p (segment_length_b))
3634 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3635 segment_length_a,
3636 segment_length_b,
3637 access_size_a,
3638 access_size_b);
3639 if (res >= 0 && dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "can tell at compile time that %T and %T",
3643 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3644 if (res == 0)
3645 dump_printf (MSG_NOTE, " do not alias\n");
3646 else
3647 dump_printf (MSG_NOTE, " alias\n");
/* RES is 0 for a proven non-alias (no check needed), 1 for a proven
   alias (give up), and negative when undecided, in which case we fall
   through and queue a run-time check.  */
3650 if (res == 0)
3651 continue;
3653 if (res == 1)
3654 return opt_result::failure_at (stmt_info_b->stmt,
3655 "not vectorized:"
3656 " compilation time alias: %G%G",
3657 stmt_info_a->stmt,
3658 stmt_info_b->stmt);
3661 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3662 access_size_a, align_a);
3663 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3664 access_size_b, align_b);
3665 /* Canonicalize the order to be the one that's needed for accurate
3666 RAW, WAR and WAW flags, in cases where the data references are
3667 well-ordered. The order doesn't really matter otherwise,
3668 but we might as well be consistent. */
3669 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3670 std::swap (dr_a, dr_b);
3672 dr_with_seg_len_pair_t dr_with_seg_len_pair
3673 (dr_a, dr_b, (preserves_scalar_order_p
3674 ? dr_with_seg_len_pair_t::WELL_ORDERED
3675 : dr_with_seg_len_pair_t::REORDERED));
3677 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3680 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
/* The number of run-time tests is the segment overlap checks that
   remain plus the address inequality checks.  */
3682 unsigned int count = (comp_alias_ddrs.length ()
3683 + check_unequal_addrs.length ());
3685 if (count && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
3686 return opt_result::failure_at
3687 (vect_location, "would need a runtime alias check\n");
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "improved number of alias checks from %d to %d\n",
3692 may_alias_ddrs.length (), count);
/* The cheap simd cost model only allows 6/10 of the configured
   maximum number of checks.  */
3693 unsigned limit = param_vect_max_version_for_alias_checks;
3694 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3695 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3696 if (count > limit)
3697 return opt_result::failure_at
3698 (vect_location,
3699 "number of versioning for alias run-time tests exceeds %d "
3700 "(--param vect-max-version-for-alias-checks)\n", limit);
3702 return opt_result::success ();
3705 /* Check whether we can use an internal function for a gather load
3706 or scatter store. READ_P is true for loads and false for stores.
3707 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3708 the type of the memory elements being loaded or stored. OFFSET_TYPE
3709 is the type of the offset that is being applied to the invariant
3710 base address. SCALE is the amount by which the offset should
3711 be multiplied *after* it has been converted to address width.
3713 Return true if the function is supported, storing the function id in
3714 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3716 bool
3717 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3718 tree vectype, tree memory_type, tree offset_type,
3719 int scale, internal_fn *ifn_out,
3720 tree *offset_vectype_out)
3722 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3723 unsigned int element_bits = vector_element_bits (vectype);
3724 if (element_bits != memory_bits)
3725 /* For now the vector elements must be the same width as the
3726 memory elements. */
3727 return false;
3729 /* Work out which function we need. */
3730 internal_fn ifn;
3731 if (read_p)
3732 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3733 else
3734 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3736 for (;;)
3738 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3739 if (!offset_vectype)
3740 return false;
3742 /* Test whether the target supports this combination. */
3743 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3744 offset_vectype, scale))
3746 *ifn_out = ifn;
3747 *offset_vectype_out = offset_vectype;
3748 return true;
3751 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3752 && TYPE_PRECISION (offset_type) >= element_bits)
3753 return false;
3755 offset_type = build_nonstandard_integer_type
3756 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3760 /* STMT_INFO is a call to an internal gather load or scatter store function.
3761 Describe the operation in INFO. */
3763 static void
3764 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3765 gather_scatter_info *info)
3767 gcall *call = as_a <gcall *> (stmt_info->stmt);
3768 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3769 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3771 info->ifn = gimple_call_internal_fn (call);
3772 info->decl = NULL_TREE;
3773 info->base = gimple_call_arg (call, 0);
3774 info->offset = gimple_call_arg (call, 1);
3775 info->offset_dt = vect_unknown_def_type;
3776 info->offset_vectype = NULL_TREE;
3777 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3778 info->element_type = TREE_TYPE (vectype);
3779 info->memory_type = TREE_TYPE (DR_REF (dr));
3782 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3783 gather load or scatter store. Describe the operation in *INFO if so. */
3785 bool
3786 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3787 gather_scatter_info *info)
3789 HOST_WIDE_INT scale = 1;
3790 poly_int64 pbitpos, pbitsize;
3791 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3792 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3793 tree offtype = NULL_TREE;
3794 tree decl = NULL_TREE, base, off;
3795 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3796 tree memory_type = TREE_TYPE (DR_REF (dr));
3797 machine_mode pmode;
3798 int punsignedp, reversep, pvolatilep = 0;
3799 internal_fn ifn;
3800 tree offset_vectype;
3801 bool masked_p = false;
3803 /* See whether this is already a call to a gather/scatter internal function.
3804 If not, see whether it's a masked load or store. */
3805 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3806 if (call && gimple_call_internal_p (call))
3808 ifn = gimple_call_internal_fn (call);
3809 if (internal_gather_scatter_fn_p (ifn))
3811 vect_describe_gather_scatter_call (stmt_info, info);
3812 return true;
3814 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3817 /* True if we should aim to use internal functions rather than
3818 built-in functions. */
3819 bool use_ifn_p = (DR_IS_READ (dr)
3820 ? supports_vec_gather_load_p ()
3821 : supports_vec_scatter_store_p ());
3823 base = DR_REF (dr);
3824 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3825 see if we can use the def stmt of the address. */
3826 if (masked_p
3827 && TREE_CODE (base) == MEM_REF
3828 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3829 && integer_zerop (TREE_OPERAND (base, 1))
3830 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3832 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3833 if (is_gimple_assign (def_stmt)
3834 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3835 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3838 /* The gather and scatter builtins need address of the form
3839 loop_invariant + vector * {1, 2, 4, 8}
3840 or
3841 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3842 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3843 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3844 multiplications and additions in it. To get a vector, we need
3845 a single SSA_NAME that will be defined in the loop and will
3846 contain everything that is not loop invariant and that can be
3847 vectorized. The following code attempts to find such a preexisting
3848 SSA_NAME OFF and put the loop invariants into a tree BASE
3849 that can be gimplified before the loop. */
3850 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3851 &punsignedp, &reversep, &pvolatilep);
3852 if (reversep)
3853 return false;
3855 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3857 if (TREE_CODE (base) == MEM_REF)
3859 if (!integer_zerop (TREE_OPERAND (base, 1)))
3861 if (off == NULL_TREE)
3862 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3863 else
3864 off = size_binop (PLUS_EXPR, off,
3865 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3867 base = TREE_OPERAND (base, 0);
3869 else
3870 base = build_fold_addr_expr (base);
3872 if (off == NULL_TREE)
3873 off = size_zero_node;
3875 /* If base is not loop invariant, either off is 0, then we start with just
3876 the constant offset in the loop invariant BASE and continue with base
3877 as OFF, otherwise give up.
3878 We could handle that case by gimplifying the addition of base + off
3879 into some SSA_NAME and use that as off, but for now punt. */
3880 if (!expr_invariant_in_loop_p (loop, base))
3882 if (!integer_zerop (off))
3883 return false;
3884 off = base;
3885 base = size_int (pbytepos);
3887 /* Otherwise put base + constant offset into the loop invariant BASE
3888 and continue with OFF. */
3889 else
3891 base = fold_convert (sizetype, base);
3892 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3895 /* OFF at this point may be either a SSA_NAME or some tree expression
3896 from get_inner_reference. Try to peel off loop invariants from it
3897 into BASE as long as possible. */
3898 STRIP_NOPS (off);
3899 while (offtype == NULL_TREE)
3901 enum tree_code code;
3902 tree op0, op1, add = NULL_TREE;
3904 if (TREE_CODE (off) == SSA_NAME)
3906 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3908 if (expr_invariant_in_loop_p (loop, off))
3909 return false;
3911 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3912 break;
3914 op0 = gimple_assign_rhs1 (def_stmt);
3915 code = gimple_assign_rhs_code (def_stmt);
3916 op1 = gimple_assign_rhs2 (def_stmt);
3918 else
3920 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3921 return false;
3922 code = TREE_CODE (off);
3923 extract_ops_from_tree (off, &code, &op0, &op1);
3925 switch (code)
3927 case POINTER_PLUS_EXPR:
3928 case PLUS_EXPR:
3929 if (expr_invariant_in_loop_p (loop, op0))
3931 add = op0;
3932 off = op1;
3933 do_add:
3934 add = fold_convert (sizetype, add);
3935 if (scale != 1)
3936 add = size_binop (MULT_EXPR, add, size_int (scale));
3937 base = size_binop (PLUS_EXPR, base, add);
3938 continue;
3940 if (expr_invariant_in_loop_p (loop, op1))
3942 add = op1;
3943 off = op0;
3944 goto do_add;
3946 break;
3947 case MINUS_EXPR:
3948 if (expr_invariant_in_loop_p (loop, op1))
3950 add = fold_convert (sizetype, op1);
3951 add = size_binop (MINUS_EXPR, size_zero_node, add);
3952 off = op0;
3953 goto do_add;
3955 break;
3956 case MULT_EXPR:
3957 if (scale == 1 && tree_fits_shwi_p (op1))
3959 int new_scale = tree_to_shwi (op1);
3960 /* Only treat this as a scaling operation if the target
3961 supports it for at least some offset type. */
3962 if (use_ifn_p
3963 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3964 masked_p, vectype, memory_type,
3965 signed_char_type_node,
3966 new_scale, &ifn,
3967 &offset_vectype)
3968 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3969 masked_p, vectype, memory_type,
3970 unsigned_char_type_node,
3971 new_scale, &ifn,
3972 &offset_vectype))
3973 break;
3974 scale = new_scale;
3975 off = op0;
3976 continue;
3978 break;
3979 case SSA_NAME:
3980 off = op0;
3981 continue;
3982 CASE_CONVERT:
3983 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3984 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3985 break;
3987 /* Don't include the conversion if the target is happy with
3988 the current offset type. */
3989 if (use_ifn_p
3990 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3991 masked_p, vectype, memory_type,
3992 TREE_TYPE (off), scale, &ifn,
3993 &offset_vectype))
3994 break;
3996 if (TYPE_PRECISION (TREE_TYPE (op0))
3997 == TYPE_PRECISION (TREE_TYPE (off)))
3999 off = op0;
4000 continue;
4003 if (TYPE_PRECISION (TREE_TYPE (op0))
4004 < TYPE_PRECISION (TREE_TYPE (off)))
4006 off = op0;
4007 offtype = TREE_TYPE (off);
4008 STRIP_NOPS (off);
4009 continue;
4011 break;
4012 default:
4013 break;
4015 break;
4018 /* If at the end OFF still isn't a SSA_NAME or isn't
4019 defined in the loop, punt. */
4020 if (TREE_CODE (off) != SSA_NAME
4021 || expr_invariant_in_loop_p (loop, off))
4022 return false;
4024 if (offtype == NULL_TREE)
4025 offtype = TREE_TYPE (off);
4027 if (use_ifn_p)
4029 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4030 vectype, memory_type, offtype, scale,
4031 &ifn, &offset_vectype))
4032 return false;
4034 else
4036 if (DR_IS_READ (dr))
4038 if (targetm.vectorize.builtin_gather)
4039 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4041 else
4043 if (targetm.vectorize.builtin_scatter)
4044 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4047 if (!decl)
4048 return false;
4050 ifn = IFN_LAST;
4051 /* The offset vector type will be read from DECL when needed. */
4052 offset_vectype = NULL_TREE;
4055 info->ifn = ifn;
4056 info->decl = decl;
4057 info->base = base;
4058 info->offset = off;
4059 info->offset_dt = vect_unknown_def_type;
4060 info->offset_vectype = offset_vectype;
4061 info->scale = scale;
4062 info->element_type = TREE_TYPE (vectype);
4063 info->memory_type = memory_type;
4064 return true;
4067 /* Find the data references in STMT, analyze them with respect to LOOP and
4068 append them to DATAREFS. Return false if datarefs in this stmt cannot
4069 be handled. */
4071 opt_result
4072 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4073 vec<data_reference_p> *datarefs,
4074 vec<int> *dataref_groups, int group_id)
4076 /* We can ignore clobbers for dataref analysis - they are removed during
4077 loop vectorization and BB vectorization checks dependences with a
4078 stmt walk. */
4079 if (gimple_clobber_p (stmt))
4080 return opt_result::success ();
4082 if (gimple_has_volatile_ops (stmt))
4083 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4084 stmt);
4086 if (stmt_can_throw_internal (cfun, stmt))
4087 return opt_result::failure_at (stmt,
4088 "not vectorized:"
4089 " statement can throw an exception: %G",
4090 stmt);
4092 auto_vec<data_reference_p, 2> refs;
4093 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4094 if (!res)
4095 return res;
4097 if (refs.is_empty ())
4098 return opt_result::success ();
4100 if (refs.length () > 1)
4102 while (!refs.is_empty ())
4103 free_data_ref (refs.pop ());
4104 return opt_result::failure_at (stmt,
4105 "not vectorized: more than one "
4106 "data ref in stmt: %G", stmt);
4109 data_reference_p dr = refs.pop ();
4110 if (gcall *call = dyn_cast <gcall *> (stmt))
4111 if (!gimple_call_internal_p (call)
4112 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4113 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4115 free_data_ref (dr);
4116 return opt_result::failure_at (stmt,
4117 "not vectorized: dr in a call %G", stmt);
4120 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4121 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4123 free_data_ref (dr);
4124 return opt_result::failure_at (stmt,
4125 "not vectorized:"
4126 " statement is bitfield access %G", stmt);
4129 if (DR_BASE_ADDRESS (dr)
4130 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4132 free_data_ref (dr);
4133 return opt_result::failure_at (stmt,
4134 "not vectorized:"
4135 " base addr of dr is a constant\n");
4138 /* Check whether this may be a SIMD lane access and adjust the
4139 DR to make it easier for us to handle it. */
4140 if (loop
4141 && loop->simduid
4142 && (!DR_BASE_ADDRESS (dr)
4143 || !DR_OFFSET (dr)
4144 || !DR_INIT (dr)
4145 || !DR_STEP (dr)))
4147 struct data_reference *newdr
4148 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4149 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4150 if (DR_BASE_ADDRESS (newdr)
4151 && DR_OFFSET (newdr)
4152 && DR_INIT (newdr)
4153 && DR_STEP (newdr)
4154 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4155 && integer_zerop (DR_STEP (newdr)))
4157 tree base_address = DR_BASE_ADDRESS (newdr);
4158 tree off = DR_OFFSET (newdr);
4159 tree step = ssize_int (1);
4160 if (integer_zerop (off)
4161 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4163 off = TREE_OPERAND (base_address, 1);
4164 base_address = TREE_OPERAND (base_address, 0);
4166 STRIP_NOPS (off);
4167 if (TREE_CODE (off) == MULT_EXPR
4168 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4170 step = TREE_OPERAND (off, 1);
4171 off = TREE_OPERAND (off, 0);
4172 STRIP_NOPS (off);
4174 if (CONVERT_EXPR_P (off)
4175 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4176 < TYPE_PRECISION (TREE_TYPE (off))))
4177 off = TREE_OPERAND (off, 0);
4178 if (TREE_CODE (off) == SSA_NAME)
4180 gimple *def = SSA_NAME_DEF_STMT (off);
4181 /* Look through widening conversion. */
4182 if (is_gimple_assign (def)
4183 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4185 tree rhs1 = gimple_assign_rhs1 (def);
4186 if (TREE_CODE (rhs1) == SSA_NAME
4187 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4188 && (TYPE_PRECISION (TREE_TYPE (off))
4189 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4190 def = SSA_NAME_DEF_STMT (rhs1);
4192 if (is_gimple_call (def)
4193 && gimple_call_internal_p (def)
4194 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4196 tree arg = gimple_call_arg (def, 0);
4197 tree reft = TREE_TYPE (DR_REF (newdr));
4198 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4199 arg = SSA_NAME_VAR (arg);
4200 if (arg == loop->simduid
4201 /* For now. */
4202 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4204 DR_BASE_ADDRESS (newdr) = base_address;
4205 DR_OFFSET (newdr) = ssize_int (0);
4206 DR_STEP (newdr) = step;
4207 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4208 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4209 /* Mark as simd-lane access. */
4210 tree arg2 = gimple_call_arg (def, 1);
4211 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4212 free_data_ref (dr);
4213 datarefs->safe_push (newdr);
4214 if (dataref_groups)
4215 dataref_groups->safe_push (group_id);
4216 return opt_result::success ();
4221 free_data_ref (newdr);
4224 datarefs->safe_push (dr);
4225 if (dataref_groups)
4226 dataref_groups->safe_push (group_id);
4227 return opt_result::success ();
4230 /* Function vect_analyze_data_refs.
4232 Find all the data references in the loop or basic block.
4234 The general structure of the analysis of data refs in the vectorizer is as
4235 follows:
4236 1- vect_analyze_data_refs(loop/bb): call
4237 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4238 in the loop/bb and their dependences.
4239 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4240 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4241 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4245 opt_result
4246 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4248 class loop *loop = NULL;
4249 unsigned int i;
4250 struct data_reference *dr;
4251 tree scalar_type;
4253 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4255 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4256 loop = LOOP_VINFO_LOOP (loop_vinfo);
4258 /* Go through the data-refs, check that the analysis succeeded. Update
4259 pointer from stmt_vec_info struct to DR and vectype. */
4261 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4262 FOR_EACH_VEC_ELT (datarefs, i, dr)
4264 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4265 poly_uint64 vf;
4267 gcc_assert (DR_REF (dr));
4268 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4269 gcc_assert (!stmt_info->dr_aux.dr);
4270 stmt_info->dr_aux.dr = dr;
4271 stmt_info->dr_aux.stmt = stmt_info;
4273 /* Check that analysis of the data-ref succeeded. */
4274 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4275 || !DR_STEP (dr))
4277 bool maybe_gather
4278 = DR_IS_READ (dr)
4279 && !TREE_THIS_VOLATILE (DR_REF (dr))
4280 && (targetm.vectorize.builtin_gather != NULL
4281 || supports_vec_gather_load_p ());
4282 bool maybe_scatter
4283 = DR_IS_WRITE (dr)
4284 && !TREE_THIS_VOLATILE (DR_REF (dr))
4285 && (targetm.vectorize.builtin_scatter != NULL
4286 || supports_vec_scatter_store_p ());
4288 /* If target supports vector gather loads or scatter stores,
4289 see if they can't be used. */
4290 if (is_a <loop_vec_info> (vinfo)
4291 && !nested_in_vect_loop_p (loop, stmt_info))
4293 if (maybe_gather || maybe_scatter)
4295 if (maybe_gather)
4296 gatherscatter = GATHER;
4297 else
4298 gatherscatter = SCATTER;
4302 if (gatherscatter == SG_NONE)
4304 if (dump_enabled_p ())
4305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4306 "not vectorized: data ref analysis "
4307 "failed %G", stmt_info->stmt);
4308 if (is_a <bb_vec_info> (vinfo))
4310 /* In BB vectorization the ref can still participate
4311 in dependence analysis, we just can't vectorize it. */
4312 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4313 continue;
4315 return opt_result::failure_at (stmt_info->stmt,
4316 "not vectorized:"
4317 " data ref analysis failed: %G",
4318 stmt_info->stmt);
4322 /* See if this was detected as SIMD lane access. */
4323 if (dr->aux == (void *)-1
4324 || dr->aux == (void *)-2
4325 || dr->aux == (void *)-3
4326 || dr->aux == (void *)-4)
4328 if (nested_in_vect_loop_p (loop, stmt_info))
4329 return opt_result::failure_at (stmt_info->stmt,
4330 "not vectorized:"
4331 " data ref analysis failed: %G",
4332 stmt_info->stmt);
4333 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4334 = -(uintptr_t) dr->aux;
4337 tree base = get_base_address (DR_REF (dr));
4338 if (base && VAR_P (base) && DECL_NONALIASED (base))
4340 if (dump_enabled_p ())
4341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4342 "not vectorized: base object not addressable "
4343 "for stmt: %G", stmt_info->stmt);
4344 if (is_a <bb_vec_info> (vinfo))
4346 /* In BB vectorization the ref can still participate
4347 in dependence analysis, we just can't vectorize it. */
4348 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4349 continue;
4351 return opt_result::failure_at (stmt_info->stmt,
4352 "not vectorized: base object not"
4353 " addressable for stmt: %G",
4354 stmt_info->stmt);
4357 if (is_a <loop_vec_info> (vinfo)
4358 && DR_STEP (dr)
4359 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4361 if (nested_in_vect_loop_p (loop, stmt_info))
4362 return opt_result::failure_at (stmt_info->stmt,
4363 "not vectorized: "
4364 "not suitable for strided load %G",
4365 stmt_info->stmt);
4366 STMT_VINFO_STRIDED_P (stmt_info) = true;
4369 /* Update DR field in stmt_vec_info struct. */
4371 /* If the dataref is in an inner-loop of the loop that is considered for
4372 for vectorization, we also want to analyze the access relative to
4373 the outer-loop (DR contains information only relative to the
4374 inner-most enclosing loop). We do that by building a reference to the
4375 first location accessed by the inner-loop, and analyze it relative to
4376 the outer-loop. */
4377 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4379 /* Build a reference to the first location accessed by the
4380 inner loop: *(BASE + INIT + OFFSET). By construction,
4381 this address must be invariant in the inner loop, so we
4382 can consider it as being used in the outer loop. */
4383 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4384 tree offset = unshare_expr (DR_OFFSET (dr));
4385 tree init = unshare_expr (DR_INIT (dr));
4386 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4387 init, offset);
4388 tree init_addr = fold_build_pointer_plus (base, init_offset);
4389 tree init_ref = build_fold_indirect_ref (init_addr);
4391 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_NOTE, vect_location,
4393 "analyze in outer loop: %T\n", init_ref);
4395 opt_result res
4396 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4397 init_ref, loop, stmt_info->stmt);
4398 if (!res)
4399 /* dr_analyze_innermost already explained the failure. */
4400 return res;
4402 if (dump_enabled_p ())
4403 dump_printf_loc (MSG_NOTE, vect_location,
4404 "\touter base_address: %T\n"
4405 "\touter offset from base address: %T\n"
4406 "\touter constant offset from base address: %T\n"
4407 "\touter step: %T\n"
4408 "\touter base alignment: %d\n\n"
4409 "\touter base misalignment: %d\n"
4410 "\touter offset alignment: %d\n"
4411 "\touter step alignment: %d\n",
4412 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4413 STMT_VINFO_DR_OFFSET (stmt_info),
4414 STMT_VINFO_DR_INIT (stmt_info),
4415 STMT_VINFO_DR_STEP (stmt_info),
4416 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4417 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4418 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4419 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4422 /* Set vectype for STMT. */
4423 scalar_type = TREE_TYPE (DR_REF (dr));
4424 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4425 if (!vectype)
4427 if (dump_enabled_p ())
4429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4430 "not vectorized: no vectype for stmt: %G",
4431 stmt_info->stmt);
4432 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4433 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4434 scalar_type);
4435 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4438 if (is_a <bb_vec_info> (vinfo))
4440 /* No vector type is fine, the ref can still participate
4441 in dependence analysis, we just can't vectorize it. */
4442 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4443 continue;
4445 if (fatal)
4446 *fatal = false;
4447 return opt_result::failure_at (stmt_info->stmt,
4448 "not vectorized:"
4449 " no vectype for stmt: %G"
4450 " scalar_type: %T\n",
4451 stmt_info->stmt, scalar_type);
4453 else
4455 if (dump_enabled_p ())
4456 dump_printf_loc (MSG_NOTE, vect_location,
4457 "got vectype for stmt: %G%T\n",
4458 stmt_info->stmt, vectype);
4461 /* Adjust the minimal vectorization factor according to the
4462 vector type. */
4463 vf = TYPE_VECTOR_SUBPARTS (vectype);
4464 *min_vf = upper_bound (*min_vf, vf);
4466 /* Leave the BB vectorizer to pick the vector type later, based on
4467 the final dataref group size and SLP node size. */
4468 if (is_a <loop_vec_info> (vinfo))
4469 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4471 if (gatherscatter != SG_NONE)
4473 gather_scatter_info gs_info;
4474 if (!vect_check_gather_scatter (stmt_info,
4475 as_a <loop_vec_info> (vinfo),
4476 &gs_info)
4477 || !get_vectype_for_scalar_type (vinfo,
4478 TREE_TYPE (gs_info.offset)))
4480 if (fatal)
4481 *fatal = false;
4482 return opt_result::failure_at
4483 (stmt_info->stmt,
4484 (gatherscatter == GATHER)
4485 ? "not vectorized: not suitable for gather load %G"
4486 : "not vectorized: not suitable for scatter store %G",
4487 stmt_info->stmt);
4489 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4493 /* We used to stop processing and prune the list here. Verify we no
4494 longer need to. */
4495 gcc_assert (i == datarefs.length ());
4497 return opt_result::success ();
4501 /* Function vect_get_new_vect_var.
4503 Returns a name for a new variable. The current naming scheme appends the
4504 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4505 the name of vectorizer generated variables, and appends that to NAME if
4506 provided. */
4508 tree
4509 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4511 const char *prefix;
4512 tree new_vect_var;
4514 switch (var_kind)
4516 case vect_simple_var:
4517 prefix = "vect";
4518 break;
4519 case vect_scalar_var:
4520 prefix = "stmp";
4521 break;
4522 case vect_mask_var:
4523 prefix = "mask";
4524 break;
4525 case vect_pointer_var:
4526 prefix = "vectp";
4527 break;
4528 default:
4529 gcc_unreachable ();
4532 if (name)
4534 char* tmp = concat (prefix, "_", name, NULL);
4535 new_vect_var = create_tmp_reg (type, tmp);
4536 free (tmp);
4538 else
4539 new_vect_var = create_tmp_reg (type, prefix);
4541 return new_vect_var;
4544 /* Like vect_get_new_vect_var but return an SSA name. */
4546 tree
4547 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4549 const char *prefix;
4550 tree new_vect_var;
4552 switch (var_kind)
4554 case vect_simple_var:
4555 prefix = "vect";
4556 break;
4557 case vect_scalar_var:
4558 prefix = "stmp";
4559 break;
4560 case vect_pointer_var:
4561 prefix = "vectp";
4562 break;
4563 default:
4564 gcc_unreachable ();
4567 if (name)
4569 char* tmp = concat (prefix, "_", name, NULL);
4570 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4571 free (tmp);
4573 else
4574 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4576 return new_vect_var;
4579 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. */
4581 static void
4582 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4584 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4585 int misalign = DR_MISALIGNMENT (dr_info);
4586 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4587 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4588 else
4589 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4590 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4591 misalign);
4594 /* Function vect_create_addr_base_for_vector_ref.
4596 Create an expression that computes the address of the first memory location
4597 that will be accessed for a data reference.
4599 Input:
4600 STMT_INFO: The statement containing the data reference.
4601 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4602 OFFSET: Optional. If supplied, it is be added to the initial address.
4603 LOOP: Specify relative to which loop-nest should the address be computed.
4604 For example, when the dataref is in an inner-loop nested in an
4605 outer-loop that is now being vectorized, LOOP can be either the
4606 outer-loop, or the inner-loop. The first memory location accessed
4607 by the following dataref ('in' points to short):
4609 for (i=0; i<N; i++)
4610 for (j=0; j<M; j++)
4611 s += in[i+j]
4613 is as follows:
4614 if LOOP=i_loop: &in (relative to i_loop)
4615 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4616 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4617 initial address. Unlike OFFSET, which is number of elements to
4618 be added, BYTE_OFFSET is measured in bytes.
4620 Output:
4621 1. Return an SSA_NAME whose value is the address of the memory location of
4622 the first vector of the data reference.
4623 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4624 these statement(s) which define the returned SSA_NAME.
4626 FORNOW: We are only handling array accesses with step 1. */
4628 tree
4629 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4630 gimple_seq *new_stmt_list,
4631 tree offset,
4632 tree byte_offset)
4634 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4635 struct data_reference *dr = dr_info->dr;
4636 const char *base_name;
4637 tree addr_base;
4638 tree dest;
4639 gimple_seq seq = NULL;
4640 tree vect_ptr_type;
4641 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4642 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4643 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4645 tree data_ref_base = unshare_expr (drb->base_address);
4646 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4647 tree init = unshare_expr (drb->init);
4649 if (loop_vinfo)
4650 base_name = get_name (data_ref_base);
4651 else
4653 base_offset = ssize_int (0);
4654 init = ssize_int (0);
4655 base_name = get_name (DR_REF (dr));
4658 /* Create base_offset */
4659 base_offset = size_binop (PLUS_EXPR,
4660 fold_convert (sizetype, base_offset),
4661 fold_convert (sizetype, init));
4663 if (offset)
4665 offset = fold_build2 (MULT_EXPR, sizetype,
4666 fold_convert (sizetype, offset), step);
4667 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4668 base_offset, offset);
4670 if (byte_offset)
4672 byte_offset = fold_convert (sizetype, byte_offset);
4673 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4674 base_offset, byte_offset);
4677 /* base + base_offset */
4678 if (loop_vinfo)
4679 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4680 else
4682 addr_base = build1 (ADDR_EXPR,
4683 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4684 unshare_expr (DR_REF (dr)));
4687 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4688 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4689 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4690 gimple_seq_add_seq (new_stmt_list, seq);
4692 if (DR_PTR_INFO (dr)
4693 && TREE_CODE (addr_base) == SSA_NAME
4694 && !SSA_NAME_PTR_INFO (addr_base))
4696 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4697 if (offset || byte_offset)
4698 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4701 if (dump_enabled_p ())
4702 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4704 return addr_base;
4708 /* Function vect_create_data_ref_ptr.
4710 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4711 location accessed in the loop by STMT_INFO, along with the def-use update
4712 chain to appropriately advance the pointer through the loop iterations.
4713 Also set aliasing information for the pointer. This pointer is used by
4714 the callers to this function to create a memory reference expression for
4715 vector load/store access.
4717 Input:
4718 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4719 GIMPLE_ASSIGN <name, data-ref> or
4720 GIMPLE_ASSIGN <data-ref, name>.
4721 2. AGGR_TYPE: the type of the reference, which should be either a vector
4722 or an array.
4723 3. AT_LOOP: the loop where the vector memref is to be created.
4724 4. OFFSET (optional): an offset to be added to the initial address accessed
4725 by the data-ref in STMT_INFO.
4726 5. BSI: location where the new stmts are to be placed if there is no loop
4727 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4728 pointing to the initial address.
4729 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4730 to the initial address accessed by the data-ref in STMT_INFO. This is
4731 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4732 in bytes.
4733 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4734 to the IV during each iteration of the loop. NULL says to move
4735 by one copy of AGGR_TYPE up or down, depending on the step of the
4736 data reference.
4738 Output:
4739 1. Declare a new ptr to vector_type, and have it point to the base of the
4740 data reference (initial address accessed by the data reference).
4741 For example, for vector of type V8HI, the following code is generated:
4743 v8hi *ap;
4744 ap = (v8hi *)initial_address;
4746 if OFFSET is not supplied:
4747 initial_address = &a[init];
4748 if OFFSET is supplied:
4749 initial_address = &a[init + OFFSET];
4750 if BYTE_OFFSET is supplied:
4751 initial_address = &a[init] + BYTE_OFFSET;
4753 Return the initial_address in INITIAL_ADDRESS.
4755 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4756 update the pointer in each iteration of the loop.
4758 Return the increment stmt that updates the pointer in PTR_INCR.
4760 3. Return the pointer. */
4762 tree
4763 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4764 tree aggr_type, class loop *at_loop, tree offset,
4765 tree *initial_address, gimple_stmt_iterator *gsi,
4766 gimple **ptr_incr, bool only_init,
4767 tree byte_offset, tree iv_step)
4769 const char *base_name;
4770 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4771 class loop *loop = NULL;
4772 bool nested_in_vect_loop = false;
4773 class loop *containing_loop = NULL;
4774 tree aggr_ptr_type;
4775 tree aggr_ptr;
4776 tree new_temp;
4777 gimple_seq new_stmt_list = NULL;
4778 edge pe = NULL;
4779 basic_block new_bb;
4780 tree aggr_ptr_init;
4781 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4782 struct data_reference *dr = dr_info->dr;
4783 tree aptr;
4784 gimple_stmt_iterator incr_gsi;
4785 bool insert_after;
4786 tree indx_before_incr, indx_after_incr;
4787 gimple *incr;
4788 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4790 gcc_assert (iv_step != NULL_TREE
4791 || TREE_CODE (aggr_type) == ARRAY_TYPE
4792 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4794 if (loop_vinfo)
4796 loop = LOOP_VINFO_LOOP (loop_vinfo);
4797 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4798 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4799 pe = loop_preheader_edge (loop);
4801 else
4803 gcc_assert (bb_vinfo);
4804 only_init = true;
4805 *ptr_incr = NULL;
4808 /* Create an expression for the first address accessed by this load
4809 in LOOP. */
4810 base_name = get_name (DR_BASE_ADDRESS (dr));
4812 if (dump_enabled_p ())
4814 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4815 dump_printf_loc (MSG_NOTE, vect_location,
4816 "create %s-pointer variable to type: %T",
4817 get_tree_code_name (TREE_CODE (aggr_type)),
4818 aggr_type);
4819 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4820 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4821 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4822 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4823 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4824 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4825 else
4826 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4827 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4830 /* (1) Create the new aggregate-pointer variable.
4831 Vector and array types inherit the alias set of their component
4832 type by default so we need to use a ref-all pointer if the data
4833 reference does not conflict with the created aggregated data
4834 reference because it is not addressable. */
4835 bool need_ref_all = false;
4836 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4837 get_alias_set (DR_REF (dr))))
4838 need_ref_all = true;
4839 /* Likewise for any of the data references in the stmt group. */
4840 else if (DR_GROUP_SIZE (stmt_info) > 1)
4842 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4845 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4846 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4847 get_alias_set (DR_REF (sdr))))
4849 need_ref_all = true;
4850 break;
4852 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4854 while (sinfo);
4856 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4857 need_ref_all);
4858 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4861 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4862 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4863 def-use update cycles for the pointer: one relative to the outer-loop
4864 (LOOP), which is what steps (3) and (4) below do. The other is relative
4865 to the inner-loop (which is the inner-most loop containing the dataref),
4866 and this is done by step (5) below.
4868 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4869 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4870 redundant. Steps (3),(4) create the following:
4872 vp0 = &base_addr;
4873 LOOP: vp1 = phi(vp0,vp2)
4876 vp2 = vp1 + step
4877 goto LOOP
4879 If there is an inner-loop nested in loop, then step (5) will also be
4880 applied, and an additional update in the inner-loop will be created:
4882 vp0 = &base_addr;
4883 LOOP: vp1 = phi(vp0,vp2)
4885 inner: vp3 = phi(vp1,vp4)
4886 vp4 = vp3 + inner_step
4887 if () goto inner
4889 vp2 = vp1 + step
4890 if () goto LOOP */
4892 /* (2) Calculate the initial address of the aggregate-pointer, and set
4893 the aggregate-pointer to point to it before the loop. */
4895 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4897 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4898 stmt_info, &new_stmt_list,
4899 offset, byte_offset);
4900 if (new_stmt_list)
4902 if (pe)
4904 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4905 gcc_assert (!new_bb);
4907 else
4908 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4911 *initial_address = new_temp;
4912 aggr_ptr_init = new_temp;
4914 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4915 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4916 inner-loop nested in LOOP (during outer-loop vectorization). */
4918 /* No update in loop is required. */
4919 if (only_init && (!loop_vinfo || at_loop == loop))
4920 aptr = aggr_ptr_init;
4921 else
4923 /* Accesses to invariant addresses should be handled specially
4924 by the caller. */
4925 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4926 gcc_assert (!integer_zerop (step));
4928 if (iv_step == NULL_TREE)
4930 /* The step of the aggregate pointer is the type size,
4931 negated for downward accesses. */
4932 iv_step = TYPE_SIZE_UNIT (aggr_type);
4933 if (tree_int_cst_sgn (step) == -1)
4934 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4937 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4939 create_iv (aggr_ptr_init,
4940 fold_convert (aggr_ptr_type, iv_step),
4941 aggr_ptr, loop, &incr_gsi, insert_after,
4942 &indx_before_incr, &indx_after_incr);
4943 incr = gsi_stmt (incr_gsi);
4945 /* Copy the points-to information if it exists. */
4946 if (DR_PTR_INFO (dr))
4948 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4949 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4951 if (ptr_incr)
4952 *ptr_incr = incr;
4954 aptr = indx_before_incr;
4957 if (!nested_in_vect_loop || only_init)
4958 return aptr;
4961 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4962 nested in LOOP, if exists. */
4964 gcc_assert (nested_in_vect_loop);
4965 if (!only_init)
4967 standard_iv_increment_position (containing_loop, &incr_gsi,
4968 &insert_after);
4969 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4970 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4971 &indx_after_incr);
4972 incr = gsi_stmt (incr_gsi);
4974 /* Copy the points-to information if it exists. */
4975 if (DR_PTR_INFO (dr))
4977 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4978 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4980 if (ptr_incr)
4981 *ptr_incr = incr;
4983 return indx_before_incr;
4985 else
4986 gcc_unreachable ();
4990 /* Function bump_vector_ptr
4992 Increment a pointer (to a vector type) by vector-size. If requested,
4993 i.e. if PTR-INCR is given, then also connect the new increment stmt
4994 to the existing def-use update-chain of the pointer, by modifying
4995 the PTR_INCR as illustrated below:
4997 The pointer def-use update-chain before this function:
4998 DATAREF_PTR = phi (p_0, p_2)
4999 ....
5000 PTR_INCR: p_2 = DATAREF_PTR + step
5002 The pointer def-use update-chain after this function:
5003 DATAREF_PTR = phi (p_0, p_2)
5004 ....
5005 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5006 ....
5007 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5009 Input:
5010 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5011 in the loop.
5012 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5013 the loop. The increment amount across iterations is expected
5014 to be vector_size.
5015 BSI - location where the new update stmt is to be placed.
5016 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5017 BUMP - optional. The offset by which to bump the pointer. If not given,
5018 the offset is assumed to be vector_size.
5020 Output: Return NEW_DATAREF_PTR as illustrated above.
5024 tree
5025 bump_vector_ptr (vec_info *vinfo,
5026 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5027 stmt_vec_info stmt_info, tree bump)
5029 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5030 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5031 tree update = TYPE_SIZE_UNIT (vectype);
5032 gassign *incr_stmt;
5033 ssa_op_iter iter;
5034 use_operand_p use_p;
5035 tree new_dataref_ptr;
5037 if (bump)
5038 update = bump;
5040 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5041 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5042 else
5043 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5044 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5045 dataref_ptr, update);
5046 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5048 /* Copy the points-to information if it exists. */
5049 if (DR_PTR_INFO (dr))
5051 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5052 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5055 if (!ptr_incr)
5056 return new_dataref_ptr;
5058 /* Update the vector-pointer's cross-iteration increment. */
5059 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5061 tree use = USE_FROM_PTR (use_p);
5063 if (use == dataref_ptr)
5064 SET_USE (use_p, new_dataref_ptr);
5065 else
5066 gcc_assert (operand_equal_p (use, update, 0));
5069 return new_dataref_ptr;
5073 /* Copy memory reference info such as base/clique from the SRC reference
5074 to the DEST MEM_REF. */
5076 void
5077 vect_copy_ref_info (tree dest, tree src)
5079 if (TREE_CODE (dest) != MEM_REF)
5080 return;
5082 tree src_base = src;
5083 while (handled_component_p (src_base))
5084 src_base = TREE_OPERAND (src_base, 0);
5085 if (TREE_CODE (src_base) != MEM_REF
5086 && TREE_CODE (src_base) != TARGET_MEM_REF)
5087 return;
5089 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5090 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5094 /* Function vect_create_destination_var.
5096 Create a new temporary of type VECTYPE. */
5098 tree
5099 vect_create_destination_var (tree scalar_dest, tree vectype)
5101 tree vec_dest;
5102 const char *name;
5103 char *new_name;
5104 tree type;
5105 enum vect_var_kind kind;
5107 kind = vectype
5108 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5109 ? vect_mask_var
5110 : vect_simple_var
5111 : vect_scalar_var;
5112 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5114 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5116 name = get_name (scalar_dest);
5117 if (name)
5118 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5119 else
5120 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5121 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5122 free (new_name);
5124 return vec_dest;
5127 /* Function vect_grouped_store_supported.
5129 Returns TRUE if interleave high and interleave low permutations
5130 are supported, and FALSE otherwise. */
5132 bool
5133 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5135 machine_mode mode = TYPE_MODE (vectype);
5137 /* vect_permute_store_chain requires the group size to be equal to 3 or
5138 be a power of two. */
5139 if (count != 3 && exact_log2 (count) == -1)
5141 if (dump_enabled_p ())
5142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5143 "the size of the group of accesses"
5144 " is not a power of 2 or not eqaul to 3\n");
5145 return false;
5148 /* Check that the permutation is supported. */
5149 if (VECTOR_MODE_P (mode))
5151 unsigned int i;
5152 if (count == 3)
5154 unsigned int j0 = 0, j1 = 0, j2 = 0;
5155 unsigned int i, j;
5157 unsigned int nelt;
5158 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5160 if (dump_enabled_p ())
5161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5162 "cannot handle groups of 3 stores for"
5163 " variable-length vectors\n");
5164 return false;
5167 vec_perm_builder sel (nelt, nelt, 1);
5168 sel.quick_grow (nelt);
5169 vec_perm_indices indices;
5170 for (j = 0; j < 3; j++)
5172 int nelt0 = ((3 - j) * nelt) % 3;
5173 int nelt1 = ((3 - j) * nelt + 1) % 3;
5174 int nelt2 = ((3 - j) * nelt + 2) % 3;
5175 for (i = 0; i < nelt; i++)
5177 if (3 * i + nelt0 < nelt)
5178 sel[3 * i + nelt0] = j0++;
5179 if (3 * i + nelt1 < nelt)
5180 sel[3 * i + nelt1] = nelt + j1++;
5181 if (3 * i + nelt2 < nelt)
5182 sel[3 * i + nelt2] = 0;
5184 indices.new_vector (sel, 2, nelt);
5185 if (!can_vec_perm_const_p (mode, indices))
5187 if (dump_enabled_p ())
5188 dump_printf (MSG_MISSED_OPTIMIZATION,
5189 "permutation op not supported by target.\n");
5190 return false;
5193 for (i = 0; i < nelt; i++)
5195 if (3 * i + nelt0 < nelt)
5196 sel[3 * i + nelt0] = 3 * i + nelt0;
5197 if (3 * i + nelt1 < nelt)
5198 sel[3 * i + nelt1] = 3 * i + nelt1;
5199 if (3 * i + nelt2 < nelt)
5200 sel[3 * i + nelt2] = nelt + j2++;
5202 indices.new_vector (sel, 2, nelt);
5203 if (!can_vec_perm_const_p (mode, indices))
5205 if (dump_enabled_p ())
5206 dump_printf (MSG_MISSED_OPTIMIZATION,
5207 "permutation op not supported by target.\n");
5208 return false;
5211 return true;
5213 else
5215 /* If length is not equal to 3 then only power of 2 is supported. */
5216 gcc_assert (pow2p_hwi (count));
5217 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5219 /* The encoding has 2 interleaved stepped patterns. */
5220 vec_perm_builder sel (nelt, 2, 3);
5221 sel.quick_grow (6);
5222 for (i = 0; i < 3; i++)
5224 sel[i * 2] = i;
5225 sel[i * 2 + 1] = i + nelt;
5227 vec_perm_indices indices (sel, 2, nelt);
5228 if (can_vec_perm_const_p (mode, indices))
5230 for (i = 0; i < 6; i++)
5231 sel[i] += exact_div (nelt, 2);
5232 indices.new_vector (sel, 2, nelt);
5233 if (can_vec_perm_const_p (mode, indices))
5234 return true;
5239 if (dump_enabled_p ())
5240 dump_printf (MSG_MISSED_OPTIMIZATION,
5241 "permutation op not supported by target.\n");
5242 return false;
5246 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5247 type VECTYPE. MASKED_P says whether the masked form is needed. */
5249 bool
5250 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5251 bool masked_p)
5253 if (masked_p)
5254 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5255 vec_mask_store_lanes_optab,
5256 vectype, count);
5257 else
5258 return vect_lanes_optab_supported_p ("vec_store_lanes",
5259 vec_store_lanes_optab,
5260 vectype, count);
5264 /* Function vect_permute_store_chain.
5266 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5267 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5268 the data correctly for the stores. Return the final references for stores
5269 in RESULT_CHAIN.
The permute stmts are emitted through vect_finish_stmt_generation using
VINFO, STMT_INFO and GSI. The vector type is taken from STMT_INFO.
In the power-of-2 case the storage of DR_CHAIN is overwritten with the
output of each stage.
5271 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5272 The input is 4 vectors each containing 8 elements. We assign a number to
5273 each element, the input sequence is:
5275 1st vec: 0 1 2 3 4 5 6 7
5276 2nd vec: 8 9 10 11 12 13 14 15
5277 3rd vec: 16 17 18 19 20 21 22 23
5278 4th vec: 24 25 26 27 28 29 30 31
5280 The output sequence should be:
5282 1st vec: 0 8 16 24 1 9 17 25
5283 2nd vec: 2 10 18 26 3 11 19 27
5284 3rd vec: 4 12 20 28 5 13 21 29
5285 4th vec: 6 14 22 30 7 15 23 31
5287 i.e., we interleave the contents of the four vectors in their order.
5289 We use interleave_high/low instructions to create such output. The input of
5290 each interleave_high/low operation is two vectors:
5291 1st vec 2nd vec
5292 0 1 2 3 4 5 6 7
5293 the even elements of the result vector are obtained left-to-right from the
5294 high/low elements of the first vector. The odd elements of the result are
5295 obtained left-to-right from the high/low elements of the second vector.
5296 The output of interleave_high will be: 0 4 1 5
5297 and of interleave_low: 2 6 3 7
5300 The permutation is done in log LENGTH stages. In each stage interleave_high
5301 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5302 where the first argument is taken from the first half of DR_CHAIN and the
5303 second argument from its second half.
5304 In our example,
5306 I1: interleave_high (1st vec, 3rd vec)
5307 I2: interleave_low (1st vec, 3rd vec)
5308 I3: interleave_high (2nd vec, 4th vec)
5309 I4: interleave_low (2nd vec, 4th vec)
5311 The output for the first stage is:
5313 I1: 0 16 1 17 2 18 3 19
5314 I2: 4 20 5 21 6 22 7 23
5315 I3: 8 24 9 25 10 26 11 27
5316 I4: 12 28 13 29 14 30 15 31
5318 The output of the second stage, i.e. the final result is:
5320 I1: 0 8 16 24 1 9 17 25
5321 I2: 2 10 18 26 3 11 19 27
5322 I3: 4 12 20 28 5 13 21 29
5323 I4: 6 14 22 30 7 15 23 31. */
5325 void
5326 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5327 unsigned int length,
5328 stmt_vec_info stmt_info,
5329 gimple_stmt_iterator *gsi,
5330 vec<tree> *result_chain)
5332 tree vect1, vect2, high, low;
5333 gimple *perm_stmt;
5334 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5335 tree perm_mask_low, perm_mask_high;
5336 tree data_ref;
5337 tree perm3_mask_low, perm3_mask_high;
5338 unsigned int i, j, n, log_length = exact_log2 (length);
/* Start with RESULT_CHAIN as a plain copy of DR_CHAIN.
   NOTE(review): quick_grow presumably relies on the caller having
   reserved room for LENGTH elements - confirm against callers. */
5340 result_chain->quick_grow (length);
5341 memcpy (result_chain->address (), dr_chain.address (),
5342 length * sizeof (tree));
5344 if (length == 3)
5346 /* vect_grouped_store_supported ensures that this is constant. */
5347 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
/* Running source indices into the first, second and third input
   vector. They are declared outside the loop over J, so each output
   vector continues where the previous one stopped. */
5348 unsigned int j0 = 0, j1 = 0, j2 = 0;
5350 vec_perm_builder sel (nelt, nelt, 1);
5351 sel.quick_grow (nelt);
5352 vec_perm_indices indices;
5353 for (j = 0; j < 3; j++)
/* Position, within each triple of output elements, of the element
   coming from the first, second and third input vector. */
5355 int nelt0 = ((3 - j) * nelt) % 3;
5356 int nelt1 = ((3 - j) * nelt + 1) % 3;
5357 int nelt2 = ((3 - j) * nelt + 2) % 3;
5359 for (i = 0; i < nelt; i++)
5361 if (3 * i + nelt0 < nelt)
5362 sel[3 * i + nelt0] = j0++;
5363 if (3 * i + nelt1 < nelt)
5364 sel[3 * i + nelt1] = nelt + j1++;
5365 if (3 * i + nelt2 < nelt)
5366 sel[3 * i + nelt2] = 0;
5368 indices.new_vector (sel, 2, nelt);
5369 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5371 for (i = 0; i < nelt; i++)
5373 if (3 * i + nelt0 < nelt)
5374 sel[3 * i + nelt0] = 3 * i + nelt0;
5375 if (3 * i + nelt1 < nelt)
5376 sel[3 * i + nelt1] = 3 * i + nelt1;
5377 if (3 * i + nelt2 < nelt)
5378 sel[3 * i + nelt2] = nelt + j2++;
5380 indices.new_vector (sel, 2, nelt);
5381 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5383 vect1 = dr_chain[0];
5384 vect2 = dr_chain[1];
5386 /* Create interleaving stmt:
5387 low = VEC_PERM_EXPR <vect1, vect2,
5388 {j, nelt, *, j + 1, nelt + j + 1, *,
5389 j + 2, nelt + j + 2, *, ...}> */
5390 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5391 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5392 vect2, perm3_mask_low);
5393 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
/* The low shuffle result becomes the first input of the high shuffle,
   which merges in the third vector of the chain. */
5395 vect1 = data_ref;
5396 vect2 = dr_chain[2];
5397 /* Create interleaving stmt:
5398 high = VEC_PERM_EXPR <vect1, vect2,
5399 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5400 6, 7, nelt + j + 2, ...}> */
5401 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5402 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5403 vect2, perm3_mask_high);
5404 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5405 (*result_chain)[j] = data_ref;
5408 else
5410 /* If length is not equal to 3 then only power of 2 is supported. */
5411 gcc_assert (pow2p_hwi (length));
5413 /* The encoding has 2 interleaved stepped patterns. */
5414 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5415 vec_perm_builder sel (nelt, 2, 3);
5416 sel.quick_grow (6);
5417 for (i = 0; i < 3; i++)
5419 sel[i * 2] = i;
5420 sel[i * 2 + 1] = i + nelt;
5422 vec_perm_indices indices (sel, 2, nelt);
5423 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
/* The low mask is the high mask shifted by half a vector. */
5425 for (i = 0; i < 6; i++)
5426 sel[i] += exact_div (nelt, 2);
5427 indices.new_vector (sel, 2, nelt);
5428 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
/* One iteration of the outer loop per permutation stage. */
5430 for (i = 0, n = log_length; i < n; i++)
5432 for (j = 0; j < length/2; j++)
5434 vect1 = dr_chain[j];
5435 vect2 = dr_chain[j+length/2];
5437 /* Create interleaving stmt:
5438 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5439 ...}> */
5440 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5441 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5442 vect2, perm_mask_high);
5443 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5444 (*result_chain)[2*j] = high;
5446 /* Create interleaving stmt:
5447 low = VEC_PERM_EXPR <vect1, vect2,
5448 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5449 ...}> */
5450 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5451 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5452 vect2, perm_mask_low);
5453 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5454 (*result_chain)[2*j+1] = low;
/* Copy the results back into DR_CHAIN's storage for further use. */
5456 memcpy (dr_chain.address (), result_chain->address (),
5457 length * sizeof (tree));
5462 /* Function vect_setup_realignment
5464 This function is called when vectorizing an unaligned load using
5465 the dr_explicit_realign[_optimized] scheme.
5466 This function generates the following code at the loop prolog:
5468 p = initial_addr;
5469 x msq_init = *(floor(p)); # prolog load
5470 realignment_token = call target_builtin;
5471 loop:
5472 x msq = phi (msq_init, ---)
5474 The stmts marked with x are generated only for the case of
5475 dr_explicit_realign_optimized.
5477 The code above sets up a new (vector) pointer, pointing to the first
5478 location accessed by STMT_INFO, and a "floor-aligned" load using that
5479 pointer. It also generates code to compute the "realignment-token"
5480 (if the relevant target hook was defined), and creates a phi-node at the
5481 loop-header bb whose arguments are the result of the prolog-load (created
5482 by this function) and the result of a load that takes place in the loop
5483 (to be created by the caller to this function).
5485 For the case of dr_explicit_realign_optimized:
5486 The caller to this function uses the phi-result (msq) to create the
5487 realignment code inside the loop, and sets up the missing phi argument,
5488 as follows:
5489 loop:
5490 msq = phi (msq_init, lsq)
5491 lsq = *(floor(p')); # load in loop
5492 result = realign_load (msq, lsq, realignment_token);
5494 For the case of dr_explicit_realign:
5495 loop:
5496 msq = *(floor(p)); # load in loop
5497 p' = p + (VS-1);
5498 lsq = *(floor(p')); # load in loop
5499 result = realign_load (msq, lsq, realignment_token);
5501 Input:
5502 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5503 a memory location that may be unaligned.
5504 GSI - place where new code is to be inserted.
5505 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5506 is used.
INIT_ADDR - if not NULL_TREE, the initial address as already computed by
the caller. Passing it (or not vectorizing a loop at all) selects
the compute-in-loop mode described under step 1 in the body, which is
only valid for dr_explicit_realign.
5508 Output:
5509 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5510 target hook, if defined.
AT_LOOP - if non-NULL, set to the loop chosen for the initial vector load
(see step 2 in the body).
5511 Return value - the result of the loop-header phi node. For
dr_explicit_realign no phi node is created and NULL_TREE is returned. */
5513 tree
5514 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5515 gimple_stmt_iterator *gsi, tree *realignment_token,
5516 enum dr_alignment_support alignment_support_scheme,
5517 tree init_addr,
5518 class loop **at_loop)
5520 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5521 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5522 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5523 struct data_reference *dr = dr_info->dr;
5524 class loop *loop = NULL;
5525 edge pe = NULL;
5526 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5527 tree vec_dest;
5528 gimple *inc;
5529 tree ptr;
5530 tree data_ref;
5531 basic_block new_bb;
5532 tree msq_init = NULL_TREE;
5533 tree new_temp;
5534 gphi *phi_stmt;
5535 tree msq = NULL_TREE;
5536 gimple_seq stmts = NULL;
5537 bool compute_in_loop = false;
5538 bool nested_in_vect_loop = false;
5539 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5540 class loop *loop_for_initial_load = NULL;
/* LOOP stays NULL when VINFO is not a loop_vec_info. */
5542 if (loop_vinfo)
5544 loop = LOOP_VINFO_LOOP (loop_vinfo);
5545 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5548 gcc_assert (alignment_support_scheme == dr_explicit_realign
5549 || alignment_support_scheme == dr_explicit_realign_optimized);
5551 /* We need to generate three things:
5552 1. the misalignment computation
5553 2. the extra vector load (for the optimized realignment scheme).
5554 3. the phi node for the two vectors from which the realignment is
5555 done (for the optimized realignment scheme). */
5557 /* 1. Determine where to generate the misalignment computation.
5559 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5560 calculation will be generated by this function, outside the loop (in the
5561 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5562 caller, inside the loop.
5564 Background: If the misalignment remains fixed throughout the iterations of
5565 the loop, then both realignment schemes are applicable, and also the
5566 misalignment computation can be done outside LOOP. This is because we are
5567 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5568 are a multiple of VS (the Vector Size), and therefore the misalignment in
5569 different vectorized LOOP iterations is always the same.
5570 The problem arises only if the memory access is in an inner-loop nested
5571 inside LOOP, which is now being vectorized using outer-loop vectorization.
5572 This is the only case when the misalignment of the memory access may not
5573 remain fixed throughout the iterations of the inner-loop (as explained in
5574 detail in vect_supportable_dr_alignment). In this case, not only is the
5575 optimized realignment scheme not applicable, but also the misalignment
5576 computation (and generation of the realignment token that is passed to
5577 REALIGN_LOAD) have to be done inside the loop.
5579 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5580 or not, which in turn determines if the misalignment is computed inside
5581 the inner-loop, or outside LOOP. */
5583 if (init_addr != NULL_TREE || !loop_vinfo)
5585 compute_in_loop = true;
5586 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5590 /* 2. Determine where to generate the extra vector load.
5592 For the optimized realignment scheme, instead of generating two vector
5593 loads in each iteration, we generate a single extra vector load in the
5594 preheader of the loop, and in each iteration reuse the result of the
5595 vector load from the previous iteration. In case the memory access is in
5596 an inner-loop nested inside LOOP, which is now being vectorized using
5597 outer-loop vectorization, we need to determine whether this initial vector
5598 load should be generated at the preheader of the inner-loop, or can be
5599 generated at the preheader of LOOP. If the memory access has no evolution
5600 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5601 to be generated inside LOOP (in the preheader of the inner-loop). */
5603 if (nested_in_vect_loop)
5605 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5606 bool invariant_in_outerloop =
5607 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5608 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5610 else
5611 loop_for_initial_load = loop;
5612 if (at_loop)
5613 *at_loop = loop_for_initial_load;
5615 if (loop_for_initial_load)
5616 pe = loop_preheader_edge (loop_for_initial_load);
5618 /* 3. For the case of the optimized realignment, create the first vector
5619 load at the loop preheader. */
5621 if (alignment_support_scheme == dr_explicit_realign_optimized)
5623 /* Create msq_init = *(floor(p1)) in the loop preheader */
5624 gassign *new_stmt;
5626 gcc_assert (!compute_in_loop);
5627 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5628 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5629 loop_for_initial_load, NULL_TREE,
5630 &init_addr, NULL, &inc, true);
5631 if (TREE_CODE (ptr) == SSA_NAME)
5632 new_temp = copy_ssa_name (ptr);
5633 else
5634 new_temp = make_ssa_name (TREE_TYPE (ptr));
/* floor(p): mask PTR with -ALIGN, i.e. clear its low bits. */
5635 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5636 tree type = TREE_TYPE (ptr);
5637 new_stmt = gimple_build_assign
5638 (new_temp, BIT_AND_EXPR, ptr,
5639 fold_build2 (MINUS_EXPR, type,
5640 build_int_cst (type, 0),
5641 build_int_cst (type, align)));
5642 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5643 gcc_assert (!new_bb);
5644 data_ref
5645 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5646 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5647 vect_copy_ref_info (data_ref, DR_REF (dr));
5648 new_stmt = gimple_build_assign (vec_dest, data_ref);
5649 new_temp = make_ssa_name (vec_dest, new_stmt);
5650 gimple_assign_set_lhs (new_stmt, new_temp);
/* NOTE(review): PE was already used unconditionally for the masking
   stmt above, so the else arm below looks unreachable - confirm. */
5651 if (pe)
5653 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5654 gcc_assert (!new_bb);
5656 else
5657 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5659 msq_init = gimple_assign_lhs (new_stmt);
5662 /* 4. Create realignment token using a target builtin, if available.
5663 It is done either inside the containing loop, or before LOOP (as
5664 determined above). */
5666 if (targetm.vectorize.builtin_mask_for_load)
5668 gcall *new_stmt;
5669 tree builtin_decl;
5671 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5672 if (!init_addr)
5674 /* Generate the INIT_ADDR computation outside LOOP. */
5675 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5676 stmt_info, &stmts,
5677 NULL_TREE);
5678 if (loop)
5680 pe = loop_preheader_edge (loop);
5681 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5682 gcc_assert (!new_bb);
5684 else
5685 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5688 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5689 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5690 vec_dest =
5691 vect_create_destination_var (scalar_dest,
5692 gimple_call_return_type (new_stmt));
5693 new_temp = make_ssa_name (vec_dest, new_stmt);
5694 gimple_call_set_lhs (new_stmt, new_temp);
5696 if (compute_in_loop)
5697 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5698 else
5700 /* Generate the misalignment computation outside LOOP. */
5701 pe = loop_preheader_edge (loop);
5702 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5703 gcc_assert (!new_bb);
5706 *realignment_token = gimple_call_lhs (new_stmt);
5708 /* The result of the CALL_EXPR to this builtin is determined from
5709 the value of the parameter and no global variables are touched
5710 which makes the builtin a "const" function. Requiring the
5711 builtin to have the "const" attribute makes it unnecessary
5712 to call mark_call_clobbered. */
5713 gcc_assert (TREE_READONLY (builtin_decl));
/* For dr_explicit_realign no phi is needed; MSQ is still NULL_TREE. */
5716 if (alignment_support_scheme == dr_explicit_realign)
5717 return msq;
5719 gcc_assert (!compute_in_loop);
5720 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5723 /* 5. Create msq = phi <msq_init, lsq> in loop */
/* Only the preheader argument is added here; the other phi argument
   is left for the caller to fill in. */
5725 pe = loop_preheader_edge (containing_loop);
5726 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5727 msq = make_ssa_name (vec_dest);
5728 phi_stmt = create_phi_node (msq, containing_loop->header);
5729 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5731 return msq;
5735 /* Function vect_grouped_load_supported.
5737 COUNT is the size of the load group (the number of statements plus the
5738 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5739 only one statement, with a gap of COUNT - 1.
5741 Returns true if a suitable permute exists. */
5743 bool
5744 vect_grouped_load_supported (tree vectype, bool single_element_p,
5745 unsigned HOST_WIDE_INT count)
5747 machine_mode mode = TYPE_MODE (vectype);
5749 /* If this is single-element interleaving with an element distance
5750 that leaves unused vector loads around punt - we at least create
5751 very sub-optimal code in that case (and blow up memory,
5752 see PR65518). */
5753 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5755 if (dump_enabled_p ())
5756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5757 "single-element interleaving not supported "
5758 "for not adjacent vector loads\n");
5759 return false;
5762 /* vect_permute_load_chain requires the group size to be equal to 3 or
5763 be a power of two. */
5764 if (count != 3 && exact_log2 (count) == -1)
5766 if (dump_enabled_p ())
5767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5768 "the size of the group of accesses"
5769 " is not a power of 2 or not equal to 3\n");
5770 return false;
5773 /* Check that the permutation is supported. */
5774 if (VECTOR_MODE_P (mode))
5776 unsigned int i, j;
5777 if (count == 3)
5779 unsigned int nelt;
5780 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5782 if (dump_enabled_p ())
5783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5784 "cannot handle groups of 3 loads for"
5785 " variable-length vectors\n");
5786 return false;
5789 vec_perm_builder sel (nelt, nelt, 1);
5790 sel.quick_grow (nelt);
5791 vec_perm_indices indices;
5792 unsigned int k;
5793 for (k = 0; k < 3; k++)
5795 for (i = 0; i < nelt; i++)
5796 if (3 * i + k < 2 * nelt)
5797 sel[i] = 3 * i + k;
5798 else
5799 sel[i] = 0;
5800 indices.new_vector (sel, 2, nelt);
5801 if (!can_vec_perm_const_p (mode, indices))
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 "shuffle of 3 loads is not supported by"
5806 " target\n");
5807 return false;
5809 for (i = 0, j = 0; i < nelt; i++)
5810 if (3 * i + k < 2 * nelt)
5811 sel[i] = i;
5812 else
5813 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5814 indices.new_vector (sel, 2, nelt);
5815 if (!can_vec_perm_const_p (mode, indices))
5817 if (dump_enabled_p ())
5818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5819 "shuffle of 3 loads is not supported by"
5820 " target\n");
5821 return false;
5824 return true;
5826 else
5828 /* If length is not equal to 3 then only power of 2 is supported. */
5829 gcc_assert (pow2p_hwi (count));
5830 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5832 /* The encoding has a single stepped pattern. */
5833 vec_perm_builder sel (nelt, 1, 3);
5834 sel.quick_grow (3);
5835 for (i = 0; i < 3; i++)
5836 sel[i] = i * 2;
5837 vec_perm_indices indices (sel, 2, nelt);
5838 if (can_vec_perm_const_p (mode, indices))
5840 for (i = 0; i < 3; i++)
5841 sel[i] = i * 2 + 1;
5842 indices.new_vector (sel, 2, nelt);
5843 if (can_vec_perm_const_p (mode, indices))
5844 return true;
5849 if (dump_enabled_p ())
5850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5851 "extract even/odd not supported by target\n");
5852 return false;
5855 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5856 type VECTYPE. MASKED_P says whether the masked form is needed. */
5858 bool
5859 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5860 bool masked_p)
5862 if (masked_p)
5863 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5864 vec_mask_load_lanes_optab,
5865 vectype, count);
5866 else
5867 return vect_lanes_optab_supported_p ("vec_load_lanes",
5868 vec_load_lanes_optab,
5869 vectype, count);
5872 /* Function vect_permute_load_chain.
5874 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5875 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5876 the input data correctly. Return the final references for loads in
5877 RESULT_CHAIN.
5879 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5880 The input is 4 vectors each containing 8 elements. We assign a number to each
5881 element, the input sequence is:
5883 1st vec: 0 1 2 3 4 5 6 7
5884 2nd vec: 8 9 10 11 12 13 14 15
5885 3rd vec: 16 17 18 19 20 21 22 23
5886 4th vec: 24 25 26 27 28 29 30 31
5888 The output sequence should be:
5890 1st vec: 0 4 8 12 16 20 24 28
5891 2nd vec: 1 5 9 13 17 21 25 29
5892 3rd vec: 2 6 10 14 18 22 26 30
5893 4th vec: 3 7 11 15 19 23 27 31
5895 i.e., the first output vector should contain the first elements of each
5896 interleaving group, etc.
5898 We use extract_even/odd instructions to create such output. The input of
5899 each extract_even/odd operation is two vectors
5900 1st vec 2nd vec
5901 0 1 2 3 4 5 6 7
5903 and the output is the vector of extracted even/odd elements. The output of
5904 extract_even will be: 0 2 4 6
5905 and of extract_odd: 1 3 5 7
5908 The permutation is done in log LENGTH stages. In each stage extract_even
5909 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5910 their order. In our example,
5912 E1: extract_even (1st vec, 2nd vec)
5913 E2: extract_odd (1st vec, 2nd vec)
5914 E3: extract_even (3rd vec, 4th vec)
5915 E4: extract_odd (3rd vec, 4th vec)
5917 The output for the first stage will be:
5919 E1: 0 2 4 6 8 10 12 14
5920 E2: 1 3 5 7 9 11 13 15
5921 E3: 16 18 20 22 24 26 28 30
5922 E4: 17 19 21 23 25 27 29 31
5924 In order to proceed and create the correct sequence for the next stage (or
5925 for the correct output, if the second stage is the last one, as in our
5926 example), we first put the output of extract_even operation and then the
5927 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5928 The input for the second stage is:
5930 1st vec (E1): 0 2 4 6 8 10 12 14
5931 2nd vec (E3): 16 18 20 22 24 26 28 30
5932 3rd vec (E2): 1 3 5 7 9 11 13 15
5933 4th vec (E4): 17 19 21 23 25 27 29 31
5935 The output of the second stage:
5937 E1: 0 4 8 12 16 20 24 28
5938 E2: 2 6 10 14 18 22 26 30
5939 E3: 1 5 9 13 17 21 25 29
5940 E4: 3 7 11 15 19 23 27 31
5942 And RESULT_CHAIN after reordering:
5944 1st vec (E1): 0 4 8 12 16 20 24 28
5945 2nd vec (E3): 1 5 9 13 17 21 25 29
5946 3rd vec (E2): 2 6 10 14 18 22 26 30
5947 4th vec (E4): 3 7 11 15 19 23 27 31. */
5949 static void
5950 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5951 unsigned int length,
5952 stmt_vec_info stmt_info,
5953 gimple_stmt_iterator *gsi,
5954 vec<tree> *result_chain)
5956 tree data_ref, first_vect, second_vect;
5957 tree perm_mask_even, perm_mask_odd;
5958 tree perm3_mask_low, perm3_mask_high;
5959 gimple *perm_stmt;
5960 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5961 unsigned int i, j, log_length = exact_log2 (length);
5963 result_chain->quick_grow (length);
5964 memcpy (result_chain->address (), dr_chain.address (),
5965 length * sizeof (tree));
5967 if (length == 3)
5969 /* vect_grouped_load_supported ensures that this is constant. */
5970 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5971 unsigned int k;
5973 vec_perm_builder sel (nelt, nelt, 1);
5974 sel.quick_grow (nelt);
5975 vec_perm_indices indices;
5976 for (k = 0; k < 3; k++)
5978 for (i = 0; i < nelt; i++)
5979 if (3 * i + k < 2 * nelt)
5980 sel[i] = 3 * i + k;
5981 else
5982 sel[i] = 0;
5983 indices.new_vector (sel, 2, nelt);
5984 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5986 for (i = 0, j = 0; i < nelt; i++)
5987 if (3 * i + k < 2 * nelt)
5988 sel[i] = i;
5989 else
5990 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5991 indices.new_vector (sel, 2, nelt);
5992 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5994 first_vect = dr_chain[0];
5995 second_vect = dr_chain[1];
5997 /* Create interleaving stmt (low part of):
5998 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5999 ...}> */
6000 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6001 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6002 second_vect, perm3_mask_low);
6003 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6005 /* Create interleaving stmt (high part of):
6006 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6007 ...}> */
6008 first_vect = data_ref;
6009 second_vect = dr_chain[2];
6010 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6011 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6012 second_vect, perm3_mask_high);
6013 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6014 (*result_chain)[k] = data_ref;
6017 else
6019 /* If length is not equal to 3 then only power of 2 is supported. */
6020 gcc_assert (pow2p_hwi (length));
6022 /* The encoding has a single stepped pattern. */
6023 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6024 vec_perm_builder sel (nelt, 1, 3);
6025 sel.quick_grow (3);
6026 for (i = 0; i < 3; ++i)
6027 sel[i] = i * 2;
6028 vec_perm_indices indices (sel, 2, nelt);
6029 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6031 for (i = 0; i < 3; ++i)
6032 sel[i] = i * 2 + 1;
6033 indices.new_vector (sel, 2, nelt);
6034 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6036 for (i = 0; i < log_length; i++)
6038 for (j = 0; j < length; j += 2)
6040 first_vect = dr_chain[j];
6041 second_vect = dr_chain[j+1];
6043 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6044 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6045 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6046 first_vect, second_vect,
6047 perm_mask_even);
6048 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6049 (*result_chain)[j/2] = data_ref;
6051 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6052 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6053 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6054 first_vect, second_vect,
6055 perm_mask_odd);
6056 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6057 (*result_chain)[j/2+length/2] = data_ref;
6059 memcpy (dr_chain.address (), result_chain->address (),
6060 length * sizeof (tree));
6065 /* Function vect_shift_permute_load_chain.
6067 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6068 sequence of stmts to reorder the input data accordingly.
6069 Return the final references for loads in RESULT_CHAIN.
6070 Return true if successful, false otherwise.
6072 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6073 The input is 3 vectors each containing 8 elements. We assign a
6074 number to each element, the input sequence is:
6076 1st vec: 0 1 2 3 4 5 6 7
6077 2nd vec: 8 9 10 11 12 13 14 15
6078 3rd vec: 16 17 18 19 20 21 22 23
6080 The output sequence should be:
6082 1st vec: 0 3 6 9 12 15 18 21
6083 2nd vec: 1 4 7 10 13 16 19 22
6084 3rd vec: 2 5 8 11 14 17 20 23
6086 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6088 First we shuffle all 3 vectors to get correct elements order:
6090 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6091 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6092 3rd vec: (16 19 22) (17 20 23) (18 21)
6094 Next we unite and shift vector 3 times:
6096 1st step:
6097 shift right by 6 the concatenation of:
6098 "1st vec" and "2nd vec"
6099 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6100 "2nd vec" and "3rd vec"
6101 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6102 "3rd vec" and "1st vec"
6103 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6104 | New vectors |
6106 So that now new vectors are:
6108 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6109 2nd vec: (10 13) (16 19 22) (17 20 23)
6110 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6112 2nd step:
6113 shift right by 5 the concatenation of:
6114 "1st vec" and "3rd vec"
6115 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6116 "2nd vec" and "1st vec"
6117 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6118 "3rd vec" and "2nd vec"
6119 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6120 | New vectors |
6122 So that now new vectors are:
6124 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6125 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6126 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6128 3rd step:
6129 shift right by 5 the concatenation of:
6130 "1st vec" and "1st vec"
6131 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6132 shift right by 3 the concatenation of:
6133 "2nd vec" and "2nd vec"
6134 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6135 | New vectors |
6137 So that now all vectors are READY:
6138 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6139 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6140 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6142 This algorithm is faster than one in vect_permute_load_chain if:
6143 1. "shift of a concatenation" is faster than general permutation.
6144 This is usually so.
6145 2. The TARGET machine can't execute vector instructions in parallel.
6146 This is because each step of the algorithm depends on previous.
6147 The algorithm in vect_permute_load_chain is much more parallel.
6149 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6152 static bool
6153 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6154 unsigned int length,
6155 stmt_vec_info stmt_info,
6156 gimple_stmt_iterator *gsi,
6157 vec<tree> *result_chain)
6159 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6160 tree perm2_mask1, perm2_mask2, perm3_mask;
6161 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6162 gimple *perm_stmt;
6164 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6165 unsigned int i;
6166 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6168 unsigned HOST_WIDE_INT nelt, vf;
6169 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6170 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6171 /* Not supported for variable-length vectors. */
6172 return false;
6174 vec_perm_builder sel (nelt, nelt, 1);
6175 sel.quick_grow (nelt);
6177 result_chain->quick_grow (length);
6178 memcpy (result_chain->address (), dr_chain.address (),
6179 length * sizeof (tree));
6181 if (pow2p_hwi (length) && vf > 4)
6183 unsigned int j, log_length = exact_log2 (length);
6184 for (i = 0; i < nelt / 2; ++i)
6185 sel[i] = i * 2;
6186 for (i = 0; i < nelt / 2; ++i)
6187 sel[nelt / 2 + i] = i * 2 + 1;
6188 vec_perm_indices indices (sel, 2, nelt);
6189 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6191 if (dump_enabled_p ())
6192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6193 "shuffle of 2 fields structure is not \
6194 supported by target\n");
6195 return false;
6197 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6199 for (i = 0; i < nelt / 2; ++i)
6200 sel[i] = i * 2 + 1;
6201 for (i = 0; i < nelt / 2; ++i)
6202 sel[nelt / 2 + i] = i * 2;
6203 indices.new_vector (sel, 2, nelt);
6204 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6206 if (dump_enabled_p ())
6207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6208 "shuffle of 2 fields structure is not \
6209 supported by target\n");
6210 return false;
6212 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6214 /* Generating permutation constant to shift all elements.
6215 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6216 for (i = 0; i < nelt; i++)
6217 sel[i] = nelt / 2 + i;
6218 indices.new_vector (sel, 2, nelt);
6219 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6221 if (dump_enabled_p ())
6222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6223 "shift permutation is not supported by target\n");
6224 return false;
6226 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6228 /* Generating permutation constant to select vector from 2.
6229 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6230 for (i = 0; i < nelt / 2; i++)
6231 sel[i] = i;
6232 for (i = nelt / 2; i < nelt; i++)
6233 sel[i] = nelt + i;
6234 indices.new_vector (sel, 2, nelt);
6235 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6237 if (dump_enabled_p ())
6238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6239 "select is not supported by target\n");
6240 return false;
6242 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6244 for (i = 0; i < log_length; i++)
6246 for (j = 0; j < length; j += 2)
6248 first_vect = dr_chain[j];
6249 second_vect = dr_chain[j + 1];
6251 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6252 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6253 first_vect, first_vect,
6254 perm2_mask1);
6255 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6256 vect[0] = data_ref;
6258 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6259 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6260 second_vect, second_vect,
6261 perm2_mask2);
6262 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6263 vect[1] = data_ref;
6265 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6266 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6267 vect[0], vect[1], shift1_mask);
6268 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6269 (*result_chain)[j/2 + length/2] = data_ref;
6271 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6272 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6273 vect[0], vect[1], select_mask);
6274 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6275 (*result_chain)[j/2] = data_ref;
6277 memcpy (dr_chain.address (), result_chain->address (),
6278 length * sizeof (tree));
6280 return true;
6282 if (length == 3 && vf > 2)
6284 unsigned int k = 0, l = 0;
6286 /* Generating permutation constant to get all elements in rigth order.
6287 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6288 for (i = 0; i < nelt; i++)
6290 if (3 * k + (l % 3) >= nelt)
6292 k = 0;
6293 l += (3 - (nelt % 3));
6295 sel[i] = 3 * k + (l % 3);
6296 k++;
6298 vec_perm_indices indices (sel, 2, nelt);
6299 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303 "shuffle of 3 fields structure is not \
6304 supported by target\n");
6305 return false;
6307 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6309 /* Generating permutation constant to shift all elements.
6310 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6311 for (i = 0; i < nelt; i++)
6312 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6313 indices.new_vector (sel, 2, nelt);
6314 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6316 if (dump_enabled_p ())
6317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6318 "shift permutation is not supported by target\n");
6319 return false;
6321 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6323 /* Generating permutation constant to shift all elements.
6324 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6325 for (i = 0; i < nelt; i++)
6326 sel[i] = 2 * (nelt / 3) + 1 + i;
6327 indices.new_vector (sel, 2, nelt);
6328 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6330 if (dump_enabled_p ())
6331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6332 "shift permutation is not supported by target\n");
6333 return false;
6335 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6337 /* Generating permutation constant to shift all elements.
6338 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6339 for (i = 0; i < nelt; i++)
6340 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6341 indices.new_vector (sel, 2, nelt);
6342 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6344 if (dump_enabled_p ())
6345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6346 "shift permutation is not supported by target\n");
6347 return false;
6349 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6351 /* Generating permutation constant to shift all elements.
6352 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6353 for (i = 0; i < nelt; i++)
6354 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6355 indices.new_vector (sel, 2, nelt);
6356 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6358 if (dump_enabled_p ())
6359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6360 "shift permutation is not supported by target\n");
6361 return false;
6363 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6365 for (k = 0; k < 3; k++)
6367 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6368 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6369 dr_chain[k], dr_chain[k],
6370 perm3_mask);
6371 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6372 vect[k] = data_ref;
6375 for (k = 0; k < 3; k++)
6377 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6378 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6379 vect[k % 3], vect[(k + 1) % 3],
6380 shift1_mask);
6381 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6382 vect_shift[k] = data_ref;
6385 for (k = 0; k < 3; k++)
6387 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6388 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6389 vect_shift[(4 - k) % 3],
6390 vect_shift[(3 - k) % 3],
6391 shift2_mask);
6392 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6393 vect[k] = data_ref;
6396 (*result_chain)[3 - (nelt % 3)] = vect[2];
6398 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6399 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6400 vect[0], shift3_mask);
6401 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6402 (*result_chain)[nelt % 3] = data_ref;
6404 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6405 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6406 vect[1], shift4_mask);
6407 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6408 (*result_chain)[0] = data_ref;
6409 return true;
6411 return false;
6414 /* Function vect_transform_grouped_load.
6416 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6417 to perform their permutation and ascribe the result vectorized statements to
6418 the scalar statements.
6421 void
6422 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6423 vec<tree> dr_chain,
6424 int size, gimple_stmt_iterator *gsi)
6426 machine_mode mode;
6427 vec<tree> result_chain = vNULL;
6429 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6430 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6431 vectors, that are ready for vector computation. */
6432 result_chain.create (size);
6434 /* If reassociation width for vector type is 2 or greater target machine can
6435 execute 2 or more vector instructions in parallel. Otherwise try to
6436 get chain for loads group using vect_shift_permute_load_chain. */
6437 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6438 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6439 || pow2p_hwi (size)
6440 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6441 gsi, &result_chain))
6442 vect_permute_load_chain (vinfo, dr_chain,
6443 size, stmt_info, gsi, &result_chain);
6444 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6445 result_chain.release ();
6448 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6449 generated as part of the vectorization of STMT_INFO. Assign the statement
6450 for each vector to the associated scalar statement. */
6452 void
6453 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6454 vec<tree> result_chain)
6456 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6457 unsigned int i, gap_count;
6458 tree tmp_data_ref;
6460 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6461 Since we scan the chain starting from it's first node, their order
6462 corresponds the order of data-refs in RESULT_CHAIN. */
6463 stmt_vec_info next_stmt_info = first_stmt_info;
6464 gap_count = 1;
6465 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6467 if (!next_stmt_info)
6468 break;
6470 /* Skip the gaps. Loads created for the gaps will be removed by dead
6471 code elimination pass later. No need to check for the first stmt in
6472 the group, since it always exists.
6473 DR_GROUP_GAP is the number of steps in elements from the previous
6474 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6475 correspond to the gaps. */
6476 if (next_stmt_info != first_stmt_info
6477 && gap_count < DR_GROUP_GAP (next_stmt_info))
6479 gap_count++;
6480 continue;
6483 /* ??? The following needs cleanup after the removal of
6484 DR_GROUP_SAME_DR_STMT. */
6485 if (next_stmt_info)
6487 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6488 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6489 copies, and we put the new vector statement last. */
6490 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6492 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6493 gap_count = 1;
6498 /* Function vect_force_dr_alignment_p.
6500 Returns whether the alignment of a DECL can be forced to be aligned
6501 on ALIGNMENT bit boundary. */
6503 bool
6504 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6506 if (!VAR_P (decl))
6507 return false;
6509 if (decl_in_symtab_p (decl)
6510 && !symtab_node::get (decl)->can_increase_alignment_p ())
6511 return false;
6513 if (TREE_STATIC (decl))
6514 return (known_le (alignment,
6515 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6516 else
6517 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6521 /* Return whether the data reference DR_INFO is supported with respect to its
6522 alignment.
6523 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6524 it is aligned, i.e., check if it is possible to vectorize it with different
6525 alignment. */
6527 enum dr_alignment_support
6528 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6529 bool check_aligned_accesses)
6531 data_reference *dr = dr_info->dr;
6532 stmt_vec_info stmt_info = dr_info->stmt;
6533 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6534 machine_mode mode = TYPE_MODE (vectype);
6535 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6536 class loop *vect_loop = NULL;
6537 bool nested_in_vect_loop = false;
6539 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6540 return dr_aligned;
6542 /* For now assume all conditional loads/stores support unaligned
6543 access without any special code. */
6544 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6545 if (gimple_call_internal_p (stmt)
6546 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6547 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6548 return dr_unaligned_supported;
6550 if (loop_vinfo)
6552 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6553 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6556 /* Possibly unaligned access. */
6558 /* We can choose between using the implicit realignment scheme (generating
6559 a misaligned_move stmt) and the explicit realignment scheme (generating
6560 aligned loads with a REALIGN_LOAD). There are two variants to the
6561 explicit realignment scheme: optimized, and unoptimized.
6562 We can optimize the realignment only if the step between consecutive
6563 vector loads is equal to the vector size. Since the vector memory
6564 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6565 is guaranteed that the misalignment amount remains the same throughout the
6566 execution of the vectorized loop. Therefore, we can create the
6567 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6568 at the loop preheader.
6570 However, in the case of outer-loop vectorization, when vectorizing a
6571 memory access in the inner-loop nested within the LOOP that is now being
6572 vectorized, while it is guaranteed that the misalignment of the
6573 vectorized memory access will remain the same in different outer-loop
6574 iterations, it is *not* guaranteed that is will remain the same throughout
6575 the execution of the inner-loop. This is because the inner-loop advances
6576 with the original scalar step (and not in steps of VS). If the inner-loop
6577 step happens to be a multiple of VS, then the misalignment remains fixed
6578 and we can use the optimized realignment scheme. For example:
6580 for (i=0; i<N; i++)
6581 for (j=0; j<M; j++)
6582 s += a[i+j];
6584 When vectorizing the i-loop in the above example, the step between
6585 consecutive vector loads is 1, and so the misalignment does not remain
6586 fixed across the execution of the inner-loop, and the realignment cannot
6587 be optimized (as illustrated in the following pseudo vectorized loop):
6589 for (i=0; i<N; i+=4)
6590 for (j=0; j<M; j++){
6591 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6592 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6593 // (assuming that we start from an aligned address).
6596 We therefore have to use the unoptimized realignment scheme:
6598 for (i=0; i<N; i+=4)
6599 for (j=k; j<M; j+=4)
6600 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6601 // that the misalignment of the initial address is
6602 // 0).
6604 The loop can then be vectorized as follows:
6606 for (k=0; k<4; k++){
6607 rt = get_realignment_token (&vp[k]);
6608 for (i=0; i<N; i+=4){
6609 v1 = vp[i+k];
6610 for (j=k; j<M; j+=4){
6611 v2 = vp[i+j+VS-1];
6612 va = REALIGN_LOAD <v1,v2,rt>;
6613 vs += va;
6614 v1 = v2;
6617 } */
6619 if (DR_IS_READ (dr))
6621 bool is_packed = false;
6622 tree type = (TREE_TYPE (DR_REF (dr)));
6624 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6625 && (!targetm.vectorize.builtin_mask_for_load
6626 || targetm.vectorize.builtin_mask_for_load ()))
6628 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6630 /* If we are doing SLP then the accesses need not have the
6631 same alignment, instead it depends on the SLP group size. */
6632 if (loop_vinfo
6633 && STMT_SLP_TYPE (stmt_info)
6634 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6635 * (DR_GROUP_SIZE
6636 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6637 TYPE_VECTOR_SUBPARTS (vectype)))
6639 else if (!loop_vinfo
6640 || (nested_in_vect_loop
6641 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6642 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6643 return dr_explicit_realign;
6644 else
6645 return dr_explicit_realign_optimized;
6647 if (!known_alignment_for_access_p (dr_info))
6648 is_packed = not_size_aligned (DR_REF (dr));
6650 if (targetm.vectorize.support_vector_misalignment
6651 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6652 /* Can't software pipeline the loads, but can at least do them. */
6653 return dr_unaligned_supported;
6655 else
6657 bool is_packed = false;
6658 tree type = (TREE_TYPE (DR_REF (dr)));
6660 if (!known_alignment_for_access_p (dr_info))
6661 is_packed = not_size_aligned (DR_REF (dr));
6663 if (targetm.vectorize.support_vector_misalignment
6664 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6665 return dr_unaligned_supported;
6668 /* Unsupported. */
6669 return dr_unaligned_unsupported;