gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode, array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
96 return true;
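/* Illustration, a minimal sketch in plain C (hypothetical code, not part of
   GCC's API): with COUNT == 2, a load/store-lanes operation covers
   interleaved accesses such as

     struct point { float x, y; };

     void
     scale (struct point *p, float s, int n)
     {
       for (int i = 0; i < n; i++)
         {
           p[i].x *= s;
           p[i].y *= s;
         }
     }

   where the x and y fields form lanes 0 and 1.  The check above asks
   whether the target has a mode that holds COUNT vectors of VECTYPE and an
   optab that loads and de-interleaves (or interleaves and stores) them in
   a single operation, e.g. AArch64 LD2/ST2.  */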
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
160 if (i < gimple_call_num_args (call))
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
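/* Illustration (hypothetical loop, not taken from any testcase): in

     void
     widen (int *out, short *in, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = (int) in[i] + 1;
     }

   the lhs type is 'int' (4 bytes) but the promotion reads 'short'
   (2 bytes).  The vectorization factor is determined by 'short' (8
   elements per 128-bit vector) rather than 'int' (4), which is why the
   rhs of promotions is inspected here as well.  */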
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
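/* Illustration, a minimal sketch (plain C, hypothetical pointers a and b of
   the same element type, each accessed for n elements; not the exact code
   the vectorizer emits): a DDR recorded by this function results in a
   versioning test of roughly this shape:

     if (a + n <= b || b + n <= a)
       ...vectorized loop, which assumes the ranges do not overlap...
     else
       ...unmodified scalar loop...

   The number of such checks is bounded by
   --param vect-max-version-for-alias-checks, as tested above.  */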
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
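/* Illustration (hypothetical loop): in

     for (int i = 0; i < n; i++)
       a[i * s] += b[i];

   the step of the access to a[] is zero whenever s == 0, in which case
   every iteration reads and writes the same element and the loop cannot be
   vectorized as-is.  Recording 's' here leads to a runtime check that it
   is nonzero before the vectorized copy of the loop is entered.  */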
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
235 /* STMT_A and STMT_B belong to overlapping groups. All loads are
236 emitted at the position of the first scalar load.
237 Stores in a group are emitted at the position of the last scalar store.
238 Compute that position and check whether the resulting order matches
239 the current one. */
240 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
241 if (il_a)
243 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
244 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
245 s = DR_GROUP_NEXT_ELEMENT (s))
246 il_a = get_later_stmt (il_a, s);
247 else /* DR_IS_READ */
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 if (get_later_stmt (il_a, s) == il_a)
251 il_a = s;
253 else
254 il_a = stmtinfo_a;
255 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
256 if (il_b)
258 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
259 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
260 s = DR_GROUP_NEXT_ELEMENT (s))
261 il_b = get_later_stmt (il_b, s);
262 else /* DR_IS_READ */
263 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
264 s = DR_GROUP_NEXT_ELEMENT (s))
265 if (get_later_stmt (il_b, s) == il_b)
266 il_b = s;
268 else
269 il_b = stmtinfo_b;
270 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
271 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
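/* Illustration (hypothetical statements): for a grouped load and a grouped
   store of the same size in one iteration,

     x0 = a[i];         <- the vector load of a[i], a[i+1] is emitted here
     b[i] = y0;
     x1 = a[i + 1];
     b[i + 1] = y1;     <- the vector store of b[i], b[i+1] is emitted here

   the vector load moves to the first scalar load and the vector store to
   the last scalar store, so the function compares those two positions with
   the original order of the scalar accesses.  */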
274 /* A subroutine of vect_analyze_data_ref_dependence. Handle
275 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
276 distances. These distances are conservatively correct but they don't
277 reflect a guaranteed dependence.
279 Return true if this function does all the work necessary to avoid
280 an alias or false if the caller should use the dependence distances
281 to limit the vectorization factor in the usual way. LOOP_DEPTH is
282 the depth of the loop described by LOOP_VINFO and the other arguments
283 are as for vect_analyze_data_ref_dependence. */
285 static bool
286 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
287 loop_vec_info loop_vinfo,
288 int loop_depth, unsigned int *max_vf)
290 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
291 lambda_vector dist_v;
292 unsigned int i;
293 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
295 int dist = dist_v[loop_depth];
296 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
298 /* If the user asserted safelen >= DIST consecutive iterations
299 can be executed concurrently, assume independence.
301 ??? An alternative would be to add the alias check even
302 in this case, and vectorize the fallback loop with the
303 maximum VF set to safelen. However, if the user has
304 explicitly given a length, it's less likely that that
305 would be a win. */
306 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
308 if ((unsigned int) loop->safelen < *max_vf)
309 *max_vf = loop->safelen;
310 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
311 continue;
314 /* For dependence distances of 2 or more, we have the option
315 of limiting VF or checking for an alias at runtime.
316 Prefer to check at runtime if we can, to avoid limiting
317 the VF unnecessarily when the bases are in fact independent.
319 Note that the alias checks will be removed if the VF ends up
320 being small enough. */
321 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
322 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
323 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
324 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
325 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
328 return true;
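/* Illustration (hypothetical loop): with

     #pragma omp simd safelen(8)
     for (int i = 0; i < n; i++)
       a[i + k] = a[i] * c;

   the user asserts that up to 8 consecutive iterations may execute
   concurrently, so a conservatively computed dependence distance of, say,
   4 can be ignored here as long as *max_vf is clamped to 8.  */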
332 /* Function vect_analyze_data_ref_dependence.
334 FIXME: I needed to change the sense of the returned flag.
336 Return FALSE if there (might) exist a dependence between a memory-reference
337 DRA and a memory-reference DRB. When versioning for alias may check a
338 dependence at run-time, return TRUE. Adjust *MAX_VF according to
339 the data dependence. */
341 static opt_result
342 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 unsigned int *max_vf)
346 unsigned int i;
347 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
348 struct data_reference *dra = DDR_A (ddr);
349 struct data_reference *drb = DDR_B (ddr);
350 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
351 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
352 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
353 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
354 lambda_vector dist_v;
355 unsigned int loop_depth;
357 /* In loop analysis all data references should be vectorizable. */
358 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
359 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
360 gcc_unreachable ();
362 /* Independent data accesses. */
363 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
364 return opt_result::success ();
366 if (dra == drb
367 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
368 return opt_result::success ();
370 /* We do not have to consider dependences between accesses that belong
371 to the same group, unless the stride could be smaller than the
372 group size. */
373 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
374 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
375 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
376 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
377 return opt_result::success ();
379 /* Even if we have an anti-dependence then, as the vectorized loop covers at
380 least two scalar iterations, there is always also a true dependence.
381 As the vectorizer does not re-order loads and stores we can ignore
382 the anti-dependence if TBAA can disambiguate both DRs similar to the
383 case with known negative distance anti-dependences (positive
384 distance anti-dependences would violate TBAA constraints). */
385 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
386 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
387 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
388 get_alias_set (DR_REF (drb))))
389 return opt_result::success ();
391 /* Unknown data dependence. */
392 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
394 /* If user asserted safelen consecutive iterations can be
395 executed concurrently, assume independence. */
396 if (loop->safelen >= 2)
398 if ((unsigned int) loop->safelen < *max_vf)
399 *max_vf = loop->safelen;
400 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
401 return opt_result::success ();
404 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
405 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
406 return opt_result::failure_at
407 (stmtinfo_a->stmt,
408 "versioning for alias not supported for: "
409 "can't determine dependence between %T and %T\n",
410 DR_REF (dra), DR_REF (drb));
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
414 "versioning for alias required: "
415 "can't determine dependence between %T and %T\n",
416 DR_REF (dra), DR_REF (drb));
418 /* Add to list of ddrs that need to be tested at run-time. */
419 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
422 /* Known data dependence. */
423 if (DDR_NUM_DIST_VECTS (ddr) == 0)
425 /* If user asserted safelen consecutive iterations can be
426 executed concurrently, assume independence. */
427 if (loop->safelen >= 2)
429 if ((unsigned int) loop->safelen < *max_vf)
430 *max_vf = loop->safelen;
431 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
432 return opt_result::success ();
435 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
436 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
437 return opt_result::failure_at
438 (stmtinfo_a->stmt,
439 "versioning for alias not supported for: "
440 "bad dist vector for %T and %T\n",
441 DR_REF (dra), DR_REF (drb));
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
445 "versioning for alias required: "
446 "bad dist vector for %T and %T\n",
447 DR_REF (dra), DR_REF (drb));
448 /* Add to list of ddrs that need to be tested at run-time. */
449 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
452 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
455 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
456 loop_depth, max_vf))
457 return opt_result::success ();
459 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461 int dist = dist_v[loop_depth];
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "dependence distance = %d.\n", dist);
467 if (dist == 0)
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "dependence distance == 0 between %T and %T\n",
472 DR_REF (dra), DR_REF (drb));
474 /* When we perform grouped accesses and perform implicit CSE
475 by detecting equal accesses and doing disambiguation with
476 runtime alias tests like for
477 .. = a[i];
478 .. = a[i+1];
479 a[i] = ..;
480 a[i+1] = ..;
481 *p = ..;
482 .. = a[i];
483 .. = a[i+1];
484 where we will end up loading { a[i], a[i+1] } once, make
485 sure that inserting group loads before the first load and
486 stores after the last store will do the right thing.
487 Similar for groups like
488 a[i] = ...;
489 ... = a[i];
490 a[i+1] = ...;
491 where loads from the group interleave with the store. */
492 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
493 return opt_result::failure_at (stmtinfo_a->stmt,
494 "READ_WRITE dependence"
495 " in interleaving.\n");
497 if (loop->safelen < 2)
499 tree indicator = dr_zero_step_indicator (dra);
500 if (!indicator || integer_zerop (indicator))
501 return opt_result::failure_at (stmtinfo_a->stmt,
502 "access also has a zero step\n");
503 else if (TREE_CODE (indicator) != INTEGER_CST)
504 vect_check_nonzero_value (loop_vinfo, indicator);
506 continue;
509 if (dist > 0 && DDR_REVERSED_P (ddr))
511 /* If DDR_REVERSED_P the order of the data-refs in DDR was
512 reversed (to make distance vector positive), and the actual
513 distance is negative. */
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "dependence distance negative.\n");
517 /* When doing outer loop vectorization, we need to check if there is
518 a backward dependence at the inner loop level if the dependence
519 at the outer loop is reversed. See PR81740. */
520 if (nested_in_vect_loop_p (loop, stmtinfo_a)
521 || nested_in_vect_loop_p (loop, stmtinfo_b))
523 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
524 DDR_LOOP_NEST (ddr));
525 if (dist_v[inner_depth] < 0)
526 return opt_result::failure_at (stmtinfo_a->stmt,
527 "not vectorized, dependence "
528 "between data-refs %T and %T\n",
529 DR_REF (dra), DR_REF (drb));
531 /* Record a negative dependence distance to later limit the
532 amount of stmt copying / unrolling we can perform.
533 Only need to handle read-after-write dependence. */
534 if (DR_IS_READ (drb)
535 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
536 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
537 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
538 continue;
541 unsigned int abs_dist = abs (dist);
542 if (abs_dist >= 2 && abs_dist < *max_vf)
544 /* The dependence distance requires reduction of the maximal
545 vectorization factor. */
546 *max_vf = abs_dist;
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location,
549 "adjusting maximal vectorization factor to %i\n",
550 *max_vf);
553 if (abs_dist >= *max_vf)
555 /* Dependence distance does not create dependence, as far as
556 vectorization is concerned, in this case. */
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "dependence distance >= VF.\n");
560 continue;
563 return opt_result::failure_at (stmtinfo_a->stmt,
564 "not vectorized, possible dependence "
565 "between data-refs %T and %T\n",
566 DR_REF (dra), DR_REF (drb));
569 return opt_result::success ();
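/* Illustration (hypothetical loop): in

     for (int i = 0; i < n; i++)
       a[i + 3] = a[i] * c;

   each iteration reads a value written three iterations earlier, so the
   dependence distance is 3 and at most 3 iterations can run in parallel.
   The code above therefore reduces *max_vf to 3 (or records a runtime
   alias check instead when the accesses may turn out to be independent).  */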
572 /* Function vect_analyze_data_ref_dependences.
574 Examine all the data references in the loop, and make sure there do not
575 exist any data dependences between them. Set *MAX_VF according to
576 the maximum vectorization factor the data dependences allow. */
578 opt_result
579 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
580 unsigned int *max_vf)
582 unsigned int i;
583 struct data_dependence_relation *ddr;
585 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589 LOOP_VINFO_DDRS (loop_vinfo)
590 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
591 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
592 /* We do not need read-read dependences. */
593 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
594 &LOOP_VINFO_DDRS (loop_vinfo),
595 LOOP_VINFO_LOOP_NEST (loop_vinfo),
596 false);
597 gcc_assert (res);
600 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
602 /* For epilogues we either have no aliases or alias versioning
 603 was applied to the original loop. Therefore we may just get max_vf
 604 using the VF of the original loop. */
605 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
606 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
607 else
608 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
610 opt_result res
611 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
612 if (!res)
613 return res;
616 return opt_result::success ();
620 /* Function vect_slp_analyze_data_ref_dependence.
622 Return TRUE if there (might) exist a dependence between a memory-reference
623 DRA and a memory-reference DRB for VINFO. When versioning for alias
624 may check a dependence at run-time, return FALSE. */
627 static bool
628 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
629 struct data_dependence_relation *ddr)
631 struct data_reference *dra = DDR_A (ddr);
632 struct data_reference *drb = DDR_B (ddr);
633 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
634 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
636 /* We need to check dependences of statements marked as unvectorizable
637 as well, they still can prohibit vectorization. */
639 /* Independent data accesses. */
640 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
641 return false;
643 if (dra == drb)
644 return false;
646 /* Read-read is OK. */
647 if (DR_IS_READ (dra) && DR_IS_READ (drb))
648 return false;
650 /* If dra and drb are part of the same interleaving chain consider
651 them independent. */
652 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
653 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
654 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
655 return false;
657 /* Unknown data dependence. */
658 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
662 "can't determine dependence between %T and %T\n",
663 DR_REF (dra), DR_REF (drb));
665 else if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE, vect_location,
667 "determined dependence between %T and %T\n",
668 DR_REF (dra), DR_REF (drb));
670 return true;
674 /* Analyze dependences involved in the transform of SLP NODE. STORES
675 contain the vector of scalar stores of this instance if we are
676 disambiguating the loads. */
678 static bool
679 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
680 vec<stmt_vec_info> stores,
681 stmt_vec_info last_store_info)
683 /* This walks over all stmts involved in the SLP load/store done
684 in NODE verifying we can sink them up to the last stmt in the
685 group. */
686 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
688 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
689 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
691 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
692 if (access_info == last_access_info)
693 continue;
694 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
695 ao_ref ref;
696 bool ref_initialized_p = false;
697 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
698 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
700 gimple *stmt = gsi_stmt (gsi);
701 if (! gimple_vuse (stmt))
702 continue;
704 /* If we couldn't record a (single) data reference for this
705 stmt we have to resort to the alias oracle. */
706 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
707 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
708 if (!dr_b)
710 /* We are moving a store - this means
711 we cannot use TBAA for disambiguation. */
712 if (!ref_initialized_p)
713 ao_ref_init (&ref, DR_REF (dr_a));
714 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
715 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
716 return false;
717 continue;
720 bool dependent = false;
721 /* If we run into a store of this same instance (we've just
722 marked those) then delay dependence checking until we run
723 into the last store because this is where it will have
724 been sunk to (and we verify if we can do that as well). */
725 if (gimple_visited_p (stmt))
727 if (stmt_info != last_store_info)
728 continue;
729 unsigned i;
730 stmt_vec_info store_info;
731 FOR_EACH_VEC_ELT (stores, i, store_info)
733 data_reference *store_dr
734 = STMT_VINFO_DATA_REF (store_info);
735 ddr_p ddr = initialize_data_dependence_relation
736 (dr_a, store_dr, vNULL);
737 dependent
738 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
739 free_dependence_relation (ddr);
740 if (dependent)
741 break;
744 else
746 ddr_p ddr = initialize_data_dependence_relation (dr_a,
747 dr_b, vNULL);
748 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
749 free_dependence_relation (ddr);
751 if (dependent)
752 return false;
756 else /* DR_IS_READ */
758 stmt_vec_info first_access_info
759 = vect_find_first_scalar_stmt_in_slp (node);
760 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
762 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
763 if (access_info == first_access_info)
764 continue;
765 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
766 ao_ref ref;
767 bool ref_initialized_p = false;
768 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
769 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
771 gimple *stmt = gsi_stmt (gsi);
772 if (! gimple_vdef (stmt))
773 continue;
775 /* If we couldn't record a (single) data reference for this
776 stmt we have to resort to the alias oracle. */
777 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
778 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
779 if (!dr_b)
781 /* We are hoisting a load - this means we can use
782 TBAA for disambiguation. */
783 if (!ref_initialized_p)
784 ao_ref_init (&ref, DR_REF (dr_a));
785 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
786 return false;
787 continue;
790 bool dependent = false;
791 /* If we run into a store of this same instance (we've just
792 marked those) then delay dependence checking until we run
793 into the last store because this is where it will have
794 been sunk to (and we verify if we can do that as well). */
795 if (gimple_visited_p (stmt))
797 if (stmt_info != last_store_info)
798 continue;
799 unsigned i;
800 stmt_vec_info store_info;
801 FOR_EACH_VEC_ELT (stores, i, store_info)
803 data_reference *store_dr
804 = STMT_VINFO_DATA_REF (store_info);
805 ddr_p ddr = initialize_data_dependence_relation
806 (dr_a, store_dr, vNULL);
807 dependent
808 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
809 free_dependence_relation (ddr);
810 if (dependent)
811 break;
814 else
816 ddr_p ddr = initialize_data_dependence_relation (dr_a,
817 dr_b, vNULL);
818 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
819 free_dependence_relation (ddr);
821 if (dependent)
822 return false;
826 return true;
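/* Illustration (hypothetical basic block): for an SLP store group
   a[0], a[1],

     a[0] = x0;
     t = b[2];       <- intervening load that must not alias a[0..1]
     a[1] = x1;      <- the vector store is emitted here

   the walk above visits every statement between a group member and the
   last member of the group: sinking 'a[0] = x0' past the load of b[2] is
   valid only if the two references cannot overlap, which is what the
   dependence and alias-oracle queries check.  */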
830 /* Function vect_slp_analyze_instance_dependence.
 832 Examine all the data references in the SLP instance, and make sure there
 833 do not exist any data dependences between them. */
836 bool
837 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
839 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
841 /* The stores of this instance are at the root of the SLP tree. */
842 slp_tree store = SLP_INSTANCE_TREE (instance);
843 if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store)))
844 store = NULL;
846 /* Verify we can sink stores to the vectorized stmt insert location. */
847 stmt_vec_info last_store_info = NULL;
848 if (store)
850 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
851 return false;
853 /* Mark stores in this instance and remember the last one. */
854 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
855 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
856 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
859 bool res = true;
861 /* Verify we can sink loads to the vectorized stmt insert location,
862 special-casing stores of this instance. */
863 slp_tree load;
864 unsigned int i;
865 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
866 if (! vect_slp_analyze_node_dependences (vinfo, load,
867 store
868 ? SLP_TREE_SCALAR_STMTS (store)
869 : vNULL, last_store_info))
871 res = false;
872 break;
875 /* Unset the visited flag. */
876 if (store)
877 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
878 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
880 return res;
883 /* Record the base alignment guarantee given by DRB, which occurs
884 in STMT_INFO. */
886 static void
887 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
888 innermost_loop_behavior *drb)
890 bool existed;
891 innermost_loop_behavior *&entry
892 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
893 if (!existed || entry->base_alignment < drb->base_alignment)
895 entry = drb;
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE, vect_location,
898 "recording new base alignment for %T\n"
899 " alignment: %d\n"
900 " misalignment: %d\n"
901 " based on: %G",
902 drb->base_address,
903 drb->base_alignment,
904 drb->base_misalignment,
905 stmt_info->stmt);
909 /* If the region we're going to vectorize is reached, all unconditional
910 data references occur at least once. We can therefore pool the base
911 alignment guarantees from each unconditional reference. Do this by
912 going through all the data references in VINFO and checking whether
913 the containing statement makes the reference unconditionally. If so,
914 record the alignment of the base address in VINFO so that it can be
915 used for all other references with the same base. */
917 void
918 vect_record_base_alignments (vec_info *vinfo)
920 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
921 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
922 data_reference *dr;
923 unsigned int i;
924 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
926 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
927 stmt_vec_info stmt_info = dr_info->stmt;
928 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
929 && STMT_VINFO_VECTORIZABLE (stmt_info)
930 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
932 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
934 /* If DR is nested in the loop that is being vectorized, we can also
935 record the alignment of the base wrt the outer loop. */
936 if (loop && nested_in_vect_loop_p (loop, stmt_info))
937 vect_record_base_alignment
938 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
943 /* Return the target alignment for the vectorized form of DR_INFO. */
945 static poly_uint64
946 vect_calculate_target_alignment (dr_vec_info *dr_info)
948 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
949 return targetm.vectorize.preferred_vector_alignment (vectype);
952 /* Function vect_compute_data_ref_alignment
954 Compute the misalignment of the data reference DR_INFO.
956 Output:
957 1. DR_MISALIGNMENT (DR_INFO) is defined.
959 FOR NOW: No analysis is actually performed. Misalignment is calculated
960 only for trivial cases. TODO. */
962 static void
963 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
965 stmt_vec_info stmt_info = dr_info->stmt;
966 vec_base_alignments *base_alignments = &vinfo->base_alignments;
967 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
968 class loop *loop = NULL;
969 tree ref = DR_REF (dr_info->dr);
970 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
972 if (dump_enabled_p ())
973 dump_printf_loc (MSG_NOTE, vect_location,
974 "vect_compute_data_ref_alignment:\n");
976 if (loop_vinfo)
977 loop = LOOP_VINFO_LOOP (loop_vinfo);
979 /* Initialize misalignment to unknown. */
980 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
982 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
983 return;
985 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
986 bool step_preserves_misalignment_p;
988 poly_uint64 vector_alignment
989 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
990 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
992 /* If the main loop has peeled for alignment we have no way of knowing
993 whether the data accesses in the epilogues are aligned. We can't at
994 compile time answer the question whether we have entered the main loop or
995 not. Fixes PR 92351. */
996 if (loop_vinfo)
998 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
999 if (orig_loop_vinfo
1000 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1001 return;
1004 unsigned HOST_WIDE_INT vect_align_c;
1005 if (!vector_alignment.is_constant (&vect_align_c))
1006 return;
1008 /* No step for BB vectorization. */
1009 if (!loop)
1011 gcc_assert (integer_zerop (drb->step));
1012 step_preserves_misalignment_p = true;
1015 /* In case the dataref is in an inner-loop of the loop that is being
1016 vectorized (LOOP), we use the base and misalignment information
1017 relative to the outer-loop (LOOP). This is ok only if the misalignment
1018 stays the same throughout the execution of the inner-loop, which is why
1019 we have to check that the stride of the dataref in the inner-loop evenly
1020 divides by the vector alignment. */
1021 else if (nested_in_vect_loop_p (loop, stmt_info))
1023 step_preserves_misalignment_p
1024 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1026 if (dump_enabled_p ())
1028 if (step_preserves_misalignment_p)
1029 dump_printf_loc (MSG_NOTE, vect_location,
1030 "inner step divides the vector alignment.\n");
1031 else
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "inner step doesn't divide the vector"
1034 " alignment.\n");
1038 /* Similarly we can only use base and misalignment information relative to
1039 an innermost loop if the misalignment stays the same throughout the
1040 execution of the loop. As above, this is the case if the stride of
1041 the dataref evenly divides by the alignment. */
1042 else
1044 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1045 step_preserves_misalignment_p
1046 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1048 if (!step_preserves_misalignment_p && dump_enabled_p ())
1049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1050 "step doesn't divide the vector alignment.\n");
1053 unsigned int base_alignment = drb->base_alignment;
1054 unsigned int base_misalignment = drb->base_misalignment;
1056 /* Calculate the maximum of the pooled base address alignment and the
1057 alignment that we can compute for DR itself. */
1058 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1059 if (entry && base_alignment < (*entry)->base_alignment)
1061 base_alignment = (*entry)->base_alignment;
1062 base_misalignment = (*entry)->base_misalignment;
1065 if (drb->offset_alignment < vect_align_c
1066 || !step_preserves_misalignment_p
1067 /* We need to know whether the step wrt the vectorized loop is
1068 negative when computing the starting misalignment below. */
1069 || TREE_CODE (drb->step) != INTEGER_CST)
1071 if (dump_enabled_p ())
1072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1073 "Unknown alignment for access: %T\n", ref);
1074 return;
1077 if (base_alignment < vect_align_c)
1079 unsigned int max_alignment;
1080 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1081 if (max_alignment < vect_align_c
1082 || !vect_can_force_dr_alignment_p (base,
1083 vect_align_c * BITS_PER_UNIT))
1085 if (dump_enabled_p ())
1086 dump_printf_loc (MSG_NOTE, vect_location,
1087 "can't force alignment of ref: %T\n", ref);
1088 return;
1091 /* Force the alignment of the decl.
1092 NOTE: This is the only change to the code we make during
1093 the analysis phase, before deciding to vectorize the loop. */
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE, vect_location,
1096 "force alignment of %T\n", ref);
1098 dr_info->base_decl = base;
1099 dr_info->base_misaligned = true;
1100 base_misalignment = 0;
1102 poly_int64 misalignment
1103 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1105 /* If this is a backward running DR then first access in the larger
1106 vectype actually is N-1 elements before the address in the DR.
1107 Adjust misalign accordingly. */
1108 if (tree_int_cst_sgn (drb->step) < 0)
1109 /* PLUS because STEP is negative. */
1110 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1111 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1113 unsigned int const_misalignment;
1114 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Non-constant misalignment for access: %T\n", ref);
1119 return;
1122 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1124 if (dump_enabled_p ())
1125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1126 "misalign = %d bytes of ref %T\n",
1127 DR_MISALIGNMENT (dr_info), ref);
1129 return;
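/* Illustration, a worked example with hypothetical numbers: assume a
   16-byte target alignment, base_misalignment == 4 and a constant DR_INIT
   of 24 bytes.  Then

     misalignment = (4 + 24) % 16 = 12 bytes,

   so for 4-byte elements the access becomes aligned after peeling
   (16 - 12) / 4 == 1 scalar iteration; the peeling decision itself is made
   later, in vect_enhance_data_refs_alignment.  */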
1132 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1133 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1134 is made aligned via peeling. */
1136 static bool
1137 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1138 dr_vec_info *dr_peel_info)
1140 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1141 DR_TARGET_ALIGNMENT (dr_info)))
1143 poly_offset_int diff
1144 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1145 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1146 if (known_eq (diff, 0)
1147 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1148 return true;
1150 return false;
1153 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1154 aligned via peeling. */
1156 static bool
1157 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1158 dr_vec_info *dr_peel_info)
1160 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1161 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1162 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1163 DR_OFFSET (dr_peel_info->dr), 0)
1164 || !operand_equal_p (DR_STEP (dr_info->dr),
1165 DR_STEP (dr_peel_info->dr), 0))
1166 return false;
1168 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1171 /* Function vect_update_misalignment_for_peel.
1172 Sets DR_INFO's misalignment
1173 - to 0 if it has the same alignment as DR_PEEL_INFO,
1174 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1175 - to -1 (unknown) otherwise.
1177 DR_INFO - the data reference whose misalignment is to be adjusted.
1178 DR_PEEL_INFO - the data reference whose misalignment is being made
1179 zero in the vector loop by the peel.
1180 NPEEL - the number of iterations in the peel loop if the misalignment
1181 of DR_PEEL_INFO is known at compile time. */
1183 static void
1184 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1185 dr_vec_info *dr_peel_info, int npeel)
1187 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1188 it is aligned in the vector loop. */
1189 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1191 gcc_assert (!known_alignment_for_access_p (dr_info)
1192 || !known_alignment_for_access_p (dr_peel_info)
1193 || (DR_MISALIGNMENT (dr_info)
1194 == DR_MISALIGNMENT (dr_peel_info)));
1195 SET_DR_MISALIGNMENT (dr_info, 0);
1196 return;
1199 unsigned HOST_WIDE_INT alignment;
1200 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1201 && known_alignment_for_access_p (dr_info)
1202 && known_alignment_for_access_p (dr_peel_info))
1204 int misal = DR_MISALIGNMENT (dr_info);
1205 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1206 misal &= alignment - 1;
1207 SET_DR_MISALIGNMENT (dr_info, misal);
1208 return;
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1213 "to unknown (-1).\n");
1214 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
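/* Illustration with hypothetical numbers: for a 16-byte target alignment,
   a DR with known misalignment 8 and a 4-byte step, peeling npeel == 1
   iteration gives

     (8 + 1 * 4) & (16 - 1) == 12,

   which is exactly the masking arithmetic performed above.  */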
1217 /* Return true if alignment is relevant for DR_INFO. */
1219 static bool
1220 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1222 stmt_vec_info stmt_info = dr_info->stmt;
1224 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1225 return false;
1227 /* For interleaving, only the alignment of the first access matters. */
1228 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1229 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1230 return false;
1232 /* Scatter-gather and invariant accesses continue to address individual
1233 scalars, so vector-level alignment is irrelevant. */
1234 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1235 || integer_zerop (DR_STEP (dr_info->dr)))
1236 return false;
1238 /* Strided accesses perform only component accesses, alignment is
1239 irrelevant for them. */
1240 if (STMT_VINFO_STRIDED_P (stmt_info)
1241 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 return false;
1244 return true;
1247 /* Given a memory reference EXP, return whether its alignment is less
1248 than its size. */
1250 static bool
1251 not_size_aligned (tree exp)
1253 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1254 return true;
1256 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1257 > get_object_alignment (exp));
1260 /* Function vector_alignment_reachable_p
1262 Return true if vector alignment for DR_INFO is reachable by peeling
1263 a few loop iterations. Return false otherwise. */
1265 static bool
1266 vector_alignment_reachable_p (dr_vec_info *dr_info)
1268 stmt_vec_info stmt_info = dr_info->stmt;
1269 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1271 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1273 /* For interleaved access we peel only if the number of iterations in
 1274 the prolog loop (VF - misalignment) is a multiple of the
 1275 number of interleaved accesses. */
1276 int elem_size, mis_in_elements;
1278 /* FORNOW: handle only known alignment. */
1279 if (!known_alignment_for_access_p (dr_info))
1280 return false;
1282 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1283 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1284 elem_size = vector_element_size (vector_size, nelements);
1285 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1287 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1288 return false;
1291 /* If misalignment is known at the compile time then allow peeling
1292 only if natural alignment is reachable through peeling. */
1293 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1295 HOST_WIDE_INT elmsize =
1296 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1297 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_NOTE, vect_location,
1300 "data size = %wd. misalignment = %d.\n", elmsize,
1301 DR_MISALIGNMENT (dr_info));
1303 if (DR_MISALIGNMENT (dr_info) % elmsize)
1305 if (dump_enabled_p ())
1306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1307 "data size does not divide the misalignment.\n");
1308 return false;
1312 if (!known_alignment_for_access_p (dr_info))
1314 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1315 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1316 if (dump_enabled_p ())
1317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1318 "Unknown misalignment, %snaturally aligned\n",
1319 is_packed ? "not " : "");
1320 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1323 return true;
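/* Illustration with hypothetical numbers: for 4-byte elements and a
   16-byte target alignment, a known misalignment of 8 bytes is reachable
   (peel (16 - 8) / 4 == 2 iterations), whereas a misalignment of 6 bytes
   is not, since no whole number of scalar iterations removes it: the
   "data size does not divide the misalignment" case rejected above.  */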
1327 /* Calculate the cost of the memory access represented by DR_INFO. */
1329 static void
1330 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1331 unsigned int *inside_cost,
1332 unsigned int *outside_cost,
1333 stmt_vector_for_cost *body_cost_vec,
1334 stmt_vector_for_cost *prologue_cost_vec)
1336 stmt_vec_info stmt_info = dr_info->stmt;
1337 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1338 int ncopies;
1340 if (PURE_SLP_STMT (stmt_info))
1341 ncopies = 1;
1342 else
1343 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1345 if (DR_IS_READ (dr_info->dr))
1346 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1347 outside_cost, prologue_cost_vec, body_cost_vec, false);
1348 else
1349 vect_get_store_cost (vinfo,stmt_info, ncopies, inside_cost, body_cost_vec);
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE, vect_location,
1353 "vect_get_data_access_cost: inside_cost = %d, "
1354 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1358 typedef struct _vect_peel_info
1360 dr_vec_info *dr_info;
1361 int npeel;
1362 unsigned int count;
1363 } *vect_peel_info;
1365 typedef struct _vect_peel_extended_info
1367 vec_info *vinfo;
1368 struct _vect_peel_info peel_info;
1369 unsigned int inside_cost;
1370 unsigned int outside_cost;
1371 } *vect_peel_extended_info;
1374 /* Peeling hashtable helpers. */
1376 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1378 static inline hashval_t hash (const _vect_peel_info *);
1379 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1382 inline hashval_t
1383 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1385 return (hashval_t) peel_info->npeel;
1388 inline bool
1389 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1391 return (a->npeel == b->npeel);
1395 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1397 static void
1398 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1399 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1400 int npeel)
1402 struct _vect_peel_info elem, *slot;
1403 _vect_peel_info **new_slot;
1404 bool supportable_dr_alignment
1405 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1407 elem.npeel = npeel;
1408 slot = peeling_htab->find (&elem);
1409 if (slot)
1410 slot->count++;
1411 else
1413 slot = XNEW (struct _vect_peel_info);
1414 slot->npeel = npeel;
1415 slot->dr_info = dr_info;
1416 slot->count = 1;
1417 new_slot = peeling_htab->find_slot (slot, INSERT);
1418 *new_slot = slot;
1421 if (!supportable_dr_alignment
1422 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1423 slot->count += VECT_MAX_COST;
1427 /* Traverse peeling hash table to find peeling option that aligns maximum
1428 number of data accesses. */
1430 static int
 1431 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1432 _vect_peel_extended_info *max)
1434 vect_peel_info elem = *slot;
1436 if (elem->count > max->peel_info.count
1437 || (elem->count == max->peel_info.count
1438 && max->peel_info.npeel > elem->npeel))
1440 max->peel_info.npeel = elem->npeel;
1441 max->peel_info.count = elem->count;
1442 max->peel_info.dr_info = elem->dr_info;
1445 return 1;
1448 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1449 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1450 we assume DR0_INFO's misalignment will be zero after peeling. */
1452 static void
1453 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1454 dr_vec_info *dr0_info,
1455 unsigned int *inside_cost,
1456 unsigned int *outside_cost,
1457 stmt_vector_for_cost *body_cost_vec,
1458 stmt_vector_for_cost *prologue_cost_vec,
1459 unsigned int npeel,
1460 bool unknown_misalignment)
1462 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1463 unsigned i;
1464 data_reference *dr;
1466 FOR_EACH_VEC_ELT (datarefs, i, dr)
1468 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1469 if (!vect_relevant_for_alignment_p (dr_info))
1470 continue;
1472 int save_misalignment;
1473 save_misalignment = DR_MISALIGNMENT (dr_info);
1474 if (npeel == 0)
1476 else if (unknown_misalignment && dr_info == dr0_info)
1477 SET_DR_MISALIGNMENT (dr_info, 0);
1478 else
1479 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1480 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1481 body_cost_vec, prologue_cost_vec);
1482 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1486 /* Traverse peeling hash table and calculate cost for each peeling option.
1487 Find the one with the lowest cost. */
1489 static int
 1490 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1491 _vect_peel_extended_info *min)
1493 vect_peel_info elem = *slot;
1494 int dummy;
1495 unsigned int inside_cost = 0, outside_cost = 0;
1496 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1497 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1498 epilogue_cost_vec;
1500 prologue_cost_vec.create (2);
1501 body_cost_vec.create (2);
1502 epilogue_cost_vec.create (2);
1504 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1505 &outside_cost, &body_cost_vec,
1506 &prologue_cost_vec, elem->npeel, false);
1508 body_cost_vec.release ();
1510 outside_cost += vect_get_known_peeling_cost
1511 (loop_vinfo, elem->npeel, &dummy,
1512 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1513 &prologue_cost_vec, &epilogue_cost_vec);
1515 /* Prologue and epilogue costs are added to the target model later.
1516 These costs depend only on the scalar iteration cost, the
1517 number of peeling iterations finally chosen, and the number of
1518 misaligned statements. So discard the information found here. */
1519 prologue_cost_vec.release ();
1520 epilogue_cost_vec.release ();
1522 if (inside_cost < min->inside_cost
1523 || (inside_cost == min->inside_cost
1524 && outside_cost < min->outside_cost))
1526 min->inside_cost = inside_cost;
1527 min->outside_cost = outside_cost;
1528 min->peel_info.dr_info = elem->dr_info;
1529 min->peel_info.npeel = elem->npeel;
1530 min->peel_info.count = elem->count;
1533 return 1;
1537 /* Choose best peeling option by traversing peeling hash table and either
1538 choosing an option with the lowest cost (if cost model is enabled) or the
1539 option that aligns as many accesses as possible. */
1541 static struct _vect_peel_extended_info
1542 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1543 loop_vec_info loop_vinfo)
1545 struct _vect_peel_extended_info res;
1547 res.peel_info.dr_info = NULL;
1548 res.vinfo = loop_vinfo;
1550 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1552 res.inside_cost = INT_MAX;
1553 res.outside_cost = INT_MAX;
1554 peeling_htab->traverse <_vect_peel_extended_info *,
1555 vect_peeling_hash_get_lowest_cost> (&res);
1557 else
1559 res.peel_info.count = 0;
1560 peeling_htab->traverse <_vect_peel_extended_info *,
1561 vect_peeling_hash_get_most_frequent> (&res);
1562 res.inside_cost = 0;
1563 res.outside_cost = 0;
1566 return res;
1569 /* Return true if the new peeling NPEEL is supported. */
1571 static bool
1572 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1573 unsigned npeel)
1575 unsigned i;
1576 struct data_reference *dr = NULL;
1577 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1578 enum dr_alignment_support supportable_dr_alignment;
1580 /* Ensure that all data refs can be vectorized after the peel. */
1581 FOR_EACH_VEC_ELT (datarefs, i, dr)
1583 int save_misalignment;
1585 if (dr == dr0_info->dr)
1586 continue;
1588 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1589 if (!vect_relevant_for_alignment_p (dr_info))
1590 continue;
1592 save_misalignment = DR_MISALIGNMENT (dr_info);
1593 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1594 supportable_dr_alignment
1595 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1596 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1598 if (!supportable_dr_alignment)
1599 return false;
1602 return true;
1605 /* Compare two data-references DRA and DRB to group them into chunks
1606 with related alignment. */
1608 static int
1609 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1611 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1612 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1613 int cmp;
1615 /* Stabilize sort. */
1616 if (dra == drb)
1617 return 0;
1619 /* Ordering of DRs according to base. */
1620 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1621 DR_BASE_ADDRESS (drb));
1622 if (cmp != 0)
1623 return cmp;
1625 /* And according to DR_OFFSET. */
1626 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1627 if (cmp != 0)
1628 return cmp;
1630 /* And after step. */
1631 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1632 if (cmp != 0)
1633 return cmp;
1635 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1636 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1637 if (cmp == 0)
1638 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1639 return cmp;
1642 /* Function vect_enhance_data_refs_alignment
1644 This pass will use loop versioning and loop peeling in order to enhance
1645 the alignment of data references in the loop.
1647 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1648 original loop is to be vectorized. Any other loops that are created by
1649 the transformations performed in this pass are not supposed to be
 1650 vectorized. This restriction will be relaxed.
1652 This pass will require a cost model to guide it whether to apply peeling
1653 or versioning or a combination of the two. For example, the scheme that
1654 Intel uses when given a loop with several memory accesses is as follows:
 1655 choose one memory access ('p') whose alignment you want to force by doing
1656 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1657 other accesses are not necessarily aligned, or (2) use loop versioning to
1658 generate one loop in which all accesses are aligned, and another loop in
1659 which only 'p' is necessarily aligned.
1661 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1662 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1663 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1665 Devising a cost model is the most critical aspect of this work. It will
1666 guide us on which access to peel for, whether to use loop versioning, how
1667 many versions to create, etc. The cost model will probably consist of
1668 generic considerations as well as target specific considerations (on
1669 powerpc for example, misaligned stores are more painful than misaligned
1670 loads).
1672 Here are the general steps involved in alignment enhancements:
1674 -- original loop, before alignment analysis:
1675 for (i=0; i<N; i++){
1676 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1677 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1680 -- After vect_compute_data_refs_alignment:
1681 for (i=0; i<N; i++){
1682 x = q[i]; # DR_MISALIGNMENT(q) = 3
1683 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1686 -- Possibility 1: we do loop versioning:
1687 if (p is aligned) {
1688 for (i=0; i<N; i++){ # loop 1A
1689 x = q[i]; # DR_MISALIGNMENT(q) = 3
1690 p[i] = y; # DR_MISALIGNMENT(p) = 0
1693 else {
1694 for (i=0; i<N; i++){ # loop 1B
1695 x = q[i]; # DR_MISALIGNMENT(q) = 3
1696 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1700 -- Possibility 2: we do loop peeling:
1701 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1702 x = q[i];
1703 p[i] = y;
1705 for (i = 3; i < N; i++){ # loop 2A
1706 x = q[i]; # DR_MISALIGNMENT(q) = 0
1707 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1710 -- Possibility 3: combination of loop peeling and versioning:
1711 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1712 x = q[i];
1713 p[i] = y;
1715 if (p is aligned) {
1716 for (i = 3; i<N; i++){ # loop 3A
1717 x = q[i]; # DR_MISALIGNMENT(q) = 0
1718 p[i] = y; # DR_MISALIGNMENT(p) = 0
1721 else {
1722 for (i = 3; i<N; i++){ # loop 3B
1723 x = q[i]; # DR_MISALIGNMENT(q) = 0
1724 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1728 These loops are later passed to loop_transform to be vectorized. The
1729 vectorizer will use the alignment information to guide the transformation
1730 (whether to generate regular loads/stores, or with special handling for
1731 misalignment). */
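/* Illustration of the peel count with hypothetical numbers: for a 16-byte
   target alignment, 4-byte elements and a known misalignment of 4 bytes,
   peeling (16 - 4) / 4 == 3 scalar iterations makes the chosen access
   aligned, which matches the three peeled iterations shown in
   possibilities 2 and 3 above.  */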
1733 opt_result
1734 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1736 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1737 enum dr_alignment_support supportable_dr_alignment;
1738 dr_vec_info *first_store = NULL;
1739 dr_vec_info *dr0_info = NULL;
1740 struct data_reference *dr;
1741 unsigned int i;
1742 bool do_peeling = false;
1743 bool do_versioning = false;
1744 unsigned int npeel = 0;
1745 bool one_misalignment_known = false;
1746 bool one_misalignment_unknown = false;
1747 bool one_dr_unsupportable = false;
1748 dr_vec_info *unsupportable_dr_info = NULL;
1749 unsigned int mis, dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1750 hash_table<peel_info_hasher> peeling_htab (1);
1752 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1754 /* Reset data so we can safely be called multiple times. */
1755 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1756 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1758 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1759 return opt_result::success ();
1761 /* Sort the vector of datarefs so DRs that have the same or dependent
1762 alignment are next to each other. */
1763 auto_vec<data_reference_p> datarefs
1764 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1765 datarefs.qsort (dr_align_group_sort_cmp);
1767 /* Compute the number of DRs that become aligned when we peel
1768 a dataref so it becomes aligned. */
1769 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1770 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1771 unsigned i0;
1772 for (i0 = 0; i0 < datarefs.length (); ++i0)
1773 if (DR_BASE_ADDRESS (datarefs[i0]))
1774 break;
1775 for (i = i0 + 1; i <= datarefs.length (); ++i)
1777 if (i == datarefs.length ()
1778 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1779 DR_BASE_ADDRESS (datarefs[i]), 0)
1780 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1781 DR_OFFSET (datarefs[i]), 0)
1782 || !operand_equal_p (DR_STEP (datarefs[i0]),
1783 DR_STEP (datarefs[i]), 0))
1785 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1786 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1787 will get known misalignment if we align one of the refs
1788 with the largest DR_TARGET_ALIGNMENT. */
1789 for (unsigned j = i0; j < i; ++j)
1791 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1792 for (unsigned k = i0; k < i; ++k)
1794 if (k == j)
1795 continue;
1796 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1797 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1798 dr_infoj))
1799 n_same_align_refs[j]++;
1802 i0 = i;
1806 /* While cost model enhancements are expected in the future, the high level
1807 view of the code at this time is as follows:
1809 A) If there is a misaligned access then see if peeling to align
1810 this access can make all data references satisfy
1811 vect_supportable_dr_alignment. If so, update data structures
1812 as needed and return true.
1814 B) If peeling wasn't possible and there is a data reference with an
1815 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1816 then see if loop versioning checks can be used to make all data
1817 references satisfy vect_supportable_dr_alignment. If so, update
1818 data structures as needed and return true.
1820 C) If neither peeling nor versioning were successful then return false if
1821 any data reference does not satisfy vect_supportable_dr_alignment.
1823 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1825 Note, Possibility 3 above (which is peeling and versioning together) is not
1826 being done at this time. */
1828 /* (1) Peeling to force alignment. */
1830 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1831 Considerations:
1832 + How many accesses will become aligned due to the peeling
1833 - How many accesses will become unaligned due to the peeling,
1834 and the cost of misaligned accesses.
1835 - The cost of peeling (the extra runtime checks, the increase
1836 in code size). */
1838 FOR_EACH_VEC_ELT (datarefs, i, dr)
1840 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1841 if (!vect_relevant_for_alignment_p (dr_info))
1842 continue;
1844 stmt_vec_info stmt_info = dr_info->stmt;
1845 supportable_dr_alignment
1846 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1847 do_peeling = vector_alignment_reachable_p (dr_info);
1848 if (do_peeling)
1850 if (known_alignment_for_access_p (dr_info))
1852 unsigned int npeel_tmp = 0;
1853 bool negative = tree_int_cst_compare (DR_STEP (dr),
1854 size_zero_node) < 0;
1856 /* If known_alignment_for_access_p then we have set
1857 DR_MISALIGNMENT, which is only done if we know it at compile
1858 time, so it is safe to assume target alignment is constant.  */
1860 unsigned int target_align =
1861 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1862 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1863 mis = (negative
1864 ? DR_MISALIGNMENT (dr_info)
1865 : -DR_MISALIGNMENT (dr_info));
1866 if (DR_MISALIGNMENT (dr_info) != 0)
1867 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1869 /* For multiple types, it is possible that the bigger type access
1870 will have more than one peeling option. E.g., a loop with two
1871 types: one of size (vector size / 4), and the other one of
1872 size (vector size / 8). Vectorization factor will be 8. If both
1873 accesses are misaligned by 3, the first one needs one scalar
1874 iteration to be aligned, and the second one needs 5. But the
1875 first one will be aligned also by peeling 5 scalar
1876 iterations, and in that case both accesses will be aligned.
1877 Hence, in addition to the immediate peeling amount, we also want
1878 to try adding full vector sizes, as long as we don't exceed the
1879 vectorization factor.
1880 We do this automatically for cost model, since we calculate
1881 cost for every peeling option. */
1882 poly_uint64 nscalars = npeel_tmp;
1883 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1885 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1886 nscalars = (STMT_SLP_TYPE (stmt_info)
1887 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1890 /* Save info about DR in the hash table. Also include peeling
1891 amounts according to the explanation above. */
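/* For example, with a 16-byte target alignment and a 4-byte DR size, an
   initial NPEEL_TMP of 1 makes the loop below record the candidate peeling
   amounts 1, 5, 9, ... until NSCALARS is exceeded.  */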
1892 while (known_le (npeel_tmp, nscalars))
1894 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1895 dr_info, npeel_tmp);
1896 npeel_tmp += MAX (1, target_align / dr_size);
1899 one_misalignment_known = true;
1901 else
1903 /* If we don't know any misalignment values, we prefer
1904 peeling for the data-ref that has the maximum number of data-refs
1905 with the same alignment, unless the target prefers to align
1906 stores over loads.  */
1907 unsigned same_align_drs = n_same_align_refs[i];
1908 if (!dr0_info
1909 || dr0_same_align_drs < same_align_drs)
1911 dr0_same_align_drs = same_align_drs;
1912 dr0_info = dr_info;
1914 /* For data-refs with the same number of related
1915 accesses prefer the one where the misalign
1916 computation will be invariant in the outermost loop. */
1917 else if (dr0_same_align_drs == same_align_drs)
1919 class loop *ivloop0, *ivloop;
1920 ivloop0 = outermost_invariant_loop_for_expr
1921 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1922 ivloop = outermost_invariant_loop_for_expr
1923 (loop, DR_BASE_ADDRESS (dr));
1924 if ((ivloop && !ivloop0)
1925 || (ivloop && ivloop0
1926 && flow_loop_nested_p (ivloop, ivloop0)))
1927 dr0_info = dr_info;
1930 one_misalignment_unknown = true;
1932 /* Check for data refs with unsupportable alignment that
1933 can be peeled. */
1934 if (!supportable_dr_alignment)
1936 one_dr_unsupportable = true;
1937 unsupportable_dr_info = dr_info;
1940 if (!first_store && DR_IS_WRITE (dr))
1942 first_store = dr_info;
1943 first_store_same_align_drs = same_align_drs;
1947 else
1949 if (!aligned_access_p (dr_info))
1951 if (dump_enabled_p ())
1952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1953 "vector alignment may not be reachable\n");
1954 break;
1959 /* Check if we can possibly peel the loop. */
1960 if (!vect_can_advance_ivs_p (loop_vinfo)
1961 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1962 || loop->inner)
1963 do_peeling = false;
1965 struct _vect_peel_extended_info peel_for_known_alignment;
1966 struct _vect_peel_extended_info peel_for_unknown_alignment;
1967 struct _vect_peel_extended_info best_peel;
1969 peel_for_unknown_alignment.inside_cost = INT_MAX;
1970 peel_for_unknown_alignment.outside_cost = INT_MAX;
1971 peel_for_unknown_alignment.peel_info.count = 0;
1973 if (do_peeling
1974 && one_misalignment_unknown)
1976 /* Check whether the target prefers aligning stores over loads, i.e., if
1977 misaligned stores are more expensive than misaligned loads (taking
1978 DRs with the same alignment into account).  */
1979 unsigned int load_inside_cost = 0;
1980 unsigned int load_outside_cost = 0;
1981 unsigned int store_inside_cost = 0;
1982 unsigned int store_outside_cost = 0;
1983 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1985 stmt_vector_for_cost dummy;
1986 dummy.create (2);
1987 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1988 &load_inside_cost,
1989 &load_outside_cost,
1990 &dummy, &dummy, estimated_npeels, true);
1991 dummy.release ();
1993 if (first_store)
1995 dummy.create (2);
1996 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1997 &store_inside_cost,
1998 &store_outside_cost,
1999 &dummy, &dummy,
2000 estimated_npeels, true);
2001 dummy.release ();
2003 else
2005 store_inside_cost = INT_MAX;
2006 store_outside_cost = INT_MAX;
2009 if (load_inside_cost > store_inside_cost
2010 || (load_inside_cost == store_inside_cost
2011 && load_outside_cost > store_outside_cost))
2013 dr0_info = first_store;
2014 dr0_same_align_drs = first_store_same_align_drs;
2015 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2016 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2018 else
2020 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2021 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2024 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2025 prologue_cost_vec.create (2);
2026 epilogue_cost_vec.create (2);
2028 int dummy2;
2029 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2030 (loop_vinfo, estimated_npeels, &dummy2,
2031 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2032 &prologue_cost_vec, &epilogue_cost_vec);
2034 prologue_cost_vec.release ();
2035 epilogue_cost_vec.release ();
2037 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2040 peel_for_unknown_alignment.peel_info.npeel = 0;
2041 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2043 best_peel = peel_for_unknown_alignment;
2045 peel_for_known_alignment.inside_cost = INT_MAX;
2046 peel_for_known_alignment.outside_cost = INT_MAX;
2047 peel_for_known_alignment.peel_info.count = 0;
2048 peel_for_known_alignment.peel_info.dr_info = NULL;
2050 if (do_peeling && one_misalignment_known)
2052 /* Peeling is possible, but no data access requires alignment in order
2053 to be supported. So we try to choose the best possible peeling from
2054 the hash table. */
2055 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2056 (&peeling_htab, loop_vinfo);
2059 /* Compare costs of peeling for known and unknown alignment. */
2060 if (peel_for_known_alignment.peel_info.dr_info != NULL
2061 && peel_for_unknown_alignment.inside_cost
2062 >= peel_for_known_alignment.inside_cost)
2064 best_peel = peel_for_known_alignment;
2066 /* If the best peeling for known alignment has NPEEL == 0, perform no
2067 peeling at all except if there is an unsupportable dr that we can
2068 align. */
2069 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2070 do_peeling = false;
2073 /* If there is an unsupportable data ref, prefer this over all choices so far
2074 since we'd have to discard a chosen peeling except when it accidentally
2075 aligned the unsupportable data ref. */
2076 if (one_dr_unsupportable)
2077 dr0_info = unsupportable_dr_info;
2078 else if (do_peeling)
2080 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2081 TODO: Use nopeel_outside_cost or get rid of it? */
2082 unsigned nopeel_inside_cost = 0;
2083 unsigned nopeel_outside_cost = 0;
2085 stmt_vector_for_cost dummy;
2086 dummy.create (2);
2087 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2088 &nopeel_outside_cost, &dummy, &dummy,
2089 0, false);
2090 dummy.release ();
2092 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2093 costs will be recorded. */
2094 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2095 prologue_cost_vec.create (2);
2096 epilogue_cost_vec.create (2);
2098 int dummy2;
2099 nopeel_outside_cost += vect_get_known_peeling_cost
2100 (loop_vinfo, 0, &dummy2,
2101 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2102 &prologue_cost_vec, &epilogue_cost_vec);
2104 prologue_cost_vec.release ();
2105 epilogue_cost_vec.release ();
2107 npeel = best_peel.peel_info.npeel;
2108 dr0_info = best_peel.peel_info.dr_info;
2110 /* If doing no peeling is no more expensive than the best peeling we
2111 have found so far, don't perform any peeling. */
2112 if (nopeel_inside_cost <= best_peel.inside_cost)
2113 do_peeling = false;
2116 if (do_peeling)
2118 stmt_vec_info stmt_info = dr0_info->stmt;
2119 if (known_alignment_for_access_p (dr0_info))
2121 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2122 size_zero_node) < 0;
2123 if (!npeel)
2125 /* Since it's known at compile time, compute the number of
2126 iterations in the peeled loop (the peeling factor) for use in
2127 updating DR_MISALIGNMENT values. The peeling factor is the
2128 vectorization factor minus the misalignment as an element
2129 count. */
2130 mis = (negative
2131 ? DR_MISALIGNMENT (dr0_info)
2132 : -DR_MISALIGNMENT (dr0_info));
2133 /* If known_alignment_for_access_p then we have set
2134 DR_MISALIGNMENT, which is only done if we know it at compile
2135 time, so it is safe to assume target alignment is constant.  */
2137 unsigned int target_align =
2138 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2139 npeel = ((mis & (target_align - 1))
2140 / vect_get_scalar_dr_size (dr0_info));
2143 /* For interleaved data access every iteration accesses all the
2144 members of the group, therefore we divide the number of iterations
2145 by the group size. */
2146 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2147 npeel /= DR_GROUP_SIZE (stmt_info);
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_NOTE, vect_location,
2151 "Try peeling by %d\n", npeel);
2154 /* Ensure that all datarefs can be vectorized after the peel. */
2155 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2156 do_peeling = false;
2158 /* Check if all datarefs are supportable and log. */
2159 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2160 return opt_result::success ();
2162 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
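/* For example, with NPEEL still unknown (zero here), a constant 16-byte
   target alignment and 4-byte scalars, we might peel up to 16/4 - 1 = 3
   iterations; if that exceeds MAX_ALLOWED_PEEL, peeling is disabled.  */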
2163 if (do_peeling)
2165 unsigned max_allowed_peel
2166 = param_vect_max_peeling_for_alignment;
2167 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2168 max_allowed_peel = 0;
2169 if (max_allowed_peel != (unsigned)-1)
2171 unsigned max_peel = npeel;
2172 if (max_peel == 0)
2174 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2175 unsigned HOST_WIDE_INT target_align_c;
2176 if (target_align.is_constant (&target_align_c))
2177 max_peel =
2178 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2179 else
2181 do_peeling = false;
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Disable peeling, max peels set and vector"
2185 " alignment unknown\n");
2188 if (max_peel > max_allowed_peel)
2190 do_peeling = false;
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_NOTE, vect_location,
2193 "Disable peeling, max peels reached: %d\n", max_peel);
2198 /* Cost model #2 - if peeling may result in a remaining loop not
2199 iterating enough to be vectorized then do not peel. Since this
2200 is a cost heuristic rather than a correctness decision, use the
2201 most likely runtime value for variable vectorization factors. */
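/* For example, with an assumed VF of 4 and NPEEL of 0, MAX_PEEL below is 3,
   so peeling is kept only if at least 4 + 3 = 7 scalar iterations are known
   to be available.  */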
2202 if (do_peeling
2203 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2205 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2206 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2207 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2208 < assumed_vf + max_peel)
2209 do_peeling = false;
2212 if (do_peeling)
2214 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2215 If the misalignment of DR_i is identical to that of dr0 then set
2216 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2217 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2218 by the peeling factor times the element size of DR_i (MOD the
2219 vectorization factor times the size). Otherwise, the
2220 misalignment of DR_i must be set to unknown. */
2221 FOR_EACH_VEC_ELT (datarefs, i, dr)
2222 if (dr != dr0_info->dr)
2224 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2225 if (!vect_relevant_for_alignment_p (dr_info))
2226 continue;
2228 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2231 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2232 if (npeel)
2233 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2234 else
2235 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2236 = DR_MISALIGNMENT (dr0_info);
2237 SET_DR_MISALIGNMENT (dr0_info, 0);
2238 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_NOTE, vect_location,
2241 "Alignment of access forced using peeling.\n");
2242 dump_printf_loc (MSG_NOTE, vect_location,
2243 "Peeling for alignment will be applied.\n");
2246 /* The inside-loop cost will be accounted for in vectorizable_load
2247 and vectorizable_store correctly with adjusted alignments.
2248 Drop the body_cst_vec on the floor here. */
2249 return opt_result::success ();
2253 /* (2) Versioning to force alignment. */
2255 /* Try versioning if:
2256 1) optimize loop for speed and the cost-model is not cheap
2257 2) there is at least one unsupported misaligned data ref with an unknown
2258 misalignment, and
2259 3) all misaligned data refs with a known misalignment are supported, and
2260 4) the number of runtime alignment checks is within reason. */
2262 do_versioning
2263 = (optimize_loop_nest_for_speed_p (loop)
2264 && !loop->inner /* FORNOW */
2265 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2267 if (do_versioning)
2269 FOR_EACH_VEC_ELT (datarefs, i, dr)
2271 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2272 if (aligned_access_p (dr_info)
2273 || !vect_relevant_for_alignment_p (dr_info))
2274 continue;
2276 stmt_vec_info stmt_info = dr_info->stmt;
2277 if (STMT_VINFO_STRIDED_P (stmt_info))
2279 do_versioning = false;
2280 break;
2283 supportable_dr_alignment
2284 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2286 if (!supportable_dr_alignment)
2288 int mask;
2289 tree vectype;
2291 if (known_alignment_for_access_p (dr_info)
2292 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2293 >= (unsigned) param_vect_max_version_for_alignment_checks)
2295 do_versioning = false;
2296 break;
2299 vectype = STMT_VINFO_VECTYPE (stmt_info);
2300 gcc_assert (vectype);
2302 /* At present we don't support versioning for alignment
2303 with variable VF, since there's no guarantee that the
2304 VF is a power of two. We could relax this if we added
2305 a way of enforcing a power-of-two size. */
2306 unsigned HOST_WIDE_INT size;
2307 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2309 do_versioning = false;
2310 break;
2313 /* Forcing alignment in the first iteration is no good if
2314 we don't keep it across iterations. For now, just disable
2315 versioning in this case.
2316 ?? We could actually unroll the loop to achieve the required
2317 overall step alignment, and forcing the alignment could be
2318 done by doing some iterations of the non-vectorized loop. */
2319 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2320 * DR_STEP_ALIGNMENT (dr),
2321 DR_TARGET_ALIGNMENT (dr_info)))
2323 do_versioning = false;
2324 break;
2327 /* The rightmost bits of an aligned address must be zeros.
2328 Construct the mask needed for this test. For example,
2329 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2330 mask must be 15 = 0xf. */
2331 mask = size - 1;
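/* Roughly speaking, the versioning condition built later ORs the addresses
   of all potentially misaligned references and tests them against this
   mask, along the lines of
     if (((addr1 | addr2 | ...) & mask) == 0) ... vectorized loop ...
     else ... scalar fallback ...
   (a sketch only; the actual GIMPLE is emitted elsewhere).  */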
2333 /* FORNOW: use the same mask to test all potentially unaligned
2334 references in the loop. */
2335 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2336 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2338 do_versioning = false;
2339 break;
2342 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2343 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2347 /* Versioning requires at least one misaligned data reference. */
2348 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2349 do_versioning = false;
2350 else if (!do_versioning)
2351 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2354 if (do_versioning)
2356 vec<stmt_vec_info> may_misalign_stmts
2357 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2358 stmt_vec_info stmt_info;
2360 /* It can now be assumed that the data references in the statements
2361 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2362 of the loop being vectorized. */
2363 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2365 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2366 SET_DR_MISALIGNMENT (dr_info, 0);
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Alignment of access forced using versioning.\n");
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_NOTE, vect_location,
2374 "Versioning for alignment will be applied.\n");
2376 /* Peeling and versioning can't be done together at this time. */
2377 gcc_assert (! (do_peeling && do_versioning));
2379 return opt_result::success ();
2382 /* This point is reached if neither peeling nor versioning is being done. */
2383 gcc_assert (! (do_peeling || do_versioning));
2385 return opt_result::success ();
2389 /* Function vect_analyze_data_refs_alignment
2391 Analyze the alignment of the data-references in the loop.
2392 Return FALSE if a data reference is found that cannot be vectorized. */
2394 opt_result
2395 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2397 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2399 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2400 struct data_reference *dr;
2401 unsigned int i;
2403 vect_record_base_alignments (loop_vinfo);
2404 FOR_EACH_VEC_ELT (datarefs, i, dr)
2406 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2407 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2408 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2411 return opt_result::success ();
2415 /* Analyze alignment of DRs of stmts in NODE. */
2417 static bool
2418 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2420 /* We vectorize from the first scalar stmt in the node unless
2421 the node is permuted in which case we start from the first
2422 element in the group. */
2423 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2424 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2425 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2426 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2428 /* We need to commit to a vector type for the group now. */
2429 if (is_a <bb_vec_info> (vinfo)
2430 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2431 return false;
2433 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2434 vect_compute_data_ref_alignment (vinfo, dr_info);
2435 /* In several places we need alignment of the first element anyway. */
2436 if (dr_info != first_dr_info)
2437 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2439 /* For creating the data-ref pointer we need alignment of the
2440 first element as well. */
2441 first_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
2442 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2444 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2445 if (dr_info != first_dr_info)
2446 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2449 return true;
2452 /* Function vect_slp_analyze_instance_alignment
2454 Analyze the alignment of the data-references in the SLP instance.
2455 Return FALSE if a data reference is found that cannot be vectorized. */
2457 bool
2458 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2459 slp_instance instance)
2461 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2463 slp_tree node;
2464 unsigned i;
2465 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2466 if (! vect_slp_analyze_node_alignment (vinfo, node))
2467 return false;
2469 node = SLP_INSTANCE_TREE (instance);
2470 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2471 && ! vect_slp_analyze_node_alignment
2472 (vinfo, SLP_INSTANCE_TREE (instance)))
2473 return false;
2475 return true;
2479 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2480 accesses of legal size, step, etc. Detect gaps, single element
2481 interleaving, and other special cases. Set grouped access info.
2482 Collect groups of strided stores for further use in SLP analysis.
2483 Worker for vect_analyze_group_access. */
2485 static bool
2486 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2488 data_reference *dr = dr_info->dr;
2489 tree step = DR_STEP (dr);
2490 tree scalar_type = TREE_TYPE (DR_REF (dr));
2491 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2492 stmt_vec_info stmt_info = dr_info->stmt;
2493 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2494 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2495 HOST_WIDE_INT dr_step = -1;
2496 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2497 bool slp_impossible = false;
2499 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2500 size of the interleaving group (including gaps). */
2501 if (tree_fits_shwi_p (step))
2503 dr_step = tree_to_shwi (step);
2504 /* Check that STEP is a multiple of type size. Otherwise there is
2505 a non-element-sized gap at the end of the group which we
2506 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2507 ??? As we can handle non-constant step fine here we should
2508 simply remove uses of DR_GROUP_GAP between the last and first
2509 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2510 simply not include that gap. */
2511 if ((dr_step % type_size) != 0)
2513 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_NOTE, vect_location,
2515 "Step %T is not a multiple of the element size"
2516 " for %T\n",
2517 step, DR_REF (dr));
2518 return false;
2520 groupsize = absu_hwi (dr_step) / type_size;
2522 else
2523 groupsize = 0;
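/* For example, 4-byte 'int' elements accessed with a constant step of 16
   bytes give a GROUPSIZE of 4: each scalar iteration steps over a group of
   four elements, possibly including gaps.  */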
2525 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2526 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2528 /* Check if this DR is a part of interleaving, and is a single
2529 element of the group that is accessed in the loop. */
2531 /* Gaps are supported only for loads. STEP must be a multiple of the type
2532 size. */
2533 if (DR_IS_READ (dr)
2534 && (dr_step % type_size) == 0
2535 && groupsize > 0)
2537 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2538 DR_GROUP_SIZE (stmt_info) = groupsize;
2539 DR_GROUP_GAP (stmt_info) = groupsize - 1;
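/* For example, a load of a[4*i] with 4-byte elements and a 16-byte step is
   single element interleaving with GROUPSIZE 4 and a gap of 3 elements
   after the one accessed member.  */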
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_NOTE, vect_location,
2542 "Detected single element interleaving %T"
2543 " step %T\n",
2544 DR_REF (dr), step);
2546 return true;
2549 if (dump_enabled_p ())
2550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551 "not consecutive access %G", stmt_info->stmt);
2553 if (bb_vinfo)
2555 /* Mark the statement as unvectorizable. */
2556 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2557 return true;
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2562 STMT_VINFO_STRIDED_P (stmt_info) = true;
2563 return true;
2566 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2568 /* First stmt in the interleaving chain. Check the chain. */
2569 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2570 struct data_reference *data_ref = dr;
2571 unsigned int count = 1;
2572 tree prev_init = DR_INIT (data_ref);
2573 HOST_WIDE_INT diff, gaps = 0;
2575 /* By construction, all group members have INTEGER_CST DR_INITs. */
2576 while (next)
2578 /* We never have the same DR multiple times. */
2579 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2580 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2582 data_ref = STMT_VINFO_DATA_REF (next);
2584 /* All group members have the same STEP by construction. */
2585 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2587 /* Check that the distance between two accesses is equal to the type
2588 size. Otherwise, we have gaps. */
2589 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2590 - TREE_INT_CST_LOW (prev_init)) / type_size;
2591 if (diff != 1)
2593 /* FORNOW: SLP of accesses with gaps is not supported. */
2594 slp_impossible = true;
2595 if (DR_IS_WRITE (data_ref))
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2599 "interleaved store with gaps\n");
2600 return false;
2603 gaps += diff - 1;
2606 last_accessed_element += diff;
2608 /* Store the gap from the previous member of the group. If there is no
2609 gap in the access, DR_GROUP_GAP is always 1. */
2610 DR_GROUP_GAP (next) = diff;
2612 prev_init = DR_INIT (data_ref);
2613 next = DR_GROUP_NEXT_ELEMENT (next);
2614 /* Count the number of data-refs in the chain. */
2615 count++;
2618 if (groupsize == 0)
2619 groupsize = count + gaps;
2621 /* This could be UINT_MAX but as we are generating code in a very
2622 inefficient way we have to cap earlier. See PR78699 for example. */
2623 if (groupsize > 4096)
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627 "group is too large\n");
2628 return false;
2631 /* Check that the size of the interleaving is equal to count for stores,
2632 i.e., that there are no gaps. */
2633 if (groupsize != count
2634 && !DR_IS_READ (dr))
2636 groupsize = count;
2637 STMT_VINFO_STRIDED_P (stmt_info) = true;
2640 /* If there is a gap after the last load in the group it is the
2641 difference between the groupsize and the last accessed
2642 element.
2643 When there is no gap, this difference should be 0. */
2644 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
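/* For example, loads of a[4*i] and a[4*i+1] with 4-byte elements and a
   16-byte step give COUNT 2, LAST_ACCESSED_ELEMENT 2 and GROUPSIZE 4, so
   the trailing DR_GROUP_GAP recorded here is 4 - 2 = 2 elements.  */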
2646 DR_GROUP_SIZE (stmt_info) = groupsize;
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Detected interleaving ");
2651 if (DR_IS_READ (dr))
2652 dump_printf (MSG_NOTE, "load ");
2653 else if (STMT_VINFO_STRIDED_P (stmt_info))
2654 dump_printf (MSG_NOTE, "strided store ");
2655 else
2656 dump_printf (MSG_NOTE, "store ");
2657 dump_printf (MSG_NOTE, "of size %u\n",
2658 (unsigned)groupsize);
2659 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2660 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2661 while (next)
2663 if (DR_GROUP_GAP (next) != 1)
2664 dump_printf_loc (MSG_NOTE, vect_location,
2665 "\t<gap of %d elements>\n",
2666 DR_GROUP_GAP (next) - 1);
2667 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2668 next = DR_GROUP_NEXT_ELEMENT (next);
2670 if (DR_GROUP_GAP (stmt_info) != 0)
2671 dump_printf_loc (MSG_NOTE, vect_location,
2672 "\t<gap of %d elements>\n",
2673 DR_GROUP_GAP (stmt_info));
2676 /* SLP: create an SLP data structure for every interleaving group of
2677 stores for further analysis in vect_analyze_slp. */
2678 if (DR_IS_WRITE (dr) && !slp_impossible)
2680 if (loop_vinfo)
2681 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2682 if (bb_vinfo)
2683 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2687 return true;
2690 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2691 accesses of legal size, step, etc. Detect gaps, single element
2692 interleaving, and other special cases. Set grouped access info.
2693 Collect groups of strided stores for further use in SLP analysis. */
2695 static bool
2696 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2698 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2700 /* Dissolve the group if present. */
2701 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2702 while (stmt_info)
2704 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2705 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2706 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2707 stmt_info = next;
2709 return false;
2711 return true;
2714 /* Analyze the access pattern of the data-reference DR_INFO.
2715 In case of non-consecutive accesses call vect_analyze_group_access() to
2716 analyze groups of accesses. */
2718 static bool
2719 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2721 data_reference *dr = dr_info->dr;
2722 tree step = DR_STEP (dr);
2723 tree scalar_type = TREE_TYPE (DR_REF (dr));
2724 stmt_vec_info stmt_info = dr_info->stmt;
2725 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2726 class loop *loop = NULL;
2728 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2729 return true;
2731 if (loop_vinfo)
2732 loop = LOOP_VINFO_LOOP (loop_vinfo);
2734 if (loop_vinfo && !step)
2736 if (dump_enabled_p ())
2737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2738 "bad data-ref access in loop\n");
2739 return false;
2742 /* Allow loads with zero step in inner-loop vectorization. */
2743 if (loop_vinfo && integer_zerop (step))
2745 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2746 if (!nested_in_vect_loop_p (loop, stmt_info))
2747 return DR_IS_READ (dr);
2748 /* Allow references with zero step for outer loops marked
2749 with pragma omp simd only - it guarantees absence of
2750 loop-carried dependencies between inner loop iterations. */
2751 if (loop->safelen < 2)
2753 if (dump_enabled_p ())
2754 dump_printf_loc (MSG_NOTE, vect_location,
2755 "zero step in inner loop of nest\n");
2756 return false;
2760 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2762 /* Interleaved accesses are not yet supported within outer-loop
2763 vectorization for references in the inner-loop. */
2764 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2766 /* For the rest of the analysis we use the outer-loop step. */
2767 step = STMT_VINFO_DR_STEP (stmt_info);
2768 if (integer_zerop (step))
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "zero step in outer loop.\n");
2773 return DR_IS_READ (dr);
2777 /* Consecutive? */
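/* For example, an 'int' access with step 4 (forward) or -4 (backward)
   matches TYPE_SIZE_UNIT and is treated as consecutive rather than as part
   of an interleaving group.  */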
2778 if (TREE_CODE (step) == INTEGER_CST)
2780 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2781 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2782 || (dr_step < 0
2783 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2785 /* Mark that it is not interleaving. */
2786 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2787 return true;
2791 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2793 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE, vect_location,
2795 "grouped access in outer loop.\n");
2796 return false;
2800 /* Assume this is a DR handled by the non-constant strided load case. */
2801 if (TREE_CODE (step) != INTEGER_CST)
2802 return (STMT_VINFO_STRIDED_P (stmt_info)
2803 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2804 || vect_analyze_group_access (vinfo, dr_info)));
2806 /* Not a consecutive access - check if it's part of an interleaving group. */
2807 return vect_analyze_group_access (vinfo, dr_info);
2810 typedef std::pair<data_reference_p, int> data_ref_pair;
2812 /* Compare two data-references DRA and DRB so that they can be sorted
2813 into chunks suitable for grouping. */
2815 static int
2816 dr_group_sort_cmp (const void *dra_, const void *drb_)
2818 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2819 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2820 data_reference_p dra = dra_pair.first;
2821 data_reference_p drb = drb_pair.first;
2822 int cmp;
2824 /* Stabilize sort. */
2825 if (dra == drb)
2826 return 0;
2828 /* DRs in different basic-blocks never belong to the same group. */
2829 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2830 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2831 if (bb_index1 != bb_index2)
2832 return bb_index1 < bb_index2 ? -1 : 1;
2834 /* Different group IDs never belong to the same group. */
2835 if (dra_pair.second != drb_pair.second)
2836 return dra_pair.second < drb_pair.second ? -1 : 1;
2838 /* Ordering of DRs according to base. */
2839 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2840 DR_BASE_ADDRESS (drb));
2841 if (cmp != 0)
2842 return cmp;
2844 /* And according to DR_OFFSET. */
2845 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2846 if (cmp != 0)
2847 return cmp;
2849 /* Put reads before writes. */
2850 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2851 return DR_IS_READ (dra) ? -1 : 1;
2853 /* Then sort by access size. */
2854 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2855 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2856 if (cmp != 0)
2857 return cmp;
2859 /* And then by step. */
2860 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2861 if (cmp != 0)
2862 return cmp;
2864 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2865 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2866 if (cmp == 0)
2867 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2868 return cmp;
2871 /* If OP is the result of a conversion, return the unconverted value,
2872 otherwise return null. */
2874 static tree
2875 strip_conversion (tree op)
2877 if (TREE_CODE (op) != SSA_NAME)
2878 return NULL_TREE;
2879 gimple *stmt = SSA_NAME_DEF_STMT (op);
2880 if (!is_gimple_assign (stmt)
2881 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2882 return NULL_TREE;
2883 return gimple_assign_rhs1 (stmt);
2886 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2887 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2888 be grouped in SLP mode. */
2890 static bool
2891 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2892 bool allow_slp_p)
2894 if (gimple_assign_single_p (stmt1_info->stmt))
2895 return gimple_assign_single_p (stmt2_info->stmt);
2897 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2898 if (call1 && gimple_call_internal_p (call1))
2900 /* Check for two masked loads or two masked stores. */
2901 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2902 if (!call2 || !gimple_call_internal_p (call2))
2903 return false;
2904 internal_fn ifn = gimple_call_internal_fn (call1);
2905 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2906 return false;
2907 if (ifn != gimple_call_internal_fn (call2))
2908 return false;
2910 /* Check that the masks are the same. Cope with casts of masks,
2911 like those created by build_mask_conversion. */
2912 tree mask1 = gimple_call_arg (call1, 2);
2913 tree mask2 = gimple_call_arg (call2, 2);
2914 if (!operand_equal_p (mask1, mask2, 0)
2915 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2917 mask1 = strip_conversion (mask1);
2918 if (!mask1)
2919 return false;
2920 mask2 = strip_conversion (mask2);
2921 if (!mask2)
2922 return false;
2923 if (!operand_equal_p (mask1, mask2, 0))
2924 return false;
2926 return true;
2929 return false;
2932 /* Function vect_analyze_data_ref_accesses.
2934 Analyze the access pattern of all the data references in the loop.
2936 FORNOW: the only access pattern that is considered vectorizable is a
2937 simple step 1 (consecutive) access.
2939 FORNOW: handle only arrays and pointer accesses. */
2941 opt_result
2942 vect_analyze_data_ref_accesses (vec_info *vinfo,
2943 vec<int> *dataref_groups)
2945 unsigned int i;
2946 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2948 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2950 if (datarefs.is_empty ())
2951 return opt_result::success ();
2953 /* Sort the array of datarefs to make building the interleaving chains
2954 linear. Don't modify the original vector's order, it is needed for
2955 determining what dependencies are reversed. */
2956 vec<data_ref_pair> datarefs_copy;
2957 datarefs_copy.create (datarefs.length ());
2958 for (unsigned i = 0; i < datarefs.length (); i++)
2960 int group_id = dataref_groups ? (*dataref_groups)[i] : 0;
2961 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2963 datarefs_copy.qsort (dr_group_sort_cmp);
2964 hash_set<stmt_vec_info> to_fixup;
2966 /* Build the interleaving chains. */
2967 for (i = 0; i < datarefs_copy.length () - 1;)
2969 data_reference_p dra = datarefs_copy[i].first;
2970 int dra_group_id = datarefs_copy[i].second;
2971 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2972 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2973 stmt_vec_info lastinfo = NULL;
2974 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2975 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2977 ++i;
2978 continue;
2980 for (i = i + 1; i < datarefs_copy.length (); ++i)
2982 data_reference_p drb = datarefs_copy[i].first;
2983 int drb_group_id = datarefs_copy[i].second;
2984 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2985 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2986 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2987 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2988 break;
2990 /* ??? Imperfect sorting (non-compatible types, non-modulo
2991 accesses, same accesses) can lead to a group being artificially
2992 split here as we don't just skip over those. If it really
2993 matters we can push those to a worklist and re-iterate
2994 over them. Then we can just skip ahead to the next DR here. */
2996 /* DRs in different BBs should not be put into the same
2997 interleaving group. */
2998 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2999 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
3000 if (bb_index1 != bb_index2)
3001 break;
3003 if (dra_group_id != drb_group_id)
3004 break;
3006 /* Check that the data-refs have the same first location (except init)
3007 and they are both either store or load (not load and store,
3008 not masked loads or stores).  */
3009 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3010 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3011 DR_BASE_ADDRESS (drb)) != 0
3012 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3013 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3014 break;
3016 /* Check that the data-refs have the same constant size. */
3017 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3018 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3019 if (!tree_fits_uhwi_p (sza)
3020 || !tree_fits_uhwi_p (szb)
3021 || !tree_int_cst_equal (sza, szb))
3022 break;
3024 /* Check that the data-refs have the same step. */
3025 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3026 break;
3028 /* Check the types are compatible.
3029 ??? We don't distinguish this during sorting. */
3030 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3031 TREE_TYPE (DR_REF (drb))))
3032 break;
3034 /* Check that the DR_INITs are compile-time constants. */
3035 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3036 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3037 break;
3039 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3040 just hold extra information. */
3041 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3042 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3043 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3044 break;
3046 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3047 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3048 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3049 HOST_WIDE_INT init_prev
3050 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3051 gcc_assert (init_a <= init_b
3052 && init_a <= init_prev
3053 && init_prev <= init_b);
3055 /* Do not place the same access in the interleaving chain twice. */
3056 if (init_b == init_prev)
3058 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3059 < gimple_uid (DR_STMT (drb)));
3060 /* Simply link in duplicates and fix up the chain below. */
3062 else
3064 /* If init_b == init_a + the size of the type * k, we have an
3065 interleaving, and DRA is accessed before DRB. */
3066 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3067 if (type_size_a == 0
3068 || (init_b - init_a) % type_size_a != 0)
3069 break;
3071 /* If we have a store, the accesses are adjacent. This splits
3072 groups into chunks we support (we don't support vectorization
3073 of stores with gaps). */
3074 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3075 break;
3077 /* If the step (if not zero or non-constant) is smaller than the
3078 difference between data-refs' inits this splits groups into
3079 suitable sizes. */
3080 if (tree_fits_shwi_p (DR_STEP (dra)))
3082 unsigned HOST_WIDE_INT step
3083 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3084 if (step != 0
3085 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3086 break;
3090 if (dump_enabled_p ())
3091 dump_printf_loc (MSG_NOTE, vect_location,
3092 DR_IS_READ (dra)
3093 ? "Detected interleaving load %T and %T\n"
3094 : "Detected interleaving store %T and %T\n",
3095 DR_REF (dra), DR_REF (drb));
3097 /* Link the found element into the group list. */
3098 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3100 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3101 lastinfo = stmtinfo_a;
3103 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3104 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3105 lastinfo = stmtinfo_b;
3107 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3108 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3110 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3111 dump_printf_loc (MSG_NOTE, vect_location,
3112 "Load suitable for SLP vectorization only.\n");
3114 if (init_b == init_prev
3115 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3116 && dump_enabled_p ())
3117 dump_printf_loc (MSG_NOTE, vect_location,
3118 "Queuing group with duplicate access for fixup\n");
3122 /* Fixup groups with duplicate entries by splitting them. */
3123 while (1)
3125 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3126 if (!(it != to_fixup.end ()))
3127 break;
3128 stmt_vec_info grp = *it;
3129 to_fixup.remove (grp);
3131 /* Find the earliest duplicate group member. */
3132 unsigned first_duplicate = -1u;
3133 stmt_vec_info next, g = grp;
3134 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3136 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3137 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3138 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3139 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3140 g = next;
3142 if (first_duplicate == -1U)
3143 continue;
3145 /* Then move all stmts after the first duplicate to a new group.
3146 Note this is a heuristic but one with the property that *it
3147 is fixed up completely. */
3148 g = grp;
3149 stmt_vec_info newgroup = NULL, ng = grp;
3150 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3152 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3154 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3155 if (!newgroup)
3156 newgroup = next;
3157 else
3158 DR_GROUP_NEXT_ELEMENT (ng) = next;
3159 ng = next;
3160 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3162 else
3163 g = DR_GROUP_NEXT_ELEMENT (g);
3165 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3167 /* Fixup the new group which still may contain duplicates. */
3168 to_fixup.add (newgroup);
3171 data_ref_pair *dr_pair;
3172 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3174 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3175 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3176 && !vect_analyze_data_ref_access (vinfo, dr_info))
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3180 "not vectorized: complicated access pattern.\n");
3182 if (is_a <bb_vec_info> (vinfo))
3184 /* Mark the statement as not vectorizable. */
3185 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3186 continue;
3188 else
3190 datarefs_copy.release ();
3191 return opt_result::failure_at (dr_info->stmt->stmt,
3192 "not vectorized:"
3193 " complicated access pattern.\n");
3198 datarefs_copy.release ();
3199 return opt_result::success ();
3202 /* Function vect_vfa_segment_size.
3204 Input:
3205 DR_INFO: The data reference.
3206 LENGTH_FACTOR: segment length to consider.
3208 Return a value suitable for the dr_with_seg_len::seg_len field.
3209 This is the "distance travelled" by the pointer from the first
3210 iteration in the segment to the last. Note that it does not include
3211 the size of the access; in effect it only describes the first byte. */
3213 static tree
3214 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3216 length_factor = size_binop (MINUS_EXPR,
3217 fold_convert (sizetype, length_factor),
3218 size_one_node);
3219 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3220 length_factor);
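/* For example, a consecutive 'int' access (DR_STEP 4) with LENGTH_FACTOR 8
   gets a segment length of 4 * (8 - 1) = 28 bytes: the distance the pointer
   travels from the first to the last iteration of the segment.  */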
3223 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3224 gives the worst-case number of bytes covered by the segment. */
3226 static unsigned HOST_WIDE_INT
3227 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3229 stmt_vec_info stmt_vinfo = dr_info->stmt;
3230 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3231 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3232 unsigned HOST_WIDE_INT access_size = ref_size;
3233 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3235 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3236 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3238 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3239 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3240 == dr_explicit_realign_optimized))
3242 /* We might access a full vector's worth. */
3243 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3244 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3246 return access_size;
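/* For example, a group of four 4-byte loads with no trailing gap yields an
   access size of 16 bytes (ignoring the explicit-realign adjustment above);
   added to the segment length this covers the whole range touched.  */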
3249 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3250 describes. */
3252 static unsigned int
3253 vect_vfa_align (dr_vec_info *dr_info)
3255 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3258 /* Function vect_no_alias_p.
3260 Given data references A and B with equal base and offset, see whether
3261 the alias relation can be decided at compilation time. Return 1 if
3262 it can and the references alias, 0 if it can and the references do
3263 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3264 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3265 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3267 static int
3268 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3269 tree segment_length_a, tree segment_length_b,
3270 unsigned HOST_WIDE_INT access_size_a,
3271 unsigned HOST_WIDE_INT access_size_b)
3273 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3274 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3275 poly_uint64 const_length_a;
3276 poly_uint64 const_length_b;
3278 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3279 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3280 [a, a+12) */
3281 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3283 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3284 offset_a -= const_length_a;
3286 else
3287 const_length_a = tree_to_poly_uint64 (segment_length_a);
3288 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3290 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3291 offset_b -= const_length_b;
3293 else
3294 const_length_b = tree_to_poly_uint64 (segment_length_b);
3296 const_length_a += access_size_a;
3297 const_length_b += access_size_b;
3299 if (ranges_known_overlap_p (offset_a, const_length_a,
3300 offset_b, const_length_b))
3301 return 1;
3303 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3304 offset_b, const_length_b))
3305 return 0;
3307 return -1;
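/* For example, two 4-byte accesses with positive steps, DR_INITs 0 and 32
   and segment lengths of 16 bytes cover [0, 20) and [32, 52); the ranges
   cannot overlap, so we return 0.  */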
3310 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3311 in DDR is >= VF. */
3313 static bool
3314 dependence_distance_ge_vf (data_dependence_relation *ddr,
3315 unsigned int loop_depth, poly_uint64 vf)
3317 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3318 || DDR_NUM_DIST_VECTS (ddr) == 0)
3319 return false;
3321 /* If the dependence is exact, we should have limited the VF instead. */
3322 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3324 unsigned int i;
3325 lambda_vector dist_v;
3326 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3328 HOST_WIDE_INT dist = dist_v[loop_depth];
3329 if (dist != 0
3330 && !(dist > 0 && DDR_REVERSED_P (ddr))
3331 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3332 return false;
3335 if (dump_enabled_p ())
3336 dump_printf_loc (MSG_NOTE, vect_location,
3337 "dependence distance between %T and %T is >= VF\n",
3338 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3340 return true;
3343 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3345 static void
3346 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3348 dump_printf (dump_kind, "%s (%T) >= ",
3349 lower_bound.unsigned_p ? "unsigned" : "abs",
3350 lower_bound.expr);
3351 dump_dec (dump_kind, lower_bound.min_value);
3354 /* Record that the vectorized loop requires the vec_lower_bound described
3355 by EXPR, UNSIGNED_P and MIN_VALUE. */
3357 static void
3358 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3359 poly_uint64 min_value)
3361 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3362 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3363 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3365 unsigned_p &= lower_bounds[i].unsigned_p;
3366 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3367 if (lower_bounds[i].unsigned_p != unsigned_p
3368 || maybe_lt (lower_bounds[i].min_value, min_value))
3370 lower_bounds[i].unsigned_p = unsigned_p;
3371 lower_bounds[i].min_value = min_value;
3372 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE, vect_location,
3375 "updating run-time check to ");
3376 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3377 dump_printf (MSG_NOTE, "\n");
3380 return;
3383 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3384 if (dump_enabled_p ())
3386 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3387 dump_lower_bound (MSG_NOTE, lower_bound);
3388 dump_printf (MSG_NOTE, "\n");
3390 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3393 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3394 will span fewer than GAP bytes. */
3396 static bool
3397 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3398 poly_int64 gap)
3400 stmt_vec_info stmt_info = dr_info->stmt;
3401 HOST_WIDE_INT count
3402 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3403 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3404 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3405 return (estimated_poly_value (gap)
3406 <= count * vect_get_scalar_dr_size (dr_info));
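/* For example, with an estimated vectorization factor of 4, no group and
   4-byte scalars, any gap of at most 4 * 4 = 16 bytes counts as small.  */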
3409 /* Return true if we know that there is no alias between DR_INFO_A and
3410 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3411 When returning true, set *LOWER_BOUND_OUT to this N. */
3413 static bool
3414 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3415 poly_uint64 *lower_bound_out)
3417 /* Check that there is a constant gap of known sign between DR_A
3418 and DR_B. */
3419 data_reference *dr_a = dr_info_a->dr;
3420 data_reference *dr_b = dr_info_b->dr;
3421 poly_int64 init_a, init_b;
3422 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3423 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3424 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3425 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3426 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3427 || !ordered_p (init_a, init_b))
3428 return false;
3430 /* Sort DR_A and DR_B by the address they access. */
3431 if (maybe_lt (init_b, init_a))
3433 std::swap (init_a, init_b);
3434 std::swap (dr_info_a, dr_info_b);
3435 std::swap (dr_a, dr_b);
3438 /* If the two accesses could be dependent within a scalar iteration,
3439 make sure that we'd retain their order. */
3440 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3441 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3442 return false;
3444 /* There is no alias if abs (DR_STEP) is greater than or equal to
3445 the bytes spanned by the combination of the two accesses. */
3446 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3447 return true;
3450 /* Function vect_prune_runtime_alias_test_list.
3452 Prune a list of ddrs to be tested at run-time by versioning for alias.
3453 Merge several alias checks into one if possible.
3454 Return FALSE if the resulting list of ddrs is longer than allowed by
3455 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3457 opt_result
3458 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3460 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3461 hash_set <tree_pair_hash> compared_objects;
3463 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3464 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3465 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3466 vec<vec_object_pair> &check_unequal_addrs
3467 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3468 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3469 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3471 ddr_p ddr;
3472 unsigned int i;
3473 tree length_factor;
3475 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3477 /* Step values are irrelevant for aliasing if the number of vector
3478 iterations is equal to the number of scalar iterations (which can
3479 happen for fully-SLP loops). */
3480 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3482 if (!ignore_step_p)
3484 /* Convert the checks for nonzero steps into bound tests. */
3485 tree value;
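/* STEP != 0 is the same as (unsigned) STEP >= 1, which is how the
lower-bound list records the check.  */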
3486 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3487 vect_check_lower_bound (loop_vinfo, value, true, 1);
3490 if (may_alias_ddrs.is_empty ())
3491 return opt_result::success ();
3493 comp_alias_ddrs.create (may_alias_ddrs.length ());
3495 unsigned int loop_depth
3496 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3497 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3499 /* First, we collect all data ref pairs for aliasing checks. */
3500 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3502 poly_uint64 lower_bound;
3503 tree segment_length_a, segment_length_b;
3504 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3505 unsigned int align_a, align_b;
3507 /* Ignore the alias if the VF we chose ended up being no greater
3508 than the dependence distance. */
3509 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3510 continue;
3512 if (DDR_OBJECT_A (ddr))
3514 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3515 if (!compared_objects.add (new_pair))
3517 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE, vect_location,
3519 "checking that %T and %T"
3520 " have different addresses\n",
3521 new_pair.first, new_pair.second);
3522 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3524 continue;
3527 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3528 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3530 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3531 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3533 bool preserves_scalar_order_p
3534 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3536 /* Skip the pair if inter-iteration dependencies are irrelevant
3537 and intra-iteration dependencies are guaranteed to be honored. */
3538 if (ignore_step_p
3539 && (preserves_scalar_order_p
3540 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3541 &lower_bound)))
3543 if (dump_enabled_p ())
3544 dump_printf_loc (MSG_NOTE, vect_location,
3545 "no need for alias check between "
3546 "%T and %T when VF is 1\n",
3547 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3548 continue;
3551 /* See whether we can handle the alias using a bounds check on
3552 the step, and whether that's likely to be the best approach.
3553 (It might not be, for example, if the minimum step is much larger
3554 than the number of bytes handled by one vector iteration.) */
3555 if (!ignore_step_p
3556 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3557 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3558 &lower_bound)
3559 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3560 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3562 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3563 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3566 "%T and %T when the step %T is outside ",
3567 DR_REF (dr_info_a->dr),
3568 DR_REF (dr_info_b->dr),
3569 DR_STEP (dr_info_a->dr));
3570 if (unsigned_p)
3571 dump_printf (MSG_NOTE, "[0");
3572 else
3574 dump_printf (MSG_NOTE, "(");
3575 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3577 dump_printf (MSG_NOTE, ", ");
3578 dump_dec (MSG_NOTE, lower_bound);
3579 dump_printf (MSG_NOTE, ")\n");
3581 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3582 unsigned_p, lower_bound);
3583 continue;
3586 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3587 if (dr_group_first_a)
3589 stmt_info_a = dr_group_first_a;
3590 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3593 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3594 if (dr_group_first_b)
3596 stmt_info_b = dr_group_first_b;
3597 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3600 if (ignore_step_p)
3602 segment_length_a = size_zero_node;
3603 segment_length_b = size_zero_node;
3605 else
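/* Segment lengths bound how far each access moves while the versioned
loop runs.  If both references advance by the same step their relative
distance stays fixed, so covering a single vector iteration (VF) is
enough; otherwise be conservative and cover the whole scalar iteration
count.  */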
3607 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3608 DR_STEP (dr_info_b->dr), 0))
3609 length_factor = scalar_loop_iters;
3610 else
3611 length_factor = size_int (vect_factor);
3612 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3613 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3615 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3616 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3617 align_a = vect_vfa_align (dr_info_a);
3618 align_b = vect_vfa_align (dr_info_b);
3620 /* See whether the alias is known at compilation time. */
3621 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3622 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3623 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3624 DR_OFFSET (dr_info_b->dr), 0)
3625 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3626 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3627 && poly_int_tree_p (segment_length_a)
3628 && poly_int_tree_p (segment_length_b))
3630 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3631 segment_length_a,
3632 segment_length_b,
3633 access_size_a,
3634 access_size_b);
3635 if (res >= 0 && dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "can tell at compile time that %T and %T",
3639 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3640 if (res == 0)
3641 dump_printf (MSG_NOTE, " do not alias\n");
3642 else
3643 dump_printf (MSG_NOTE, " alias\n");
3646 if (res == 0)
3647 continue;
3649 if (res == 1)
3650 return opt_result::failure_at (stmt_info_b->stmt,
3651 "not vectorized:"
3652 " compilation time alias: %G%G",
3653 stmt_info_a->stmt,
3654 stmt_info_b->stmt);
3657 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3658 access_size_a, align_a);
3659 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3660 access_size_b, align_b);
3661 /* Canonicalize the order to be the one that's needed for accurate
3662 RAW, WAR and WAW flags, in cases where the data references are
3663 well-ordered. The order doesn't really matter otherwise,
3664 but we might as well be consistent. */
3665 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3666 std::swap (dr_a, dr_b);
3668 dr_with_seg_len_pair_t dr_with_seg_len_pair
3669 (dr_a, dr_b, (preserves_scalar_order_p
3670 ? dr_with_seg_len_pair_t::WELL_ORDERED
3671 : dr_with_seg_len_pair_t::REORDERED));
3673 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3676 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3678 unsigned int count = (comp_alias_ddrs.length ()
3679 + check_unequal_addrs.length ());
3681 if (dump_enabled_p ())
3682 dump_printf_loc (MSG_NOTE, vect_location,
3683 "improved number of alias checks from %d to %d\n",
3684 may_alias_ddrs.length (), count);
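/* Under the cheap cost model tolerate fewer runtime alias checks by
scaling the versioning limit down.  */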
3685 unsigned limit = param_vect_max_version_for_alias_checks;
3686 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3687 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3688 if (count > limit)
3689 return opt_result::failure_at
3690 (vect_location,
3691 "number of versioning for alias run-time tests exceeds %d "
3692 "(--param vect-max-version-for-alias-checks)\n", limit);
3694 return opt_result::success ();
3697 /* Check whether we can use an internal function for a gather load
3698 or scatter store. READ_P is true for loads and false for stores.
3699 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3700 the type of the memory elements being loaded or stored. OFFSET_TYPE
3701 is the type of the offset that is being applied to the invariant
3702 base address. SCALE is the amount by which the offset should
3703 be multiplied *after* it has been converted to address width.
3705 Return true if the function is supported, storing the function id in
3706 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3708 bool
3709 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3710 tree vectype, tree memory_type, tree offset_type,
3711 int scale, internal_fn *ifn_out,
3712 tree *offset_vectype_out)
3714 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3715 unsigned int element_bits = vector_element_bits (vectype);
3716 if (element_bits != memory_bits)
3717 /* For now the vector elements must be the same width as the
3718 memory elements. */
3719 return false;
3721 /* Work out which function we need. */
3722 internal_fn ifn;
3723 if (read_p)
3724 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3725 else
3726 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3728 for (;;)
3730 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3731 if (!offset_vectype)
3732 return false;
3734 /* Test whether the target supports this combination. */
3735 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3736 offset_vectype, scale))
3738 *ifn_out = ifn;
3739 *offset_vectype_out = offset_vectype;
3740 return true;
3743 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3744 && TYPE_PRECISION (offset_type) >= element_bits)
3745 return false;
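/* The target does not handle this offset type; retry with an offset of
twice the precision (and the same signedness).  */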
3747 offset_type = build_nonstandard_integer_type
3748 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3752 /* STMT_INFO is a call to an internal gather load or scatter store function.
3753 Describe the operation in INFO. */
3755 static void
3756 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3757 gather_scatter_info *info)
3759 gcall *call = as_a <gcall *> (stmt_info->stmt);
3760 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3761 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3763 info->ifn = gimple_call_internal_fn (call);
3764 info->decl = NULL_TREE;
3765 info->base = gimple_call_arg (call, 0);
3766 info->offset = gimple_call_arg (call, 1);
3767 info->offset_dt = vect_unknown_def_type;
3768 info->offset_vectype = NULL_TREE;
3769 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3770 info->element_type = TREE_TYPE (vectype);
3771 info->memory_type = TREE_TYPE (DR_REF (dr));
3774 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3775 gather load or scatter store. Describe the operation in *INFO if so. */
3777 bool
3778 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3779 gather_scatter_info *info)
3781 HOST_WIDE_INT scale = 1;
3782 poly_int64 pbitpos, pbitsize;
3783 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3784 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3785 tree offtype = NULL_TREE;
3786 tree decl = NULL_TREE, base, off;
3787 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3788 tree memory_type = TREE_TYPE (DR_REF (dr));
3789 machine_mode pmode;
3790 int punsignedp, reversep, pvolatilep = 0;
3791 internal_fn ifn;
3792 tree offset_vectype;
3793 bool masked_p = false;
3795 /* See whether this is already a call to a gather/scatter internal function.
3796 If not, see whether it's a masked load or store. */
3797 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3798 if (call && gimple_call_internal_p (call))
3800 ifn = gimple_call_internal_fn (call);
3801 if (internal_gather_scatter_fn_p (ifn))
3803 vect_describe_gather_scatter_call (stmt_info, info);
3804 return true;
3806 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3809 /* True if we should aim to use internal functions rather than
3810 built-in functions. */
3811 bool use_ifn_p = (DR_IS_READ (dr)
3812 ? supports_vec_gather_load_p ()
3813 : supports_vec_scatter_store_p ());
3815 base = DR_REF (dr);
3816 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3817 see if we can use the def stmt of the address. */
3818 if (masked_p
3819 && TREE_CODE (base) == MEM_REF
3820 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3821 && integer_zerop (TREE_OPERAND (base, 1))
3822 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3824 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3825 if (is_gimple_assign (def_stmt)
3826 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3827 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3830 /* The gather and scatter builtins need address of the form
3831 loop_invariant + vector * {1, 2, 4, 8}
3832 or
3833 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3834 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3835 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3836 multiplications and additions in it. To get a vector, we need
3837 a single SSA_NAME that will be defined in the loop and will
3838 contain everything that is not loop invariant and that can be
3839 vectorized. The following code attempts to find such a preexisting
3840 SSA_NAME OFF and put the loop invariants into a tree BASE
3841 that can be gimplified before the loop. */
3842 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3843 &punsignedp, &reversep, &pvolatilep);
3844 if (reversep)
3845 return false;
3847 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3849 if (TREE_CODE (base) == MEM_REF)
3851 if (!integer_zerop (TREE_OPERAND (base, 1)))
3853 if (off == NULL_TREE)
3854 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3855 else
3856 off = size_binop (PLUS_EXPR, off,
3857 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3859 base = TREE_OPERAND (base, 0);
3861 else
3862 base = build_fold_addr_expr (base);
3864 if (off == NULL_TREE)
3865 off = size_zero_node;
3867 /* If BASE is not loop invariant: when OFF is 0, start with just the
3868 constant offset in the loop-invariant BASE and continue with BASE
3869 as OFF; otherwise give up.
3870 We could handle that case by gimplifying the addition of BASE + OFF
3871 into some SSA_NAME and using that as OFF, but for now punt. */
3872 if (!expr_invariant_in_loop_p (loop, base))
3874 if (!integer_zerop (off))
3875 return false;
3876 off = base;
3877 base = size_int (pbytepos);
3879 /* Otherwise put base + constant offset into the loop invariant BASE
3880 and continue with OFF. */
3881 else
3883 base = fold_convert (sizetype, base);
3884 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3887 /* OFF at this point may be either a SSA_NAME or some tree expression
3888 from get_inner_reference. Try to peel off loop invariants from it
3889 into BASE as long as possible. */
3890 STRIP_NOPS (off);
3891 while (offtype == NULL_TREE)
3893 enum tree_code code;
3894 tree op0, op1, add = NULL_TREE;
3896 if (TREE_CODE (off) == SSA_NAME)
3898 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3900 if (expr_invariant_in_loop_p (loop, off))
3901 return false;
3903 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3904 break;
3906 op0 = gimple_assign_rhs1 (def_stmt);
3907 code = gimple_assign_rhs_code (def_stmt);
3908 op1 = gimple_assign_rhs2 (def_stmt);
3910 else
3912 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3913 return false;
3914 code = TREE_CODE (off);
3915 extract_ops_from_tree (off, &code, &op0, &op1);
3917 switch (code)
3919 case POINTER_PLUS_EXPR:
3920 case PLUS_EXPR:
3921 if (expr_invariant_in_loop_p (loop, op0))
3923 add = op0;
3924 off = op1;
3925 do_add:
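/* ADD is loop invariant, so fold it into BASE.  Any scale already peeled
off OFF applies to ADD as well, so multiply it in to keep
BASE + OFF * SCALE unchanged.  */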
3926 add = fold_convert (sizetype, add);
3927 if (scale != 1)
3928 add = size_binop (MULT_EXPR, add, size_int (scale));
3929 base = size_binop (PLUS_EXPR, base, add);
3930 continue;
3932 if (expr_invariant_in_loop_p (loop, op1))
3934 add = op1;
3935 off = op0;
3936 goto do_add;
3938 break;
3939 case MINUS_EXPR:
3940 if (expr_invariant_in_loop_p (loop, op1))
3942 add = fold_convert (sizetype, op1);
3943 add = size_binop (MINUS_EXPR, size_zero_node, add);
3944 off = op0;
3945 goto do_add;
3947 break;
3948 case MULT_EXPR:
3949 if (scale == 1 && tree_fits_shwi_p (op1))
3951 int new_scale = tree_to_shwi (op1);
3952 /* Only treat this as a scaling operation if the target
3953 supports it for at least some offset type. */
3954 if (use_ifn_p
3955 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3956 masked_p, vectype, memory_type,
3957 signed_char_type_node,
3958 new_scale, &ifn,
3959 &offset_vectype)
3960 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3961 masked_p, vectype, memory_type,
3962 unsigned_char_type_node,
3963 new_scale, &ifn,
3964 &offset_vectype))
3965 break;
3966 scale = new_scale;
3967 off = op0;
3968 continue;
3970 break;
3971 case SSA_NAME:
3972 off = op0;
3973 continue;
3974 CASE_CONVERT:
3975 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3976 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3977 break;
3979 /* Don't include the conversion if the target is happy with
3980 the current offset type. */
3981 if (use_ifn_p
3982 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3983 masked_p, vectype, memory_type,
3984 TREE_TYPE (off), scale, &ifn,
3985 &offset_vectype))
3986 break;
3988 if (TYPE_PRECISION (TREE_TYPE (op0))
3989 == TYPE_PRECISION (TREE_TYPE (off)))
3991 off = op0;
3992 continue;
3995 if (TYPE_PRECISION (TREE_TYPE (op0))
3996 < TYPE_PRECISION (TREE_TYPE (off)))
3998 off = op0;
3999 offtype = TREE_TYPE (off);
4000 STRIP_NOPS (off);
4001 continue;
4003 break;
4004 default:
4005 break;
4007 break;
4010 /* If at the end OFF still isn't a SSA_NAME or isn't
4011 defined in the loop, punt. */
4012 if (TREE_CODE (off) != SSA_NAME
4013 || expr_invariant_in_loop_p (loop, off))
4014 return false;
4016 if (offtype == NULL_TREE)
4017 offtype = TREE_TYPE (off);
4019 if (use_ifn_p)
4021 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4022 vectype, memory_type, offtype, scale,
4023 &ifn, &offset_vectype))
4024 return false;
4026 else
4028 if (DR_IS_READ (dr))
4030 if (targetm.vectorize.builtin_gather)
4031 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4033 else
4035 if (targetm.vectorize.builtin_scatter)
4036 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4039 if (!decl)
4040 return false;
4042 ifn = IFN_LAST;
4043 /* The offset vector type will be read from DECL when needed. */
4044 offset_vectype = NULL_TREE;
4047 info->ifn = ifn;
4048 info->decl = decl;
4049 info->base = base;
4050 info->offset = off;
4051 info->offset_dt = vect_unknown_def_type;
4052 info->offset_vectype = offset_vectype;
4053 info->scale = scale;
4054 info->element_type = TREE_TYPE (vectype);
4055 info->memory_type = memory_type;
4056 return true;
4059 /* Find the data references in STMT, analyze them with respect to LOOP and
4060 append them to DATAREFS. Return false if datarefs in this stmt cannot
4061 be handled. */
4063 opt_result
4064 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4065 vec<data_reference_p> *datarefs,
4066 vec<int> *dataref_groups, int group_id)
4068 /* We can ignore clobbers for dataref analysis - they are removed during
4069 loop vectorization and BB vectorization checks dependences with a
4070 stmt walk. */
4071 if (gimple_clobber_p (stmt))
4072 return opt_result::success ();
4074 if (gimple_has_volatile_ops (stmt))
4075 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4076 stmt);
4078 if (stmt_can_throw_internal (cfun, stmt))
4079 return opt_result::failure_at (stmt,
4080 "not vectorized:"
4081 " statement can throw an exception: %G",
4082 stmt);
4084 auto_vec<data_reference_p, 2> refs;
4085 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4086 if (!res)
4087 return res;
4089 if (refs.is_empty ())
4090 return opt_result::success ();
4092 if (refs.length () > 1)
4094 while (!refs.is_empty ())
4095 free_data_ref (refs.pop ());
4096 return opt_result::failure_at (stmt,
4097 "not vectorized: more than one "
4098 "data ref in stmt: %G", stmt);
4101 data_reference_p dr = refs.pop ();
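/* The only calls that may carry an analyzable data reference are the
internal masked load/store functions; a data reference in any other call
defeats vectorization of this statement.  */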
4102 if (gcall *call = dyn_cast <gcall *> (stmt))
4103 if (!gimple_call_internal_p (call)
4104 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4105 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4107 free_data_ref (dr);
4108 return opt_result::failure_at (stmt,
4109 "not vectorized: dr in a call %G", stmt);
4112 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4113 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4115 free_data_ref (dr);
4116 return opt_result::failure_at (stmt,
4117 "not vectorized:"
4118 " statement is bitfield access %G", stmt);
4121 if (DR_BASE_ADDRESS (dr)
4122 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4124 free_data_ref (dr);
4125 return opt_result::failure_at (stmt,
4126 "not vectorized:"
4127 " base addr of dr is a constant\n");
4130 /* Check whether this may be a SIMD lane access and adjust the
4131 DR to make it easier for us to handle it. */
4132 if (loop
4133 && loop->simduid
4134 && (!DR_BASE_ADDRESS (dr)
4135 || !DR_OFFSET (dr)
4136 || !DR_INIT (dr)
4137 || !DR_STEP (dr)))
4139 struct data_reference *newdr
4140 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4141 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4142 if (DR_BASE_ADDRESS (newdr)
4143 && DR_OFFSET (newdr)
4144 && DR_INIT (newdr)
4145 && DR_STEP (newdr)
4146 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4147 && integer_zerop (DR_STEP (newdr)))
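/* Relative to its containing loop the address is invariant and has a
constant DR_INIT; see whether it decomposes as
BASE + GOMP_SIMD_LANE (SIMDUID) * ELEMENT_SIZE, which identifies a
SIMD-lane-private array access.  */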
4149 tree base_address = DR_BASE_ADDRESS (newdr);
4150 tree off = DR_OFFSET (newdr);
4151 tree step = ssize_int (1);
4152 if (integer_zerop (off)
4153 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4155 off = TREE_OPERAND (base_address, 1);
4156 base_address = TREE_OPERAND (base_address, 0);
4158 STRIP_NOPS (off);
4159 if (TREE_CODE (off) == MULT_EXPR
4160 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4162 step = TREE_OPERAND (off, 1);
4163 off = TREE_OPERAND (off, 0);
4164 STRIP_NOPS (off);
4166 if (CONVERT_EXPR_P (off)
4167 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4168 < TYPE_PRECISION (TREE_TYPE (off))))
4169 off = TREE_OPERAND (off, 0);
4170 if (TREE_CODE (off) == SSA_NAME)
4172 gimple *def = SSA_NAME_DEF_STMT (off);
4173 /* Look through widening conversion. */
4174 if (is_gimple_assign (def)
4175 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4177 tree rhs1 = gimple_assign_rhs1 (def);
4178 if (TREE_CODE (rhs1) == SSA_NAME
4179 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4180 && (TYPE_PRECISION (TREE_TYPE (off))
4181 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4182 def = SSA_NAME_DEF_STMT (rhs1);
4184 if (is_gimple_call (def)
4185 && gimple_call_internal_p (def)
4186 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4188 tree arg = gimple_call_arg (def, 0);
4189 tree reft = TREE_TYPE (DR_REF (newdr));
4190 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4191 arg = SSA_NAME_VAR (arg);
4192 if (arg == loop->simduid
4193 /* For now. */
4194 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4196 DR_BASE_ADDRESS (newdr) = base_address;
4197 DR_OFFSET (newdr) = ssize_int (0);
4198 DR_STEP (newdr) = step;
4199 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4200 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4201 /* Mark as simd-lane access. */
4202 tree arg2 = gimple_call_arg (def, 1);
4203 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4204 free_data_ref (dr);
4205 datarefs->safe_push (newdr);
4206 if (dataref_groups)
4207 dataref_groups->safe_push (group_id);
4208 return opt_result::success ();
4213 free_data_ref (newdr);
4216 datarefs->safe_push (dr);
4217 if (dataref_groups)
4218 dataref_groups->safe_push (group_id);
4219 return opt_result::success ();
4222 /* Function vect_analyze_data_refs.
4224 Find all the data references in the loop or basic block.
4226 The general structure of the analysis of data refs in the vectorizer is as
4227 follows:
4228 1- vect_analyze_data_refs(loop/bb): call
4229 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4230 in the loop/bb and their dependences.
4231 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4232 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4233 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4237 opt_result
4238 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4240 class loop *loop = NULL;
4241 unsigned int i;
4242 struct data_reference *dr;
4243 tree scalar_type;
4245 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4247 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4248 loop = LOOP_VINFO_LOOP (loop_vinfo);
4250 /* Go through the data-refs, check that the analysis succeeded. Update
4251 pointer from stmt_vec_info struct to DR and vectype. */
4253 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4254 FOR_EACH_VEC_ELT (datarefs, i, dr)
4256 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4257 poly_uint64 vf;
4259 gcc_assert (DR_REF (dr));
4260 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4261 gcc_assert (!stmt_info->dr_aux.dr);
4262 stmt_info->dr_aux.dr = dr;
4263 stmt_info->dr_aux.stmt = stmt_info;
4265 /* Check that analysis of the data-ref succeeded. */
4266 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4267 || !DR_STEP (dr))
4269 bool maybe_gather
4270 = DR_IS_READ (dr)
4271 && !TREE_THIS_VOLATILE (DR_REF (dr))
4272 && (targetm.vectorize.builtin_gather != NULL
4273 || supports_vec_gather_load_p ());
4274 bool maybe_scatter
4275 = DR_IS_WRITE (dr)
4276 && !TREE_THIS_VOLATILE (DR_REF (dr))
4277 && (targetm.vectorize.builtin_scatter != NULL
4278 || supports_vec_scatter_store_p ());
4280 /* If the target supports vector gather loads or scatter stores,
4281 see whether they can be used. */
4282 if (is_a <loop_vec_info> (vinfo)
4283 && !nested_in_vect_loop_p (loop, stmt_info))
4285 if (maybe_gather || maybe_scatter)
4287 if (maybe_gather)
4288 gatherscatter = GATHER;
4289 else
4290 gatherscatter = SCATTER;
4294 if (gatherscatter == SG_NONE)
4296 if (dump_enabled_p ())
4297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4298 "not vectorized: data ref analysis "
4299 "failed %G", stmt_info->stmt);
4300 if (is_a <bb_vec_info> (vinfo))
4302 /* In BB vectorization the ref can still participate
4303 in dependence analysis, we just can't vectorize it. */
4304 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4305 continue;
4307 return opt_result::failure_at (stmt_info->stmt,
4308 "not vectorized:"
4309 " data ref analysis failed: %G",
4310 stmt_info->stmt);
4314 /* See if this was detected as SIMD lane access. */
4315 if (dr->aux == (void *)-1
4316 || dr->aux == (void *)-2
4317 || dr->aux == (void *)-3
4318 || dr->aux == (void *)-4)
4320 if (nested_in_vect_loop_p (loop, stmt_info))
4321 return opt_result::failure_at (stmt_info->stmt,
4322 "not vectorized:"
4323 " data ref analysis failed: %G",
4324 stmt_info->stmt);
4325 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4326 = -(uintptr_t) dr->aux;
4329 tree base = get_base_address (DR_REF (dr));
4330 if (base && VAR_P (base) && DECL_NONALIASED (base))
4332 if (dump_enabled_p ())
4333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4334 "not vectorized: base object not addressable "
4335 "for stmt: %G", stmt_info->stmt);
4336 if (is_a <bb_vec_info> (vinfo))
4338 /* In BB vectorization the ref can still participate
4339 in dependence analysis, we just can't vectorize it. */
4340 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4341 continue;
4343 return opt_result::failure_at (stmt_info->stmt,
4344 "not vectorized: base object not"
4345 " addressable for stmt: %G",
4346 stmt_info->stmt);
4349 if (is_a <loop_vec_info> (vinfo)
4350 && DR_STEP (dr)
4351 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
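/* A step that is not a compile-time constant cannot be handled in a
nested inner loop; otherwise record the access as strided (variable
stride) and let later access analysis decide how to vectorize it.  */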
4353 if (nested_in_vect_loop_p (loop, stmt_info))
4354 return opt_result::failure_at (stmt_info->stmt,
4355 "not vectorized: "
4356 "not suitable for strided load %G",
4357 stmt_info->stmt);
4358 STMT_VINFO_STRIDED_P (stmt_info) = true;
4361 /* Update DR field in stmt_vec_info struct. */
4363 /* If the dataref is in an inner-loop of the loop that is considered
4364 for vectorization, we also want to analyze the access relative to
4365 the outer-loop (DR contains information only relative to the
4366 inner-most enclosing loop). We do that by building a reference to the
4367 first location accessed by the inner-loop, and analyze it relative to
4368 the outer-loop. */
4369 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4371 /* Build a reference to the first location accessed by the
4372 inner loop: *(BASE + INIT + OFFSET). By construction,
4373 this address must be invariant in the inner loop, so we
4374 can consider it as being used in the outer loop. */
4375 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4376 tree offset = unshare_expr (DR_OFFSET (dr));
4377 tree init = unshare_expr (DR_INIT (dr));
4378 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4379 init, offset);
4380 tree init_addr = fold_build_pointer_plus (base, init_offset);
4381 tree init_ref = build_fold_indirect_ref (init_addr);
4383 if (dump_enabled_p ())
4384 dump_printf_loc (MSG_NOTE, vect_location,
4385 "analyze in outer loop: %T\n", init_ref);
4387 opt_result res
4388 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4389 init_ref, loop, stmt_info->stmt);
4390 if (!res)
4391 /* dr_analyze_innermost already explained the failure. */
4392 return res;
4394 if (dump_enabled_p ())
4395 dump_printf_loc (MSG_NOTE, vect_location,
4396 "\touter base_address: %T\n"
4397 "\touter offset from base address: %T\n"
4398 "\touter constant offset from base address: %T\n"
4399 "\touter step: %T\n"
4400 "\touter base alignment: %d\n\n"
4401 "\touter base misalignment: %d\n"
4402 "\touter offset alignment: %d\n"
4403 "\touter step alignment: %d\n",
4404 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4405 STMT_VINFO_DR_OFFSET (stmt_info),
4406 STMT_VINFO_DR_INIT (stmt_info),
4407 STMT_VINFO_DR_STEP (stmt_info),
4408 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4409 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4410 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4411 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4414 /* Set vectype for STMT. */
4415 scalar_type = TREE_TYPE (DR_REF (dr));
4416 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4417 if (!vectype)
4419 if (dump_enabled_p ())
4421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4422 "not vectorized: no vectype for stmt: %G",
4423 stmt_info->stmt);
4424 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4425 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4426 scalar_type);
4427 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4430 if (is_a <bb_vec_info> (vinfo))
4432 /* No vector type is fine, the ref can still participate
4433 in dependence analysis, we just can't vectorize it. */
4434 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4435 continue;
4437 if (fatal)
4438 *fatal = false;
4439 return opt_result::failure_at (stmt_info->stmt,
4440 "not vectorized:"
4441 " no vectype for stmt: %G"
4442 " scalar_type: %T\n",
4443 stmt_info->stmt, scalar_type);
4445 else
4447 if (dump_enabled_p ())
4448 dump_printf_loc (MSG_NOTE, vect_location,
4449 "got vectype for stmt: %G%T\n",
4450 stmt_info->stmt, vectype);
4453 /* Adjust the minimal vectorization factor according to the
4454 vector type. */
4455 vf = TYPE_VECTOR_SUBPARTS (vectype);
4456 *min_vf = upper_bound (*min_vf, vf);
4458 /* Leave the BB vectorizer to pick the vector type later, based on
4459 the final dataref group size and SLP node size. */
4460 if (is_a <loop_vec_info> (vinfo))
4461 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4463 if (gatherscatter != SG_NONE)
4465 gather_scatter_info gs_info;
4466 if (!vect_check_gather_scatter (stmt_info,
4467 as_a <loop_vec_info> (vinfo),
4468 &gs_info)
4469 || !get_vectype_for_scalar_type (vinfo,
4470 TREE_TYPE (gs_info.offset)))
4472 if (fatal)
4473 *fatal = false;
4474 return opt_result::failure_at
4475 (stmt_info->stmt,
4476 (gatherscatter == GATHER)
4477 ? "not vectorized: not suitable for gather load %G"
4478 : "not vectorized: not suitable for scatter store %G",
4479 stmt_info->stmt);
4481 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4485 /* We used to stop processing and prune the list here. Verify we no
4486 longer need to. */
4487 gcc_assert (i == datarefs.length ());
4489 return opt_result::success ();
4493 /* Function vect_get_new_vect_var.
4495 Returns a name for a new variable. The current naming scheme prepends
4496 a prefix determined by VAR_KIND ("vect_", "vectp_", "stmp_" or "mask_")
4497 to the name of vectorizer-generated variables, using NAME as the stem
4498 if provided. */
4500 tree
4501 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4503 const char *prefix;
4504 tree new_vect_var;
4506 switch (var_kind)
4508 case vect_simple_var:
4509 prefix = "vect";
4510 break;
4511 case vect_scalar_var:
4512 prefix = "stmp";
4513 break;
4514 case vect_mask_var:
4515 prefix = "mask";
4516 break;
4517 case vect_pointer_var:
4518 prefix = "vectp";
4519 break;
4520 default:
4521 gcc_unreachable ();
4524 if (name)
4526 char* tmp = concat (prefix, "_", name, NULL);
4527 new_vect_var = create_tmp_reg (type, tmp);
4528 free (tmp);
4530 else
4531 new_vect_var = create_tmp_reg (type, prefix);
4533 return new_vect_var;
4536 /* Like vect_get_new_vect_var but return an SSA name. */
4538 tree
4539 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4541 const char *prefix;
4542 tree new_vect_var;
4544 switch (var_kind)
4546 case vect_simple_var:
4547 prefix = "vect";
4548 break;
4549 case vect_scalar_var:
4550 prefix = "stmp";
4551 break;
4552 case vect_pointer_var:
4553 prefix = "vectp";
4554 break;
4555 default:
4556 gcc_unreachable ();
4559 if (name)
4561 char* tmp = concat (prefix, "_", name, NULL);
4562 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4563 free (tmp);
4565 else
4566 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4568 return new_vect_var;
4571 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4573 static void
4574 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4576 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4577 int misalign = DR_MISALIGNMENT (dr_info);
4578 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4579 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4580 else
4581 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4582 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4583 misalign);
4586 /* Function vect_create_addr_base_for_vector_ref.
4588 Create an expression that computes the address of the first memory location
4589 that will be accessed for a data reference.
4591 Input:
4592 STMT_INFO: The statement containing the data reference.
4593 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4594 OFFSET: Optional. If supplied, it is added to the initial address.
4595 LOOP: Specify relative to which loop-nest should the address be computed.
4596 For example, when the dataref is in an inner-loop nested in an
4597 outer-loop that is now being vectorized, LOOP can be either the
4598 outer-loop, or the inner-loop. The first memory location accessed
4599 by the following dataref ('in' points to short):
4601 for (i=0; i<N; i++)
4602 for (j=0; j<M; j++)
4603 s += in[i+j]
4605 is as follows:
4606 if LOOP=i_loop: &in (relative to i_loop)
4607 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4608 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4609 initial address. Unlike OFFSET, which is a number of elements to
4610 be added, BYTE_OFFSET is measured in bytes.
4612 Output:
4613 1. Return an SSA_NAME whose value is the address of the memory location of
4614 the first vector of the data reference.
4615 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4616 these statement(s) which define the returned SSA_NAME.
4618 FORNOW: We are only handling array accesses with step 1. */
4620 tree
4621 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4622 gimple_seq *new_stmt_list,
4623 tree offset,
4624 tree byte_offset)
4626 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4627 struct data_reference *dr = dr_info->dr;
4628 const char *base_name;
4629 tree addr_base;
4630 tree dest;
4631 gimple_seq seq = NULL;
4632 tree vect_ptr_type;
4633 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4634 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4635 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4637 tree data_ref_base = unshare_expr (drb->base_address);
4638 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4639 tree init = unshare_expr (drb->init);
4641 if (loop_vinfo)
4642 base_name = get_name (data_ref_base);
4643 else
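/* For basic-block vectorization there is no loop to hoist the address
out of: the address of DR_REF itself is taken further below and both
offsets start at zero.  */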
4645 base_offset = ssize_int (0);
4646 init = ssize_int (0);
4647 base_name = get_name (DR_REF (dr));
4650 /* Create base_offset */
4651 base_offset = size_binop (PLUS_EXPR,
4652 fold_convert (sizetype, base_offset),
4653 fold_convert (sizetype, init));
4655 if (offset)
4657 offset = fold_build2 (MULT_EXPR, sizetype,
4658 fold_convert (sizetype, offset), step);
4659 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4660 base_offset, offset);
4662 if (byte_offset)
4664 byte_offset = fold_convert (sizetype, byte_offset);
4665 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4666 base_offset, byte_offset);
4669 /* base + base_offset */
4670 if (loop_vinfo)
4671 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4672 else
4674 addr_base = build1 (ADDR_EXPR,
4675 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4676 unshare_expr (DR_REF (dr)));
4679 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4680 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4681 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4682 gimple_seq_add_seq (new_stmt_list, seq);
4684 if (DR_PTR_INFO (dr)
4685 && TREE_CODE (addr_base) == SSA_NAME
4686 && !SSA_NAME_PTR_INFO (addr_base))
4688 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4689 if (offset || byte_offset)
4690 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4693 if (dump_enabled_p ())
4694 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4696 return addr_base;
4700 /* Function vect_create_data_ref_ptr.
4702 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4703 location accessed in the loop by STMT_INFO, along with the def-use update
4704 chain to appropriately advance the pointer through the loop iterations.
4705 Also set aliasing information for the pointer. This pointer is used by
4706 the callers to this function to create a memory reference expression for
4707 vector load/store access.
4709 Input:
4710 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4711 GIMPLE_ASSIGN <name, data-ref> or
4712 GIMPLE_ASSIGN <data-ref, name>.
4713 2. AGGR_TYPE: the type of the reference, which should be either a vector
4714 or an array.
4715 3. AT_LOOP: the loop where the vector memref is to be created.
4716 4. OFFSET (optional): an offset to be added to the initial address accessed
4717 by the data-ref in STMT_INFO.
4718 5. BSI: location where the new stmts are to be placed if there is no loop
4719 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4720 pointing to the initial address.
4721 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4722 to the initial address accessed by the data-ref in STMT_INFO. This is
4723 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4724 in bytes.
4725 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4726 to the IV during each iteration of the loop. NULL says to move
4727 by one copy of AGGR_TYPE up or down, depending on the step of the
4728 data reference.
4730 Output:
4731 1. Declare a new ptr to vector_type, and have it point to the base of the
4732 data reference (initial address accessed by the data reference).
4733 For example, for vector of type V8HI, the following code is generated:
4735 v8hi *ap;
4736 ap = (v8hi *)initial_address;
4738 if OFFSET is not supplied:
4739 initial_address = &a[init];
4740 if OFFSET is supplied:
4741 initial_address = &a[init + OFFSET];
4742 if BYTE_OFFSET is supplied:
4743 initial_address = &a[init] + BYTE_OFFSET;
4745 Return the initial_address in INITIAL_ADDRESS.
4747 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4748 update the pointer in each iteration of the loop.
4750 Return the increment stmt that updates the pointer in PTR_INCR.
4752 3. Return the pointer. */
4754 tree
4755 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4756 tree aggr_type, class loop *at_loop, tree offset,
4757 tree *initial_address, gimple_stmt_iterator *gsi,
4758 gimple **ptr_incr, bool only_init,
4759 tree byte_offset, tree iv_step)
4761 const char *base_name;
4762 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4763 class loop *loop = NULL;
4764 bool nested_in_vect_loop = false;
4765 class loop *containing_loop = NULL;
4766 tree aggr_ptr_type;
4767 tree aggr_ptr;
4768 tree new_temp;
4769 gimple_seq new_stmt_list = NULL;
4770 edge pe = NULL;
4771 basic_block new_bb;
4772 tree aggr_ptr_init;
4773 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4774 struct data_reference *dr = dr_info->dr;
4775 tree aptr;
4776 gimple_stmt_iterator incr_gsi;
4777 bool insert_after;
4778 tree indx_before_incr, indx_after_incr;
4779 gimple *incr;
4780 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4782 gcc_assert (iv_step != NULL_TREE
4783 || TREE_CODE (aggr_type) == ARRAY_TYPE
4784 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4786 if (loop_vinfo)
4788 loop = LOOP_VINFO_LOOP (loop_vinfo);
4789 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4790 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4791 pe = loop_preheader_edge (loop);
4793 else
4795 gcc_assert (bb_vinfo);
4796 only_init = true;
4797 *ptr_incr = NULL;
4800 /* Create an expression for the first address accessed by this load
4801 in LOOP. */
4802 base_name = get_name (DR_BASE_ADDRESS (dr));
4804 if (dump_enabled_p ())
4806 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4807 dump_printf_loc (MSG_NOTE, vect_location,
4808 "create %s-pointer variable to type: %T",
4809 get_tree_code_name (TREE_CODE (aggr_type)),
4810 aggr_type);
4811 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4812 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4813 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4814 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4815 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4816 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4817 else
4818 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4819 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4822 /* (1) Create the new aggregate-pointer variable.
4823 Vector and array types inherit the alias set of their component
4824 type by default so we need to use a ref-all pointer if the data
4825 reference does not conflict with the created aggregated data
4826 reference because it is not addressable. */
4827 bool need_ref_all = false;
4828 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4829 get_alias_set (DR_REF (dr))))
4830 need_ref_all = true;
4831 /* Likewise for any of the data references in the stmt group. */
4832 else if (DR_GROUP_SIZE (stmt_info) > 1)
4834 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4837 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4838 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4839 get_alias_set (DR_REF (sdr))))
4841 need_ref_all = true;
4842 break;
4844 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4846 while (sinfo);
4848 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4849 need_ref_all);
4850 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4853 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4854 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4855 def-use update cycles for the pointer: one relative to the outer-loop
4856 (LOOP), which is what steps (3) and (4) below do. The other is relative
4857 to the inner-loop (which is the inner-most loop containing the dataref),
4858 and this is done by step (5) below.
4860 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4861 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4862 redundant. Steps (3),(4) create the following:
4864 vp0 = &base_addr;
4865 LOOP: vp1 = phi(vp0,vp2)
4868 vp2 = vp1 + step
4869 goto LOOP
4871 If there is an inner-loop nested in loop, then step (5) will also be
4872 applied, and an additional update in the inner-loop will be created:
4874 vp0 = &base_addr;
4875 LOOP: vp1 = phi(vp0,vp2)
4877 inner: vp3 = phi(vp1,vp4)
4878 vp4 = vp3 + inner_step
4879 if () goto inner
4881 vp2 = vp1 + step
4882 if () goto LOOP */
4884 /* (2) Calculate the initial address of the aggregate-pointer, and set
4885 the aggregate-pointer to point to it before the loop. */
4887 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4889 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4890 stmt_info, &new_stmt_list,
4891 offset, byte_offset);
4892 if (new_stmt_list)
4894 if (pe)
4896 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4897 gcc_assert (!new_bb);
4899 else
4900 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4903 *initial_address = new_temp;
4904 aggr_ptr_init = new_temp;
4906 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4907 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4908 inner-loop nested in LOOP (during outer-loop vectorization). */
4910 /* No update in loop is required. */
4911 if (only_init && (!loop_vinfo || at_loop == loop))
4912 aptr = aggr_ptr_init;
4913 else
4915 /* Accesses to invariant addresses should be handled specially
4916 by the caller. */
4917 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4918 gcc_assert (!integer_zerop (step));
4920 if (iv_step == NULL_TREE)
4922 /* The step of the aggregate pointer is the type size,
4923 negated for downward accesses. */
4924 iv_step = TYPE_SIZE_UNIT (aggr_type);
4925 if (tree_int_cst_sgn (step) == -1)
4926 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4929 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4931 create_iv (aggr_ptr_init,
4932 fold_convert (aggr_ptr_type, iv_step),
4933 aggr_ptr, loop, &incr_gsi, insert_after,
4934 &indx_before_incr, &indx_after_incr);
4935 incr = gsi_stmt (incr_gsi);
4937 /* Copy the points-to information if it exists. */
4938 if (DR_PTR_INFO (dr))
4940 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4941 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4943 if (ptr_incr)
4944 *ptr_incr = incr;
4946 aptr = indx_before_incr;
4949 if (!nested_in_vect_loop || only_init)
4950 return aptr;
4953 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4954 nested in LOOP, if exists. */
4956 gcc_assert (nested_in_vect_loop);
4957 if (!only_init)
4959 standard_iv_increment_position (containing_loop, &incr_gsi,
4960 &insert_after);
4961 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4962 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4963 &indx_after_incr);
4964 incr = gsi_stmt (incr_gsi);
4966 /* Copy the points-to information if it exists. */
4967 if (DR_PTR_INFO (dr))
4969 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4970 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4972 if (ptr_incr)
4973 *ptr_incr = incr;
4975 return indx_before_incr;
4977 else
4978 gcc_unreachable ();
4982 /* Function bump_vector_ptr
4984 Increment a pointer (to a vector type) by vector-size. If requested,
4985 i.e. if PTR-INCR is given, then also connect the new increment stmt
4986 to the existing def-use update-chain of the pointer, by modifying
4987 the PTR_INCR as illustrated below:
4989 The pointer def-use update-chain before this function:
4990 DATAREF_PTR = phi (p_0, p_2)
4991 ....
4992 PTR_INCR: p_2 = DATAREF_PTR + step
4994 The pointer def-use update-chain after this function:
4995 DATAREF_PTR = phi (p_0, p_2)
4996 ....
4997 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4998 ....
4999 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5001 Input:
5002 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5003 in the loop.
5004 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5005 the loop. The increment amount across iterations is expected
5006 to be vector_size.
5007 BSI - location where the new update stmt is to be placed.
5008 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5009 BUMP - optional. The offset by which to bump the pointer. If not given,
5010 the offset is assumed to be vector_size.
5012 Output: Return NEW_DATAREF_PTR as illustrated above.
5016 tree
5017 bump_vector_ptr (vec_info *vinfo,
5018 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5019 stmt_vec_info stmt_info, tree bump)
5021 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5022 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5023 tree update = TYPE_SIZE_UNIT (vectype);
5024 gassign *incr_stmt;
5025 ssa_op_iter iter;
5026 use_operand_p use_p;
5027 tree new_dataref_ptr;
5029 if (bump)
5030 update = bump;
5032 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5033 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5034 else
5035 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5036 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5037 dataref_ptr, update);
5038 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5040 /* Copy the points-to information if it exists. */
5041 if (DR_PTR_INFO (dr))
5043 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5044 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5047 if (!ptr_incr)
5048 return new_dataref_ptr;
5050 /* Update the vector-pointer's cross-iteration increment. */
5051 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5053 tree use = USE_FROM_PTR (use_p);
5055 if (use == dataref_ptr)
5056 SET_USE (use_p, new_dataref_ptr);
5057 else
5058 gcc_assert (operand_equal_p (use, update, 0));
5061 return new_dataref_ptr;
5065 /* Copy memory reference info such as base/clique from the SRC reference
5066 to the DEST MEM_REF. */
5068 void
5069 vect_copy_ref_info (tree dest, tree src)
5071 if (TREE_CODE (dest) != MEM_REF)
5072 return;
5074 tree src_base = src;
5075 while (handled_component_p (src_base))
5076 src_base = TREE_OPERAND (src_base, 0);
5077 if (TREE_CODE (src_base) != MEM_REF
5078 && TREE_CODE (src_base) != TARGET_MEM_REF)
5079 return;
5081 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5082 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5086 /* Function vect_create_destination_var.
5088 Create a new temporary of type VECTYPE. */
5090 tree
5091 vect_create_destination_var (tree scalar_dest, tree vectype)
5093 tree vec_dest;
5094 const char *name;
5095 char *new_name;
5096 tree type;
5097 enum vect_var_kind kind;
5099 kind = vectype
5100 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5101 ? vect_mask_var
5102 : vect_simple_var
5103 : vect_scalar_var;
5104 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5106 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5108 name = get_name (scalar_dest);
5109 if (name)
5110 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5111 else
5112 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5113 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5114 free (new_name);
5116 return vec_dest;
5119 /* Function vect_grouped_store_supported.
5121 Returns TRUE if interleave high and interleave low permutations
5122 are supported, and FALSE otherwise. */
5124 bool
5125 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5127 machine_mode mode = TYPE_MODE (vectype);
5129 /* vect_permute_store_chain requires the group size to be equal to 3 or
5130 be a power of two. */
5131 if (count != 3 && exact_log2 (count) == -1)
5133 if (dump_enabled_p ())
5134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5135 "the size of the group of accesses"
5136 " is not a power of 2 or not eqaul to 3\n");
5137 return false;
5140 /* Check that the permutation is supported. */
5141 if (VECTOR_MODE_P (mode))
5143 unsigned int i;
5144 if (count == 3)
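/* A group of 3 stores is permuted with two VEC_PERM_EXPRs per output
vector: the first interleaves elements of two input vectors and the
second blends in elements of the third.  Check that the target supports
both selector patterns.  */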
5146 unsigned int j0 = 0, j1 = 0, j2 = 0;
5147 unsigned int i, j;
5149 unsigned int nelt;
5150 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5152 if (dump_enabled_p ())
5153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5154 "cannot handle groups of 3 stores for"
5155 " variable-length vectors\n");
5156 return false;
5159 vec_perm_builder sel (nelt, nelt, 1);
5160 sel.quick_grow (nelt);
5161 vec_perm_indices indices;
5162 for (j = 0; j < 3; j++)
5164 int nelt0 = ((3 - j) * nelt) % 3;
5165 int nelt1 = ((3 - j) * nelt + 1) % 3;
5166 int nelt2 = ((3 - j) * nelt + 2) % 3;
5167 for (i = 0; i < nelt; i++)
5169 if (3 * i + nelt0 < nelt)
5170 sel[3 * i + nelt0] = j0++;
5171 if (3 * i + nelt1 < nelt)
5172 sel[3 * i + nelt1] = nelt + j1++;
5173 if (3 * i + nelt2 < nelt)
5174 sel[3 * i + nelt2] = 0;
5176 indices.new_vector (sel, 2, nelt);
5177 if (!can_vec_perm_const_p (mode, indices))
5179 if (dump_enabled_p ())
5180 dump_printf (MSG_MISSED_OPTIMIZATION,
5181 "permutation op not supported by target.\n");
5182 return false;
5185 for (i = 0; i < nelt; i++)
5187 if (3 * i + nelt0 < nelt)
5188 sel[3 * i + nelt0] = 3 * i + nelt0;
5189 if (3 * i + nelt1 < nelt)
5190 sel[3 * i + nelt1] = 3 * i + nelt1;
5191 if (3 * i + nelt2 < nelt)
5192 sel[3 * i + nelt2] = nelt + j2++;
5194 indices.new_vector (sel, 2, nelt);
5195 if (!can_vec_perm_const_p (mode, indices))
5197 if (dump_enabled_p ())
5198 dump_printf (MSG_MISSED_OPTIMIZATION,
5199 "permutation op not supported by target.\n");
5200 return false;
5203 return true;
5205 else
5207 /* If length is not equal to 3 then only a power of 2 is supported. */
5208 gcc_assert (pow2p_hwi (count));
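/* For a power-of-two group the permutation is built from log2 (COUNT)
rounds of whole-vector interleave-high/low operations, so checking those
two selectors is sufficient.  */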
5209 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5211 /* The encoding has 2 interleaved stepped patterns. */
5212 vec_perm_builder sel (nelt, 2, 3);
5213 sel.quick_grow (6);
5214 for (i = 0; i < 3; i++)
5216 sel[i * 2] = i;
5217 sel[i * 2 + 1] = i + nelt;
5219 vec_perm_indices indices (sel, 2, nelt);
5220 if (can_vec_perm_const_p (mode, indices))
5222 for (i = 0; i < 6; i++)
5223 sel[i] += exact_div (nelt, 2);
5224 indices.new_vector (sel, 2, nelt);
5225 if (can_vec_perm_const_p (mode, indices))
5226 return true;
5231 if (dump_enabled_p ())
5232 dump_printf (MSG_MISSED_OPTIMIZATION,
5233 "permutation op not supported by target.\n");
5234 return false;
5238 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5239 type VECTYPE. MASKED_P says whether the masked form is needed. */
5241 bool
5242 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5243 bool masked_p)
5245 if (masked_p)
5246 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5247 vec_mask_store_lanes_optab,
5248 vectype, count);
5249 else
5250 return vect_lanes_optab_supported_p ("vec_store_lanes",
5251 vec_store_lanes_optab,
5252 vectype, count);
5256 /* Function vect_permute_store_chain.
5258 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5259 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5260 the data correctly for the stores. Return the final references for stores
5261 in RESULT_CHAIN.
5263 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5264 The input is 4 vectors each containing 8 elements. We assign a number to
5265 each element, the input sequence is:
5267 1st vec: 0 1 2 3 4 5 6 7
5268 2nd vec: 8 9 10 11 12 13 14 15
5269 3rd vec: 16 17 18 19 20 21 22 23
5270 4th vec: 24 25 26 27 28 29 30 31
5272 The output sequence should be:
5274 1st vec: 0 8 16 24 1 9 17 25
5275 2nd vec: 2 10 18 26 3 11 19 27
5276 3rd vec: 4 12 20 28 5 13 21 29
5277 4th vec: 6 14 22 30 7 15 23 31
5279 i.e., we interleave the contents of the four vectors in their order.
5281 We use interleave_high/low instructions to create such output. The input of
5282 each interleave_high/low operation is two vectors:
5283 1st vec 2nd vec
5284 0 1 2 3 4 5 6 7
5285 the even elements of the result vector are obtained left-to-right from the
5286 high/low elements of the first vector. The odd elements of the result are
5287 obtained left-to-right from the high/low elements of the second vector.
5288 The output of interleave_high will be: 0 4 1 5
5289 and of interleave_low: 2 6 3 7
5292 The permutation is done in log LENGTH stages. In each stage interleave_high
5293 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5294 where the first argument is taken from the first half of DR_CHAIN and the
5295 second argument from its second half.
5296 In our example,
5298 I1: interleave_high (1st vec, 3rd vec)
5299 I2: interleave_low (1st vec, 3rd vec)
5300 I3: interleave_high (2nd vec, 4th vec)
5301 I4: interleave_low (2nd vec, 4th vec)
5303 The output for the first stage is:
5305 I1: 0 16 1 17 2 18 3 19
5306 I2: 4 20 5 21 6 22 7 23
5307 I3: 8 24 9 25 10 26 11 27
5308 I4: 12 28 13 29 14 30 15 31
5310 The output of the second stage, i.e. the final result is:
5312 I1: 0 8 16 24 1 9 17 25
5313 I2: 2 10 18 26 3 11 19 27
5314 I3: 4 12 20 28 5 13 21 29
5315 I4: 6 14 22 30 7 15 23 31. */
5317 void
5318 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5319 unsigned int length,
5320 stmt_vec_info stmt_info,
5321 gimple_stmt_iterator *gsi,
5322 vec<tree> *result_chain)
5324 tree vect1, vect2, high, low;
5325 gimple *perm_stmt;
5326 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5327 tree perm_mask_low, perm_mask_high;
5328 tree data_ref;
5329 tree perm3_mask_low, perm3_mask_high;
5330 unsigned int i, j, n, log_length = exact_log2 (length);
5332 result_chain->quick_grow (length);
5333 memcpy (result_chain->address (), dr_chain.address (),
5334 length * sizeof (tree));
5336 if (length == 3)
5338 /* vect_grouped_store_supported ensures that this is constant. */
5339 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5340 unsigned int j0 = 0, j1 = 0, j2 = 0;
5342 vec_perm_builder sel (nelt, nelt, 1);
5343 sel.quick_grow (nelt);
5344 vec_perm_indices indices;
5345 for (j = 0; j < 3; j++)
5347 int nelt0 = ((3 - j) * nelt) % 3;
5348 int nelt1 = ((3 - j) * nelt + 1) % 3;
5349 int nelt2 = ((3 - j) * nelt + 2) % 3;
5351 for (i = 0; i < nelt; i++)
5353 if (3 * i + nelt0 < nelt)
5354 sel[3 * i + nelt0] = j0++;
5355 if (3 * i + nelt1 < nelt)
5356 sel[3 * i + nelt1] = nelt + j1++;
5357 if (3 * i + nelt2 < nelt)
5358 sel[3 * i + nelt2] = 0;
5360 indices.new_vector (sel, 2, nelt);
5361 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5363 for (i = 0; i < nelt; i++)
5365 if (3 * i + nelt0 < nelt)
5366 sel[3 * i + nelt0] = 3 * i + nelt0;
5367 if (3 * i + nelt1 < nelt)
5368 sel[3 * i + nelt1] = 3 * i + nelt1;
5369 if (3 * i + nelt2 < nelt)
5370 sel[3 * i + nelt2] = nelt + j2++;
5372 indices.new_vector (sel, 2, nelt);
5373 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5375 vect1 = dr_chain[0];
5376 vect2 = dr_chain[1];
5378 /* Create interleaving stmt:
5379 low = VEC_PERM_EXPR <vect1, vect2,
5380 {j, nelt, *, j + 1, nelt + j + 1, *,
5381 j + 2, nelt + j + 2, *, ...}> */
5382 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5383 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5384 vect2, perm3_mask_low);
5385 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5387 vect1 = data_ref;
5388 vect2 = dr_chain[2];
5389 /* Create interleaving stmt:
5390 low = VEC_PERM_EXPR <vect1, vect2,
5391 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5392 6, 7, nelt + j + 2, ...}> */
5393 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5394 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5395 vect2, perm3_mask_high);
5396 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5397 (*result_chain)[j] = data_ref;
5400 else
5402 /* If length is not equal to 3 then only a power of 2 is supported. */
5403 gcc_assert (pow2p_hwi (length));
5405 /* The encoding has 2 interleaved stepped patterns. */
5406 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5407 vec_perm_builder sel (nelt, 2, 3);
5408 sel.quick_grow (6);
5409 for (i = 0; i < 3; i++)
5411 sel[i * 2] = i;
5412 sel[i * 2 + 1] = i + nelt;
5414 vec_perm_indices indices (sel, 2, nelt);
5415 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5417 for (i = 0; i < 6; i++)
5418 sel[i] += exact_div (nelt, 2);
5419 indices.new_vector (sel, 2, nelt);
5420 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5422 for (i = 0, n = log_length; i < n; i++)
5424 for (j = 0; j < length/2; j++)
5426 vect1 = dr_chain[j];
5427 vect2 = dr_chain[j+length/2];
5429 /* Create interleaving stmt:
5430 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5431 ...}> */
5432 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5433 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5434 vect2, perm_mask_high);
5435 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5436 (*result_chain)[2*j] = high;
5438 /* Create interleaving stmt:
5439 low = VEC_PERM_EXPR <vect1, vect2,
5440 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5441 ...}> */
5442 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5443 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5444 vect2, perm_mask_low);
5445 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5446 (*result_chain)[2*j+1] = low;
5448 memcpy (dr_chain.address (), result_chain->address (),
5449 length * sizeof (tree));
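
/* A minimal scalar model (standalone sketch, not used by the vectorizer) of
   the log2 (LENGTH) interleave stages documented above, for LENGTH == 4
   vectors of 8 elements numbered 0..31.  CHAIN is updated in place,
   mirroring how DR_CHAIN is refreshed from RESULT_CHAIN after each stage;
   the function name is invented here.  */

static void
interleave_store_stages_model (int chain[4][8])
{
  int result[4][8];

  for (int stage = 0; stage < 2; stage++)       /* log2 (4) stages.  */
    {
      for (int j = 0; j < 2; j++)               /* LENGTH / 2 pairs.  */
        {
          const int *v1 = chain[j], *v2 = chain[j + 2];
          for (int i = 0; i < 4; i++)           /* NELT / 2 lanes.  */
            {
              result[2 * j][2 * i] = v1[i];             /* "high" output  */
              result[2 * j][2 * i + 1] = v2[i];
              result[2 * j + 1][2 * i] = v1[4 + i];     /* "low" output   */
              result[2 * j + 1][2 * i + 1] = v2[4 + i];
            }
        }
      for (int v = 0; v < 4; v++)
        for (int i = 0; i < 8; i++)
          chain[v][i] = result[v][i];
    }

  /* Starting from 0..7 / 8..15 / 16..23 / 24..31 this ends with
     0 8 16 24 1 9 17 25,  2 10 18 26 3 11 19 27,
     4 12 20 28 5 13 21 29,  6 14 22 30 7 15 23 31,
     matching the example in the comment above.  */
}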
5454 /* Function vect_setup_realignment
5456 This function is called when vectorizing an unaligned load using
5457 the dr_explicit_realign[_optimized] scheme.
5458 This function generates the following code at the loop prolog:
5460 p = initial_addr;
5461 x msq_init = *(floor(p)); # prolog load
5462 realignment_token = call target_builtin;
5463 loop:
5464 x msq = phi (msq_init, ---)
5466 The stmts marked with x are generated only for the case of
5467 dr_explicit_realign_optimized.
5469 The code above sets up a new (vector) pointer, pointing to the first
5470 location accessed by STMT_INFO, and a "floor-aligned" load using that
5471 pointer. It also generates code to compute the "realignment-token"
5472 (if the relevant target hook was defined), and creates a phi-node at the
5473 loop-header bb whose arguments are the result of the prolog-load (created
5474 by this function) and the result of a load that takes place in the loop
5475 (to be created by the caller to this function).
5477 For the case of dr_explicit_realign_optimized:
5478 The caller to this function uses the phi-result (msq) to create the
5479 realignment code inside the loop, and sets up the missing phi argument,
5480 as follows:
5481 loop:
5482 msq = phi (msq_init, lsq)
5483 lsq = *(floor(p')); # load in loop
5484 result = realign_load (msq, lsq, realignment_token);
5486 For the case of dr_explicit_realign:
5487 loop:
5488 msq = *(floor(p)); # load in loop
5489 p' = p + (VS-1);
5490 lsq = *(floor(p')); # load in loop
5491 result = realign_load (msq, lsq, realignment_token);
5493 Input:
5494 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5495 a memory location that may be unaligned.
5496 BSI - place where new code is to be inserted.
5497 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5498 is used.
5500 Output:
5501 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5502 target hook, if defined.
5503 Return value - the result of the loop-header phi node. */
5505 tree
5506 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5507 gimple_stmt_iterator *gsi, tree *realignment_token,
5508 enum dr_alignment_support alignment_support_scheme,
5509 tree init_addr,
5510 class loop **at_loop)
5512 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5513 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5514 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5515 struct data_reference *dr = dr_info->dr;
5516 class loop *loop = NULL;
5517 edge pe = NULL;
5518 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5519 tree vec_dest;
5520 gimple *inc;
5521 tree ptr;
5522 tree data_ref;
5523 basic_block new_bb;
5524 tree msq_init = NULL_TREE;
5525 tree new_temp;
5526 gphi *phi_stmt;
5527 tree msq = NULL_TREE;
5528 gimple_seq stmts = NULL;
5529 bool compute_in_loop = false;
5530 bool nested_in_vect_loop = false;
5531 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5532 class loop *loop_for_initial_load = NULL;
5534 if (loop_vinfo)
5536 loop = LOOP_VINFO_LOOP (loop_vinfo);
5537 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5540 gcc_assert (alignment_support_scheme == dr_explicit_realign
5541 || alignment_support_scheme == dr_explicit_realign_optimized);
5543 /* We need to generate three things:
5544 1. the misalignment computation
5545 2. the extra vector load (for the optimized realignment scheme).
5546 3. the phi node for the two vectors from which the realignment is
5547 done (for the optimized realignment scheme). */
5549 /* 1. Determine where to generate the misalignment computation.
5551 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5552 calculation will be generated by this function, outside the loop (in the
5553 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5554 caller, inside the loop.
5556 Background: If the misalignment remains fixed throughout the iterations of
5557 the loop, then both realignment schemes are applicable, and also the
5558 misalignment computation can be done outside LOOP. This is because we are
5559 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5560 are a multiple of VS (the Vector Size), and therefore the misalignment in
5561 different vectorized LOOP iterations is always the same.
5562 The problem arises only if the memory access is in an inner-loop nested
5563 inside LOOP, which is now being vectorized using outer-loop vectorization.
5564 This is the only case when the misalignment of the memory access may not
5565 remain fixed throughout the iterations of the inner-loop (as explained in
5566 detail in vect_supportable_dr_alignment). In this case, not only is the
5567 optimized realignment scheme not applicable, but also the misalignment
5568 computation (and generation of the realignment token that is passed to
5569 REALIGN_LOAD) have to be done inside the loop.
5571 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5572 or not, which in turn determines if the misalignment is computed inside
5573 the inner-loop, or outside LOOP. */
5575 if (init_addr != NULL_TREE || !loop_vinfo)
5577 compute_in_loop = true;
5578 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5582 /* 2. Determine where to generate the extra vector load.
5584 For the optimized realignment scheme, instead of generating two vector
5585 loads in each iteration, we generate a single extra vector load in the
5586 preheader of the loop, and in each iteration reuse the result of the
5587 vector load from the previous iteration. In case the memory access is in
5588 an inner-loop nested inside LOOP, which is now being vectorized using
5589 outer-loop vectorization, we need to determine whether this initial vector
5590 load should be generated at the preheader of the inner-loop, or can be
5591 generated at the preheader of LOOP. If the memory access has no evolution
5592 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5593 to be generated inside LOOP (in the preheader of the inner-loop). */
5595 if (nested_in_vect_loop)
5597 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5598 bool invariant_in_outerloop =
5599 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5600 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5602 else
5603 loop_for_initial_load = loop;
5604 if (at_loop)
5605 *at_loop = loop_for_initial_load;
5607 if (loop_for_initial_load)
5608 pe = loop_preheader_edge (loop_for_initial_load);
5610 /* 3. For the case of the optimized realignment, create the first vector
5611 load at the loop preheader. */
5613 if (alignment_support_scheme == dr_explicit_realign_optimized)
5615 /* Create msq_init = *(floor(p1)) in the loop preheader */
5616 gassign *new_stmt;
5618 gcc_assert (!compute_in_loop);
5619 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5620 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5621 loop_for_initial_load, NULL_TREE,
5622 &init_addr, NULL, &inc, true);
5623 if (TREE_CODE (ptr) == SSA_NAME)
5624 new_temp = copy_ssa_name (ptr);
5625 else
5626 new_temp = make_ssa_name (TREE_TYPE (ptr));
5627 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5628 tree type = TREE_TYPE (ptr);
5629 new_stmt = gimple_build_assign
5630 (new_temp, BIT_AND_EXPR, ptr,
5631 fold_build2 (MINUS_EXPR, type,
5632 build_int_cst (type, 0),
5633 build_int_cst (type, align)));
5634 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5635 gcc_assert (!new_bb);
5636 data_ref
5637 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5638 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5639 vect_copy_ref_info (data_ref, DR_REF (dr));
5640 new_stmt = gimple_build_assign (vec_dest, data_ref);
5641 new_temp = make_ssa_name (vec_dest, new_stmt);
5642 gimple_assign_set_lhs (new_stmt, new_temp);
5643 if (pe)
5645 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5646 gcc_assert (!new_bb);
5648 else
5649 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5651 msq_init = gimple_assign_lhs (new_stmt);
5654 /* 4. Create realignment token using a target builtin, if available.
5655 It is done either inside the containing loop, or before LOOP (as
5656 determined above). */
5658 if (targetm.vectorize.builtin_mask_for_load)
5660 gcall *new_stmt;
5661 tree builtin_decl;
5663 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5664 if (!init_addr)
5666 /* Generate the INIT_ADDR computation outside LOOP. */
5667 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5668 stmt_info, &stmts,
5669 NULL_TREE);
5670 if (loop)
5672 pe = loop_preheader_edge (loop);
5673 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5674 gcc_assert (!new_bb);
5676 else
5677 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5680 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5681 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5682 vec_dest =
5683 vect_create_destination_var (scalar_dest,
5684 gimple_call_return_type (new_stmt));
5685 new_temp = make_ssa_name (vec_dest, new_stmt);
5686 gimple_call_set_lhs (new_stmt, new_temp);
5688 if (compute_in_loop)
5689 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5690 else
5692 /* Generate the misalignment computation outside LOOP. */
5693 pe = loop_preheader_edge (loop);
5694 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5695 gcc_assert (!new_bb);
5698 *realignment_token = gimple_call_lhs (new_stmt);
5700 /* The result of the CALL_EXPR to this builtin is determined from
5701 the value of the parameter and no global variables are touched
5702 which makes the builtin a "const" function. Requiring the
5703 builtin to have the "const" attribute makes it unnecessary
5704 to call mark_call_clobbered. */
5705 gcc_assert (TREE_READONLY (builtin_decl));
5708 if (alignment_support_scheme == dr_explicit_realign)
5709 return msq;
5711 gcc_assert (!compute_in_loop);
5712 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5715 /* 5. Create msq = phi <msq_init, lsq> in loop */
5717 pe = loop_preheader_edge (containing_loop);
5718 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5719 msq = make_ssa_name (vec_dest);
5720 phi_stmt = create_phi_node (msq, containing_loop->header);
5721 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5723 return msq;
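
/* A standalone sketch (not used by the vectorizer) of what the
   dr_explicit_realign scheme above computes for one unaligned access: two
   "floor"-aligned loads MSQ and LSQ are combined according to the element
   misalignment, which here stands in for the realignment token that real
   targets encode as a permute mask.  SKETCH_VS == 4 and the names are
   inventions of this sketch; BASE is assumed to be vector aligned and FIRST
   is the element index of the (possibly unaligned) access.  */

#define SKETCH_VS 4

static void
realign_load_model (const int *base, unsigned int first, int out[SKETCH_VS])
{
  unsigned int mis = first % SKETCH_VS;     /* realignment token           */
  const int *msq = base + (first - mis);    /* msq = *(floor (p))          */
  const int *lsq = msq + SKETCH_VS;         /* equals *(floor (p + VS - 1))
                                               whenever mis > 0            */

  /* realign_load <msq, lsq, token>: the tail of MSQ followed by the head of
     LSQ; LSQ is only read when the access really is misaligned.  */
  for (unsigned int i = 0; i < SKETCH_VS; i++)
    out[i] = (mis + i < SKETCH_VS) ? msq[mis + i] : lsq[mis + i - SKETCH_VS];

  /* OUT now holds base[first .. first + SKETCH_VS - 1], assembled from
     aligned loads only.  */
}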
5727 /* Function vect_grouped_load_supported.
5729 COUNT is the size of the load group (the number of statements plus the
5730 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5731 only one statement, with a gap of COUNT - 1.
5733 Returns true if a suitable permute exists. */
5735 bool
5736 vect_grouped_load_supported (tree vectype, bool single_element_p,
5737 unsigned HOST_WIDE_INT count)
5739 machine_mode mode = TYPE_MODE (vectype);
5741 /* If this is single-element interleaving with an element distance
5742 that leaves unused vector loads around punt - we at least create
5743 very sub-optimal code in that case (and blow up memory,
5744 see PR65518). */
5745 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5747 if (dump_enabled_p ())
5748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5749 "single-element interleaving not supported "
5750 "for not adjacent vector loads\n");
5751 return false;
5754 /* vect_permute_load_chain requires the group size to be equal to 3 or
5755 be a power of two. */
5756 if (count != 3 && exact_log2 (count) == -1)
5758 if (dump_enabled_p ())
5759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5760 "the size of the group of accesses"
5761 " is not a power of 2 or not equal to 3\n");
5762 return false;
5765 /* Check that the permutation is supported. */
5766 if (VECTOR_MODE_P (mode))
5768 unsigned int i, j;
5769 if (count == 3)
5771 unsigned int nelt;
5772 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5774 if (dump_enabled_p ())
5775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5776 "cannot handle groups of 3 loads for"
5777 " variable-length vectors\n");
5778 return false;
5781 vec_perm_builder sel (nelt, nelt, 1);
5782 sel.quick_grow (nelt);
5783 vec_perm_indices indices;
5784 unsigned int k;
5785 for (k = 0; k < 3; k++)
5787 for (i = 0; i < nelt; i++)
5788 if (3 * i + k < 2 * nelt)
5789 sel[i] = 3 * i + k;
5790 else
5791 sel[i] = 0;
5792 indices.new_vector (sel, 2, nelt);
5793 if (!can_vec_perm_const_p (mode, indices))
5795 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5797 "shuffle of 3 loads is not supported by"
5798 " target\n");
5799 return false;
5801 for (i = 0, j = 0; i < nelt; i++)
5802 if (3 * i + k < 2 * nelt)
5803 sel[i] = i;
5804 else
5805 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5806 indices.new_vector (sel, 2, nelt);
5807 if (!can_vec_perm_const_p (mode, indices))
5809 if (dump_enabled_p ())
5810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5811 "shuffle of 3 loads is not supported by"
5812 " target\n");
5813 return false;
5816 return true;
5818 else
5820 /* If length is not equal to 3 then only a power of 2 is supported. */
5821 gcc_assert (pow2p_hwi (count));
5822 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5824 /* The encoding has a single stepped pattern. */
5825 vec_perm_builder sel (nelt, 1, 3);
5826 sel.quick_grow (3);
5827 for (i = 0; i < 3; i++)
5828 sel[i] = i * 2;
5829 vec_perm_indices indices (sel, 2, nelt);
5830 if (can_vec_perm_const_p (mode, indices))
5832 for (i = 0; i < 3; i++)
5833 sel[i] = i * 2 + 1;
5834 indices.new_vector (sel, 2, nelt);
5835 if (can_vec_perm_const_p (mode, indices))
5836 return true;
5841 if (dump_enabled_p ())
5842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5843 "extract even/odd not supported by target\n");
5844 return false;
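
/* A standalone sketch (not used by the vectorizer): the three-element series
   built above ({0, 2, 4} and {1, 3, 5}) encode single stepped patterns;
   expanded for a concrete width they give the full extract-even /
   extract-odd selectors the target must support.  NELT == 8 and the function
   name are choices made for this sketch.  */

static void
extract_even_odd_selectors (unsigned int even[8], unsigned int odd[8])
{
  for (unsigned int i = 0; i < 8; i++)
    {
      even[i] = 2 * i;          /* 0 2 4 6 8 10 12 14 */
      odd[i] = 2 * i + 1;       /* 1 3 5 7 9 11 13 15 */
    }

  /* Indices >= NELT select from the second operand of the two-input
     VEC_PERM_EXPR, so "even" picks the even lanes of the concatenation
     v1|v2 and "odd" the odd lanes.  */
}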
5847 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5848 type VECTYPE. MASKED_P says whether the masked form is needed. */
5850 bool
5851 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5852 bool masked_p)
5854 if (masked_p)
5855 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5856 vec_mask_load_lanes_optab,
5857 vectype, count);
5858 else
5859 return vect_lanes_optab_supported_p ("vec_load_lanes",
5860 vec_load_lanes_optab,
5861 vectype, count);
5864 /* Function vect_permute_load_chain.
5866 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5867 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5868 the input data correctly. Return the final references for loads in
5869 RESULT_CHAIN.
5871 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5872 The input is 4 vectors each containing 8 elements. We assign a number to each
5873 element, the input sequence is:
5875 1st vec: 0 1 2 3 4 5 6 7
5876 2nd vec: 8 9 10 11 12 13 14 15
5877 3rd vec: 16 17 18 19 20 21 22 23
5878 4th vec: 24 25 26 27 28 29 30 31
5880 The output sequence should be:
5882 1st vec: 0 4 8 12 16 20 24 28
5883 2nd vec: 1 5 9 13 17 21 25 29
5884 3rd vec: 2 6 10 14 18 22 26 30
5885 4th vec: 3 7 11 15 19 23 27 31
5887 i.e., the first output vector should contain the first elements of each
5888 interleaving group, etc.
5890 We use extract_even/odd instructions to create such output. The input of
5891 each extract_even/odd operation is two vectors
5892 1st vec 2nd vec
5893 0 1 2 3 4 5 6 7
5895 and the output is the vector of extracted even/odd elements. The output of
5896 extract_even will be: 0 2 4 6
5897 and of extract_odd: 1 3 5 7
5900 The permutation is done in log LENGTH stages. In each stage extract_even
5901 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5902 their order. In our example,
5904 E1: extract_even (1st vec, 2nd vec)
5905 E2: extract_odd (1st vec, 2nd vec)
5906 E3: extract_even (3rd vec, 4th vec)
5907 E4: extract_odd (3rd vec, 4th vec)
5909 The output for the first stage will be:
5911 E1: 0 2 4 6 8 10 12 14
5912 E2: 1 3 5 7 9 11 13 15
5913 E3: 16 18 20 22 24 26 28 30
5914 E4: 17 19 21 23 25 27 29 31
5916 In order to proceed and create the correct sequence for the next stage (or
5917 for the correct output, if the second stage is the last one, as in our
5918 example), we first put the output of extract_even operation and then the
5919 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5920 The input for the second stage is:
5922 1st vec (E1): 0 2 4 6 8 10 12 14
5923 2nd vec (E3): 16 18 20 22 24 26 28 30
5924 3rd vec (E2): 1 3 5 7 9 11 13 15
5925 4th vec (E4): 17 19 21 23 25 27 29 31
5927 The output of the second stage:
5929 E1: 0 4 8 12 16 20 24 28
5930 E2: 2 6 10 14 18 22 26 30
5931 E3: 1 5 9 13 17 21 25 29
5932 E4: 3 7 11 15 19 23 27 31
5934 And RESULT_CHAIN after reordering:
5936 1st vec (E1): 0 4 8 12 16 20 24 28
5937 2nd vec (E3): 1 5 9 13 17 21 25 29
5938 3rd vec (E2): 2 6 10 14 18 22 26 30
5939 4th vec (E4): 3 7 11 15 19 23 27 31. */
5941 static void
5942 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5943 unsigned int length,
5944 stmt_vec_info stmt_info,
5945 gimple_stmt_iterator *gsi,
5946 vec<tree> *result_chain)
5948 tree data_ref, first_vect, second_vect;
5949 tree perm_mask_even, perm_mask_odd;
5950 tree perm3_mask_low, perm3_mask_high;
5951 gimple *perm_stmt;
5952 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5953 unsigned int i, j, log_length = exact_log2 (length);
5955 result_chain->quick_grow (length);
5956 memcpy (result_chain->address (), dr_chain.address (),
5957 length * sizeof (tree));
5959 if (length == 3)
5961 /* vect_grouped_load_supported ensures that this is constant. */
5962 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5963 unsigned int k;
5965 vec_perm_builder sel (nelt, nelt, 1);
5966 sel.quick_grow (nelt);
5967 vec_perm_indices indices;
5968 for (k = 0; k < 3; k++)
5970 for (i = 0; i < nelt; i++)
5971 if (3 * i + k < 2 * nelt)
5972 sel[i] = 3 * i + k;
5973 else
5974 sel[i] = 0;
5975 indices.new_vector (sel, 2, nelt);
5976 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5978 for (i = 0, j = 0; i < nelt; i++)
5979 if (3 * i + k < 2 * nelt)
5980 sel[i] = i;
5981 else
5982 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5983 indices.new_vector (sel, 2, nelt);
5984 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5986 first_vect = dr_chain[0];
5987 second_vect = dr_chain[1];
5989 /* Create interleaving stmt (low part of):
5990 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5991 ...}> */
5992 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5993 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5994 second_vect, perm3_mask_low);
5995 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5997 /* Create interleaving stmt (high part of):
5998 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5999 ...}> */
6000 first_vect = data_ref;
6001 second_vect = dr_chain[2];
6002 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6003 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6004 second_vect, perm3_mask_high);
6005 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6006 (*result_chain)[k] = data_ref;
6009 else
6011 /* If length is not equal to 3 then only a power of 2 is supported. */
6012 gcc_assert (pow2p_hwi (length));
6014 /* The encoding has a single stepped pattern. */
6015 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6016 vec_perm_builder sel (nelt, 1, 3);
6017 sel.quick_grow (3);
6018 for (i = 0; i < 3; ++i)
6019 sel[i] = i * 2;
6020 vec_perm_indices indices (sel, 2, nelt);
6021 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6023 for (i = 0; i < 3; ++i)
6024 sel[i] = i * 2 + 1;
6025 indices.new_vector (sel, 2, nelt);
6026 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6028 for (i = 0; i < log_length; i++)
6030 for (j = 0; j < length; j += 2)
6032 first_vect = dr_chain[j];
6033 second_vect = dr_chain[j+1];
6035 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6036 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6037 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6038 first_vect, second_vect,
6039 perm_mask_even);
6040 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6041 (*result_chain)[j/2] = data_ref;
6043 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6044 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6045 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6046 first_vect, second_vect,
6047 perm_mask_odd);
6048 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6049 (*result_chain)[j/2+length/2] = data_ref;
6051 memcpy (dr_chain.address (), result_chain->address (),
6052 length * sizeof (tree));
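
/* A minimal scalar model (standalone sketch, not used by the vectorizer) of
   the extract_even / extract_odd stages documented above, for LENGTH == 4
   vectors of 8 elements numbered 0..31.  CHAIN is updated in place,
   mirroring how DR_CHAIN is refreshed from RESULT_CHAIN after each stage;
   the function name is invented here.  */

static void
extract_even_odd_stages_model (int chain[4][8])
{
  int result[4][8];

  for (int stage = 0; stage < 2; stage++)       /* log2 (4) stages.  */
    {
      for (int j = 0; j < 4; j += 2)
        {
          const int *v1 = chain[j], *v2 = chain[j + 1];
          for (int i = 0; i < 4; i++)           /* NELT / 2 lanes.  */
            {
              /* Even lanes of v1|v2 go to slot j/2 ...  */
              result[j / 2][i] = v1[2 * i];
              result[j / 2][4 + i] = v2[2 * i];
              /* ... and odd lanes to slot j/2 + LENGTH/2.  */
              result[j / 2 + 2][i] = v1[2 * i + 1];
              result[j / 2 + 2][4 + i] = v2[2 * i + 1];
            }
        }
      for (int v = 0; v < 4; v++)
        for (int i = 0; i < 8; i++)
          chain[v][i] = result[v][i];
    }

  /* Starting from 0..7 / 8..15 / 16..23 / 24..31 this ends with
     0 4 8 12 16 20 24 28,  1 5 9 13 17 21 25 29,
     2 6 10 14 18 22 26 30,  3 7 11 15 19 23 27 31,
     matching the RESULT_CHAIN order in the comment above.  */
}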
6057 /* Function vect_shift_permute_load_chain.
6059 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6060 sequence of stmts to reorder the input data accordingly.
6061 Return the final references for loads in RESULT_CHAIN.
6062 Return true if successful, false otherwise.
6064 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6065 The input is 3 vectors each containing 8 elements. We assign a
6066 number to each element, the input sequence is:
6068 1st vec: 0 1 2 3 4 5 6 7
6069 2nd vec: 8 9 10 11 12 13 14 15
6070 3rd vec: 16 17 18 19 20 21 22 23
6072 The output sequence should be:
6074 1st vec: 0 3 6 9 12 15 18 21
6075 2nd vec: 1 4 7 10 13 16 19 22
6076 3rd vec: 2 5 8 11 14 17 20 23
6078 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6080 First we shuffle all 3 vectors to get correct elements order:
6082 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6083 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6084 3rd vec: (16 19 22) (17 20 23) (18 21)
6086 Next we unite and shift vector 3 times:
6088 1st step:
6089 shift right by 6 the concatenation of:
6090 "1st vec" and "2nd vec"
6091 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6092 "2nd vec" and "3rd vec"
6093 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6094 "3rd vec" and "1st vec"
6095 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6096 | New vectors |
6098 So that now new vectors are:
6100 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6101 2nd vec: (10 13) (16 19 22) (17 20 23)
6102 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6104 2nd step:
6105 shift right by 5 the concatenation of:
6106 "1st vec" and "3rd vec"
6107 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6108 "2nd vec" and "1st vec"
6109 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6110 "3rd vec" and "2nd vec"
6111 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6112 | New vectors |
6114 So that now new vectors are:
6116 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6117 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6118 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6120 3rd step:
6121 shift right by 5 the concatenation of:
6122 "1st vec" and "1st vec"
6123 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6124 shift right by 3 the concatenation of:
6125 "2nd vec" and "2nd vec"
6126 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6127 | New vectors |
6129 So that now all vectors are READY:
6130 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6131 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6132 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6134 This algorithm is faster than the one in vect_permute_load_chain if:
6135 1. "shift of a concatenation" is faster than general permutation.
6136 This is usually so.
6137 2. The TARGET machine can't execute vector instructions in parallel.
6138 This is because each step of the algorithm depends on the previous one.
6139 The algorithm in vect_permute_load_chain is much more parallel.
6141 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6144 static bool
6145 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6146 unsigned int length,
6147 stmt_vec_info stmt_info,
6148 gimple_stmt_iterator *gsi,
6149 vec<tree> *result_chain)
6151 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6152 tree perm2_mask1, perm2_mask2, perm3_mask;
6153 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6154 gimple *perm_stmt;
6156 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6157 unsigned int i;
6158 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6160 unsigned HOST_WIDE_INT nelt, vf;
6161 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6162 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6163 /* Not supported for variable-length vectors. */
6164 return false;
6166 vec_perm_builder sel (nelt, nelt, 1);
6167 sel.quick_grow (nelt);
6169 result_chain->quick_grow (length);
6170 memcpy (result_chain->address (), dr_chain.address (),
6171 length * sizeof (tree));
6173 if (pow2p_hwi (length) && vf > 4)
6175 unsigned int j, log_length = exact_log2 (length);
6176 for (i = 0; i < nelt / 2; ++i)
6177 sel[i] = i * 2;
6178 for (i = 0; i < nelt / 2; ++i)
6179 sel[nelt / 2 + i] = i * 2 + 1;
6180 vec_perm_indices indices (sel, 2, nelt);
6181 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6183 if (dump_enabled_p ())
6184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6185 "shuffle of 2 fields structure is not \
6186 supported by target\n");
6187 return false;
6189 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6191 for (i = 0; i < nelt / 2; ++i)
6192 sel[i] = i * 2 + 1;
6193 for (i = 0; i < nelt / 2; ++i)
6194 sel[nelt / 2 + i] = i * 2;
6195 indices.new_vector (sel, 2, nelt);
6196 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6198 if (dump_enabled_p ())
6199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6200 "shuffle of 2 fields structure is not \
6201 supported by target\n");
6202 return false;
6204 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6206 /* Generating permutation constant to shift all elements.
6207 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6208 for (i = 0; i < nelt; i++)
6209 sel[i] = nelt / 2 + i;
6210 indices.new_vector (sel, 2, nelt);
6211 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "shift permutation is not supported by target\n");
6216 return false;
6218 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6220 /* Generating permutation constant to select a vector from the two.
6221 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6222 for (i = 0; i < nelt / 2; i++)
6223 sel[i] = i;
6224 for (i = nelt / 2; i < nelt; i++)
6225 sel[i] = nelt + i;
6226 indices.new_vector (sel, 2, nelt);
6227 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6229 if (dump_enabled_p ())
6230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6231 "select is not supported by target\n");
6232 return false;
6234 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6236 for (i = 0; i < log_length; i++)
6238 for (j = 0; j < length; j += 2)
6240 first_vect = dr_chain[j];
6241 second_vect = dr_chain[j + 1];
6243 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6244 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6245 first_vect, first_vect,
6246 perm2_mask1);
6247 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6248 vect[0] = data_ref;
6250 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6251 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6252 second_vect, second_vect,
6253 perm2_mask2);
6254 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6255 vect[1] = data_ref;
6257 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6258 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6259 vect[0], vect[1], shift1_mask);
6260 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6261 (*result_chain)[j/2 + length/2] = data_ref;
6263 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6264 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6265 vect[0], vect[1], select_mask);
6266 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6267 (*result_chain)[j/2] = data_ref;
6269 memcpy (dr_chain.address (), result_chain->address (),
6270 length * sizeof (tree));
6272 return true;
6274 if (length == 3 && vf > 2)
6276 unsigned int k = 0, l = 0;
6278 /* Generating permutation constant to get all elements in right order.
6279 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6280 for (i = 0; i < nelt; i++)
6282 if (3 * k + (l % 3) >= nelt)
6284 k = 0;
6285 l += (3 - (nelt % 3));
6287 sel[i] = 3 * k + (l % 3);
6288 k++;
6290 vec_perm_indices indices (sel, 2, nelt);
6291 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6293 if (dump_enabled_p ())
6294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6295 "shuffle of 3 fields structure is not \
6296 supported by target\n");
6297 return false;
6299 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6301 /* Generating permutation constant to shift all elements.
6302 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6303 for (i = 0; i < nelt; i++)
6304 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6305 indices.new_vector (sel, 2, nelt);
6306 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6308 if (dump_enabled_p ())
6309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6310 "shift permutation is not supported by target\n");
6311 return false;
6313 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6315 /* Generating permutation constant to shift all elements.
6316 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6317 for (i = 0; i < nelt; i++)
6318 sel[i] = 2 * (nelt / 3) + 1 + i;
6319 indices.new_vector (sel, 2, nelt);
6320 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6324 "shift permutation is not supported by target\n");
6325 return false;
6327 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6329 /* Generating permutation constant to shift all elements.
6330 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6331 for (i = 0; i < nelt; i++)
6332 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6333 indices.new_vector (sel, 2, nelt);
6334 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6336 if (dump_enabled_p ())
6337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6338 "shift permutation is not supported by target\n");
6339 return false;
6341 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6343 /* Generating permutation constant to shift all elements.
6344 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6345 for (i = 0; i < nelt; i++)
6346 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6347 indices.new_vector (sel, 2, nelt);
6348 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6350 if (dump_enabled_p ())
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6352 "shift permutation is not supported by target\n");
6353 return false;
6355 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6357 for (k = 0; k < 3; k++)
6359 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6360 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6361 dr_chain[k], dr_chain[k],
6362 perm3_mask);
6363 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6364 vect[k] = data_ref;
6367 for (k = 0; k < 3; k++)
6369 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6370 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6371 vect[k % 3], vect[(k + 1) % 3],
6372 shift1_mask);
6373 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6374 vect_shift[k] = data_ref;
6377 for (k = 0; k < 3; k++)
6379 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6380 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6381 vect_shift[(4 - k) % 3],
6382 vect_shift[(3 - k) % 3],
6383 shift2_mask);
6384 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6385 vect[k] = data_ref;
6388 (*result_chain)[3 - (nelt % 3)] = vect[2];
6390 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6391 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6392 vect[0], shift3_mask);
6393 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6394 (*result_chain)[nelt % 3] = data_ref;
6396 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6397 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6398 vect[1], shift4_mask);
6399 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6400 (*result_chain)[0] = data_ref;
6401 return true;
6403 return false;
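
/* A standalone sketch (not used by the vectorizer): evaluate the selector
   formulas of the length == 3 path above for a fixed NELT == 8, reproducing
   the constants quoted in the function comment ({0 3 6 1 4 7 2 5},
   {6 7 8 9 10 11 12 13}, and so on).  The fixed width and the function name
   are choices made for this sketch.  */

static void
shift_permute3_selectors (unsigned int perm3[8], unsigned int shift1[8],
                          unsigned int shift2[8], unsigned int shift3[8],
                          unsigned int shift4[8])
{
  const unsigned int nelt = 8;
  unsigned int k = 0, l = 0;

  /* Shuffle that groups the elements as ( 0 3 6) ( 1 4 7) ( 2 5).  */
  for (unsigned int i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      perm3[i] = 3 * k + (l % 3);       /* 0 3 6 1 4 7 2 5 */
      k++;
    }

  /* The four "shift of a concatenation" masks.  */
  for (unsigned int i = 0; i < nelt; i++)
    {
      shift1[i] = 2 * (nelt / 3) + (nelt % 3) + i;      /* 6 .. 13 */
      shift2[i] = 2 * (nelt / 3) + 1 + i;               /* 5 .. 12 */
      shift3[i] = (nelt / 3) + (nelt % 3) / 2 + i;      /* 3 .. 10 */
      shift4[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;  /* 5 .. 12 */
    }
}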
6406 /* Function vect_transform_grouped_load.
6408 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6409 to perform their permutation and ascribe the result vectorized statements to
6410 the scalar statements.
6413 void
6414 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6415 vec<tree> dr_chain,
6416 int size, gimple_stmt_iterator *gsi)
6418 machine_mode mode;
6419 vec<tree> result_chain = vNULL;
6421 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6422 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6423 permuted vectors, ready for vector computation. */
6424 result_chain.create (size);
6426 /* If the reassociation width for the vector type is 2 or greater, the target
6427 machine can execute 2 or more vector instructions in parallel. Otherwise
6428 try to get the chain for the load group using vect_shift_permute_load_chain.
6429 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6430 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6431 || pow2p_hwi (size)
6432 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6433 gsi, &result_chain))
6434 vect_permute_load_chain (vinfo, dr_chain,
6435 size, stmt_info, gsi, &result_chain);
6436 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6437 result_chain.release ();
6440 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6441 generated as part of the vectorization of STMT_INFO. Assign the statement
6442 for each vector to the associated scalar statement. */
6444 void
6445 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6446 vec<tree> result_chain)
6448 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6449 unsigned int i, gap_count;
6450 tree tmp_data_ref;
6452 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6453 Since we scan the chain starting from its first node, their order
6454 corresponds to the order of data-refs in RESULT_CHAIN. */
6455 stmt_vec_info next_stmt_info = first_stmt_info;
6456 gap_count = 1;
6457 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6459 if (!next_stmt_info)
6460 break;
6462 /* Skip the gaps. Loads created for the gaps will be removed by dead
6463 code elimination pass later. No need to check for the first stmt in
6464 the group, since it always exists.
6465 DR_GROUP_GAP is the number of steps in elements from the previous
6466 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6467 correspond to the gaps. */
6468 if (next_stmt_info != first_stmt_info
6469 && gap_count < DR_GROUP_GAP (next_stmt_info))
6471 gap_count++;
6472 continue;
6475 /* ??? The following needs cleanup after the removal of
6476 DR_GROUP_SAME_DR_STMT. */
6477 if (next_stmt_info)
6479 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6480 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6481 copies, and we put the new vector statement last. */
6482 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6484 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6485 gap_count = 1;
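
/* A standalone sketch (not used by the vectorizer) of the gap-skipping walk
   above: GAPS[S] plays the role of DR_GROUP_GAP for the S-th statement of
   the group (GAPS[0] is irrelevant, like the first statement's gap), NVECS
   is the length of RESULT_CHAIN, and VEC_OF[S] receives the index of the
   permuted vector assigned to statement S, or -1 if the chain ran out.  All
   names are inventions of this sketch.  */

static void
assign_group_load_vectors_model (unsigned int nstmts,
                                 const unsigned int *gaps,
                                 unsigned int nvecs, int *vec_of)
{
  unsigned int s = 0, gap_count = 1;

  for (unsigned int t = 0; t < nstmts; t++)
    vec_of[t] = -1;

  for (unsigned int i = 0; i < nvecs && s < nstmts; i++)
    {
      /* Vectors that only exist because of a gap are skipped here and later
         removed as dead code.  */
      if (s != 0 && gap_count < gaps[s])
        {
          gap_count++;
          continue;
        }
      vec_of[s++] = i;
      gap_count = 1;
    }
}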
6490 /* Function vect_can_force_dr_alignment_p.
6492 Returns whether the alignment of a DECL can be forced to be aligned
6493 on ALIGNMENT bit boundary. */
6495 bool
6496 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6498 if (!VAR_P (decl))
6499 return false;
6501 if (decl_in_symtab_p (decl)
6502 && !symtab_node::get (decl)->can_increase_alignment_p ())
6503 return false;
6505 if (TREE_STATIC (decl))
6506 return (known_le (alignment,
6507 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6508 else
6509 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6513 /* Return whether the data reference DR_INFO is supported with respect to its
6514 alignment.
6515 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6516 if it is aligned, i.e., check if it is possible to vectorize it with different
6517 alignment. */
6519 enum dr_alignment_support
6520 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6521 bool check_aligned_accesses)
6523 data_reference *dr = dr_info->dr;
6524 stmt_vec_info stmt_info = dr_info->stmt;
6525 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6526 machine_mode mode = TYPE_MODE (vectype);
6527 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6528 class loop *vect_loop = NULL;
6529 bool nested_in_vect_loop = false;
6531 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6532 return dr_aligned;
6534 /* For now assume all conditional loads/stores support unaligned
6535 access without any special code. */
6536 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6537 if (gimple_call_internal_p (stmt)
6538 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6539 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6540 return dr_unaligned_supported;
6542 if (loop_vinfo)
6544 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6545 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6548 /* Possibly unaligned access. */
6550 /* We can choose between using the implicit realignment scheme (generating
6551 a misaligned_move stmt) and the explicit realignment scheme (generating
6552 aligned loads with a REALIGN_LOAD). There are two variants to the
6553 explicit realignment scheme: optimized, and unoptimized.
6554 We can optimize the realignment only if the step between consecutive
6555 vector loads is equal to the vector size. Since the vector memory
6556 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6557 is guaranteed that the misalignment amount remains the same throughout the
6558 execution of the vectorized loop. Therefore, we can create the
6559 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6560 at the loop preheader.
6562 However, in the case of outer-loop vectorization, when vectorizing a
6563 memory access in the inner-loop nested within the LOOP that is now being
6564 vectorized, while it is guaranteed that the misalignment of the
6565 vectorized memory access will remain the same in different outer-loop
6566 iterations, it is *not* guaranteed that it will remain the same throughout
6567 the execution of the inner-loop. This is because the inner-loop advances
6568 with the original scalar step (and not in steps of VS). If the inner-loop
6569 step happens to be a multiple of VS, then the misalignment remains fixed
6570 and we can use the optimized realignment scheme. For example:
6572 for (i=0; i<N; i++)
6573 for (j=0; j<M; j++)
6574 s += a[i+j];
6576 When vectorizing the i-loop in the above example, the step between
6577 consecutive vector loads is 1, and so the misalignment does not remain
6578 fixed across the execution of the inner-loop, and the realignment cannot
6579 be optimized (as illustrated in the following pseudo vectorized loop):
6581 for (i=0; i<N; i+=4)
6582 for (j=0; j<M; j++){
6583 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6584 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6585 // (assuming that we start from an aligned address).
6588 We therefore have to use the unoptimized realignment scheme:
6590 for (i=0; i<N; i+=4)
6591 for (j=k; j<M; j+=4)
6592 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6593 // that the misalignment of the initial address is
6594 // 0).
6596 The loop can then be vectorized as follows:
6598 for (k=0; k<4; k++){
6599 rt = get_realignment_token (&vp[k]);
6600 for (i=0; i<N; i+=4){
6601 v1 = vp[i+k];
6602 for (j=k; j<M; j+=4){
6603 v2 = vp[i+j+VS-1];
6604 va = REALIGN_LOAD <v1,v2,rt>;
6605 vs += va;
6606 v1 = v2;
6609 } */
6611 if (DR_IS_READ (dr))
6613 bool is_packed = false;
6614 tree type = (TREE_TYPE (DR_REF (dr)));
6616 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6617 && (!targetm.vectorize.builtin_mask_for_load
6618 || targetm.vectorize.builtin_mask_for_load ()))
6620 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6622 /* If we are doing SLP then the accesses need not have the
6623 same alignment, instead it depends on the SLP group size. */
6624 if (loop_vinfo
6625 && STMT_SLP_TYPE (stmt_info)
6626 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6627 * (DR_GROUP_SIZE
6628 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6629 TYPE_VECTOR_SUBPARTS (vectype)))
6631 else if (!loop_vinfo
6632 || (nested_in_vect_loop
6633 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6634 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6635 return dr_explicit_realign;
6636 else
6637 return dr_explicit_realign_optimized;
6639 if (!known_alignment_for_access_p (dr_info))
6640 is_packed = not_size_aligned (DR_REF (dr));
6642 if (targetm.vectorize.support_vector_misalignment
6643 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6644 /* Can't software pipeline the loads, but can at least do them. */
6645 return dr_unaligned_supported;
6647 else
6649 bool is_packed = false;
6650 tree type = (TREE_TYPE (DR_REF (dr)));
6652 if (!known_alignment_for_access_p (dr_info))
6653 is_packed = not_size_aligned (DR_REF (dr));
6655 if (targetm.vectorize.support_vector_misalignment
6656 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6657 return dr_unaligned_supported;
6660 /* Unsupported. */
6661 return dr_unaligned_unsupported;
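
/* A standalone sketch (not used by the vectorizer) of the misalignment
   pattern discussed in the comment above: with a vector size of four
   elements and an inner loop advancing by the scalar step, the misalignment
   of &a[i + j] cycles through 0, 1, 2, 3 as J advances, so it is not fixed
   across inner-loop iterations and only dr_explicit_realign applies;
   stepping J by the vector size instead keeps it constant.  A[] is assumed
   to start vector-aligned; the names are inventions of this sketch.  */

static void
inner_loop_misalignment_model (unsigned int i, unsigned int m,
                               unsigned int *mis)
{
  const unsigned int vs = 4;    /* vector size in elements */

  for (unsigned int j = 0; j < m; j++)
    mis[j] = (i + j) % vs;      /* misalignment of &a[i + j], in elements */
}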