gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
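/* Illustrative example (a sketch, not taken from the sources): for a
   group of COUNT == 2 accesses with VECTYPE V4SI, a target such as
   AArch64 provides a two-vector array mode and a vec_load_lanes handler
   that de-interleaves on the fly, so that a scalar loop like

     for (i = 0; i < n; i++)
       {
         a[i] = s[2 * i];
         b[i] = s[2 * i + 1];
       }

   can fetch both lanes with a single lane-wise load.  If no such handler
   exists, the vectorizer falls back to contiguous loads followed by
   permutes.  */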
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
152 unsigned int i = 0;
153 if (gimple_call_internal_p (call))
155 internal_fn ifn = gimple_call_internal_fn (call);
156 if (internal_load_fn_p (ifn))
157 /* For loads the LHS type does the trick. */
158 i = ~0U;
159 else if (internal_store_fn_p (ifn))
161 /* For stores use the type of the stored value. */
162 i = internal_fn_stored_value_index (ifn);
163 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 i = ~0U;
166 else if (internal_fn_mask_index (ifn) == 0)
167 i = 1;
169 if (i < gimple_call_num_args (call))
171 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
174 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 if (rhs < lhs)
176 scalar_type = rhs_type;
181 return scalar_type;
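/* Worked example (illustrative): in

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] * 2;

   the promotion from short to int has a 4-byte result but a 2-byte rhs,
   so for that statement the function above returns the short type.  With
   128-bit vectors this yields a vectorization factor of 8 rather than
   the 4 suggested by the int result alone.  */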
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186 tested at run-time. Return TRUE if DDR was successfully inserted.
187 Return false if versioning is not supported. */
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
194 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195 return opt_result::failure_at (vect_location,
196 "will not create alias checks, as"
197 " --param vect-max-version-for-alias-checks"
198 " == 0\n");
200 opt_result res
201 = runtime_alias_check_p (ddr, loop,
202 optimize_loop_nest_for_speed_p (loop));
203 if (!res)
204 return res;
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207 return opt_result::success ();
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
215 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216 for (unsigned int i = 0; i < checks.length(); ++i)
217 if (checks[i] == value)
218 return;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location,
222 "need run-time check that %T is nonzero\n",
223 value);
224 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229 DR_INFO_B. At least one of the accesses is a write. */
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
234 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
237 /* Single statements are always kept in their original order. */
238 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240 return true;
242 /* STMT_A and STMT_B belong to overlapping groups. All loads are
243 emitted at the position of the first scalar load.
244 Stores in a group are emitted at the position of the last scalar store.
245 Compute that position and check whether the resulting order matches
246 the current one. */
247 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248 if (il_a)
250 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 s = DR_GROUP_NEXT_ELEMENT (s))
253 il_a = get_later_stmt (il_a, s);
254 else /* DR_IS_READ */
255 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 s = DR_GROUP_NEXT_ELEMENT (s))
257 if (get_later_stmt (il_a, s) == il_a)
258 il_a = s;
260 else
261 il_a = stmtinfo_a;
262 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263 if (il_b)
265 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 s = DR_GROUP_NEXT_ELEMENT (s))
268 il_b = get_later_stmt (il_b, s);
269 else /* DR_IS_READ */
270 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 s = DR_GROUP_NEXT_ELEMENT (s))
272 if (get_later_stmt (il_b, s) == il_b)
273 il_b = s;
275 else
276 il_b = stmtinfo_b;
277 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
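/* Illustrative sketch: for a pair of two-element groups

     ... = a[i];
     a[i] = ...;
     ... = a[i+1];
     a[i+1] = ...;

   the vectorized load is emitted at the position of the first scalar
   load and the vectorized store at the position of the last scalar
   store, so the load still precedes the store, matching the scalar
   order; the check above verifies that this property holds for the two
   data references being compared.  */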
281 /* A subroutine of vect_analyze_data_ref_dependence. Handle
282 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283 distances. These distances are conservatively correct but they don't
284 reflect a guaranteed dependence.
286 Return true if this function does all the work necessary to avoid
287 an alias or false if the caller should use the dependence distances
288 to limit the vectorization factor in the usual way. LOOP_DEPTH is
289 the depth of the loop described by LOOP_VINFO and the other arguments
290 are as for vect_analyze_data_ref_dependence. */
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 int loop_depth, unsigned int *max_vf)
297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
300 int dist = dist_v[loop_depth];
301 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
303 /* If the user asserted safelen >= DIST consecutive iterations
304 can be executed concurrently, assume independence.
306 ??? An alternative would be to add the alias check even
307 in this case, and vectorize the fallback loop with the
308 maximum VF set to safelen. However, if the user has
309 explicitly given a length, it's less likely that that
310 would be a win. */
311 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
313 if ((unsigned int) loop->safelen < *max_vf)
314 *max_vf = loop->safelen;
315 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 continue;
319 /* For dependence distances of 2 or more, we have the option
320 of limiting VF or checking for an alias at runtime.
321 Prefer to check at runtime if we can, to avoid limiting
322 the VF unnecessarily when the bases are in fact independent.
324 Note that the alias checks will be removed if the VF ends up
325 being small enough. */
326 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
333 return true;
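/* Illustrative example: with

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       p[i] = q[i] + 1;

   a conservative dependence distance of, say, 4 between the two refs is
   treated as independence because the user asserted that 8 consecutive
   iterations may run concurrently; *MAX_VF is then capped at 8.
   Otherwise a runtime alias check is preferred over limiting the VF.  */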
337 /* Function vect_analyze_data_ref_dependence.
339 FIXME: I needed to change the sense of the returned flag.
341 Return FALSE if there (might) exist a dependence between a memory-reference
342 DRA and a memory-reference DRB. When versioning for alias may check a
343 dependence at run-time, return TRUE. Adjust *MAX_VF according to
344 the data dependence. */
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 loop_vec_info loop_vinfo,
349 unsigned int *max_vf)
351 unsigned int i;
352 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353 struct data_reference *dra = DDR_A (ddr);
354 struct data_reference *drb = DDR_B (ddr);
355 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359 lambda_vector dist_v;
360 unsigned int loop_depth;
362 /* In loop analysis all data references should be vectorizable. */
363 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
364 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
365 gcc_unreachable ();
367 /* Independent data accesses. */
368 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
369 return opt_result::success ();
371 if (dra == drb
372 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
373 return opt_result::success ();
375 /* We do not have to consider dependences between accesses that belong
376 to the same group, unless the stride could be smaller than the
377 group size. */
378 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
379 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
380 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
381 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
382 return opt_result::success ();
384 /* Even if we have an anti-dependence then, as the vectorized loop covers at
385 least two scalar iterations, there is always also a true dependence.
386 As the vectorizer does not re-order loads and stores we can ignore
387 the anti-dependence if TBAA can disambiguate both DRs similar to the
388 case with known negative distance anti-dependences (positive
389 distance anti-dependences would violate TBAA constraints). */
390 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
391 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
392 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
393 get_alias_set (DR_REF (drb))))
394 return opt_result::success ();
396 /* Unknown data dependence. */
397 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
399 /* If user asserted safelen consecutive iterations can be
400 executed concurrently, assume independence. */
401 if (loop->safelen >= 2)
403 if ((unsigned int) loop->safelen < *max_vf)
404 *max_vf = loop->safelen;
405 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
406 return opt_result::success ();
409 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
410 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 return opt_result::failure_at
412 (stmtinfo_a->stmt,
413 "versioning for alias not supported for: "
414 "can't determine dependence between %T and %T\n",
415 DR_REF (dra), DR_REF (drb));
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
419 "versioning for alias required: "
420 "can't determine dependence between %T and %T\n",
421 DR_REF (dra), DR_REF (drb));
423 /* Add to list of ddrs that need to be tested at run-time. */
424 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
427 /* Known data dependence. */
428 if (DDR_NUM_DIST_VECTS (ddr) == 0)
430 /* If user asserted safelen consecutive iterations can be
431 executed concurrently, assume independence. */
432 if (loop->safelen >= 2)
434 if ((unsigned int) loop->safelen < *max_vf)
435 *max_vf = loop->safelen;
436 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
437 return opt_result::success ();
440 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
441 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
442 return opt_result::failure_at
443 (stmtinfo_a->stmt,
444 "versioning for alias not supported for: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
448 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
450 "versioning for alias required: "
451 "bad dist vector for %T and %T\n",
452 DR_REF (dra), DR_REF (drb));
453 /* Add to list of ddrs that need to be tested at run-time. */
454 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
457 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
459 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
460 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
461 loop_depth, max_vf))
462 return opt_result::success ();
464 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
466 int dist = dist_v[loop_depth];
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance = %d.\n", dist);
472 if (dist == 0)
474 if (dump_enabled_p ())
475 dump_printf_loc (MSG_NOTE, vect_location,
476 "dependence distance == 0 between %T and %T\n",
477 DR_REF (dra), DR_REF (drb));
479 /* When we perform grouped accesses and perform implicit CSE
480 by detecting equal accesses and doing disambiguation with
481 runtime alias tests like for
482 .. = a[i];
483 .. = a[i+1];
484 a[i] = ..;
485 a[i+1] = ..;
486 *p = ..;
487 .. = a[i];
488 .. = a[i+1];
489 where we will end up loading { a[i], a[i+1] } once, make
490 sure that inserting group loads before the first load and
491 stores after the last store will do the right thing.
492 Similar for groups like
493 a[i] = ...;
494 ... = a[i];
495 a[i+1] = ...;
496 where loads from the group interleave with the store. */
497 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
498 return opt_result::failure_at (stmtinfo_a->stmt,
499 "READ_WRITE dependence"
500 " in interleaving.\n");
502 if (loop->safelen < 2)
504 tree indicator = dr_zero_step_indicator (dra);
505 if (!indicator || integer_zerop (indicator))
506 return opt_result::failure_at (stmtinfo_a->stmt,
507 "access also has a zero step\n");
508 else if (TREE_CODE (indicator) != INTEGER_CST)
509 vect_check_nonzero_value (loop_vinfo, indicator);
511 continue;
514 if (dist > 0 && DDR_REVERSED_P (ddr))
516 /* If DDR_REVERSED_P the order of the data-refs in DDR was
517 reversed (to make distance vector positive), and the actual
518 distance is negative. */
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance negative.\n");
522 /* When doing outer loop vectorization, we need to check if there is
523 a backward dependence at the inner loop level if the dependence
524 at the outer loop is reversed. See PR81740. */
525 if (nested_in_vect_loop_p (loop, stmtinfo_a)
526 || nested_in_vect_loop_p (loop, stmtinfo_b))
528 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
529 DDR_LOOP_NEST (ddr));
530 if (dist_v[inner_depth] < 0)
531 return opt_result::failure_at (stmtinfo_a->stmt,
532 "not vectorized, dependence "
533 "between data-refs %T and %T\n",
534 DR_REF (dra), DR_REF (drb));
536 /* Record a negative dependence distance to later limit the
537 amount of stmt copying / unrolling we can perform.
538 Only need to handle read-after-write dependence. */
539 if (DR_IS_READ (drb)
540 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
541 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
542 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
543 continue;
546 unsigned int abs_dist = abs (dist);
547 if (abs_dist >= 2 && abs_dist < *max_vf)
549 /* The dependence distance requires reduction of the maximal
550 vectorization factor. */
551 *max_vf = abs_dist;
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "adjusting maximal vectorization factor to %i\n",
555 *max_vf);
558 if (abs_dist >= *max_vf)
560 /* Dependence distance does not create dependence, as far as
561 vectorization is concerned, in this case. */
562 if (dump_enabled_p ())
563 dump_printf_loc (MSG_NOTE, vect_location,
564 "dependence distance >= VF.\n");
565 continue;
568 return opt_result::failure_at (stmtinfo_a->stmt,
569 "not vectorized, possible dependence "
570 "between data-refs %T and %T\n",
571 DR_REF (dra), DR_REF (drb));
574 return opt_result::success ();
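/* Worked example (illustrative): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1.0f;

   the dependence distance is 3, so *MAX_VF is reduced to 3 and the loop
   can still be vectorized with a smaller VF, while a distance of 1 would
   reach the "possible dependence between data-refs" failure above.  */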
577 /* Function vect_analyze_data_ref_dependences.
579 Examine all the data references in the loop, and make sure there do not
580 exist any data dependences between them. Set *MAX_VF according to
581 the maximum vectorization factor the data dependences allow. */
583 opt_result
584 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
585 unsigned int *max_vf)
587 unsigned int i;
588 struct data_dependence_relation *ddr;
590 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
592 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
594 LOOP_VINFO_DDRS (loop_vinfo)
595 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
596 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
597 /* We do not need read-read dependences. */
598 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
599 &LOOP_VINFO_DDRS (loop_vinfo),
600 LOOP_VINFO_LOOP_NEST (loop_vinfo),
601 false);
602 gcc_assert (res);
605 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
607 /* For epilogues we either have no aliases or alias versioning
608 was applied to the original loop. Therefore we may just get max_vf
609 using the VF of the original loop. */
610 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
611 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
612 else
613 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
615 opt_result res
616 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
617 if (!res)
618 return res;
621 return opt_result::success ();
625 /* Function vect_slp_analyze_data_ref_dependence.
627 Return TRUE if there (might) exist a dependence between a memory-reference
628 DRA and a memory-reference DRB for VINFO. When versioning for alias
629 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
630 according to the data dependence. */
632 static bool
633 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
634 struct data_dependence_relation *ddr)
636 struct data_reference *dra = DDR_A (ddr);
637 struct data_reference *drb = DDR_B (ddr);
638 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
639 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
641 /* We need to check dependences of statements marked as unvectorizable
642 as well; they can still prohibit vectorization. */
644 /* Independent data accesses. */
645 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
646 return false;
648 if (dra == drb)
649 return false;
651 /* Read-read is OK. */
652 if (DR_IS_READ (dra) && DR_IS_READ (drb))
653 return false;
655 /* If dra and drb are part of the same interleaving chain consider
656 them independent. */
657 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
658 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
659 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
660 return false;
662 /* Unknown data dependence. */
663 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "can't determine dependence between %T and %T\n",
668 DR_REF (dra), DR_REF (drb));
670 else if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "determined dependence between %T and %T\n",
673 DR_REF (dra), DR_REF (drb));
675 return true;
679 /* Analyze dependences involved in the transform of SLP NODE. STORES
680 contain the vector of scalar stores of this instance if we are
681 disambiguating the loads. */
683 static bool
684 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
685 vec<stmt_vec_info> stores,
686 stmt_vec_info last_store_info)
688 /* This walks over all stmts involved in the SLP load/store done
689 in NODE verifying we can sink them up to the last stmt in the
690 group. */
691 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
693 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
694 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
696 stmt_vec_info access_info
697 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
698 if (access_info == last_access_info)
699 continue;
700 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
701 ao_ref ref;
702 bool ref_initialized_p = false;
703 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
704 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
706 gimple *stmt = gsi_stmt (gsi);
707 if (! gimple_vuse (stmt))
708 continue;
710 /* If we couldn't record a (single) data reference for this
711 stmt we have to resort to the alias oracle. */
712 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
713 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
714 if (!dr_b)
716 /* We are moving a store - this means
717 we cannot use TBAA for disambiguation. */
718 if (!ref_initialized_p)
719 ao_ref_init (&ref, DR_REF (dr_a));
720 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
721 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
722 return false;
723 continue;
726 bool dependent = false;
727 /* If we run into a store of this same instance (we've just
728 marked those) then delay dependence checking until we run
729 into the last store because this is where it will have
730 been sunk to (and we verify if we can do that as well). */
731 if (gimple_visited_p (stmt))
733 if (stmt_info != last_store_info)
734 continue;
736 for (stmt_vec_info &store_info : stores)
738 data_reference *store_dr
739 = STMT_VINFO_DATA_REF (store_info);
740 ddr_p ddr = initialize_data_dependence_relation
741 (dr_a, store_dr, vNULL);
742 dependent
743 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
744 free_dependence_relation (ddr);
745 if (dependent)
746 break;
749 else
751 ddr_p ddr = initialize_data_dependence_relation (dr_a,
752 dr_b, vNULL);
753 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
754 free_dependence_relation (ddr);
756 if (dependent)
757 return false;
761 else /* DR_IS_READ */
763 stmt_vec_info first_access_info
764 = vect_find_first_scalar_stmt_in_slp (node);
765 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
767 stmt_vec_info access_info
768 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
769 if (access_info == first_access_info)
770 continue;
771 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
772 ao_ref ref;
773 bool ref_initialized_p = false;
774 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
775 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
777 gimple *stmt = gsi_stmt (gsi);
778 if (! gimple_vdef (stmt))
779 continue;
781 /* If we couldn't record a (single) data reference for this
782 stmt we have to resort to the alias oracle. */
783 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
784 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
786 /* We are hoisting a load - this means we can use
787 TBAA for disambiguation. */
788 if (!ref_initialized_p)
789 ao_ref_init (&ref, DR_REF (dr_a));
790 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
792 if (!dr_b)
793 return false;
794 /* Resort to dependence checking below. */
796 else
797 /* No dependence. */
798 continue;
800 bool dependent = false;
801 /* If we run into a store of this same instance (we've just
802 marked those) then delay dependence checking until we run
803 into the last store because this is where it will have
804 been sunk to (and we verify if we can do that as well). */
805 if (gimple_visited_p (stmt))
807 if (stmt_info != last_store_info)
808 continue;
810 for (stmt_vec_info &store_info : stores)
812 data_reference *store_dr
813 = STMT_VINFO_DATA_REF (store_info);
814 ddr_p ddr = initialize_data_dependence_relation
815 (dr_a, store_dr, vNULL);
816 dependent
817 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
818 free_dependence_relation (ddr);
819 if (dependent)
820 break;
823 else
825 ddr_p ddr = initialize_data_dependence_relation (dr_a,
826 dr_b, vNULL);
827 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
828 free_dependence_relation (ddr);
830 if (dependent)
831 return false;
835 return true;
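/* Illustrative sketch: in a basic block such as

     a[0] = x0;
     t = *p;
     a[1] = x1;

   the SLP store group { a[0], a[1] } would be emitted at the position of
   the last scalar store, so the walk above must verify that the store to
   a[0] can be sunk past the intervening load of *p; if the dependence
   cannot be ruled out, the instance is rejected.  */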
839 /* Function vect_slp_analyze_instance_dependence.
841 Examine all the data references in the SLP instance, and make sure there
842 do not exist any data dependences between them that would prevent the
843 instance from being vectorized. */
845 bool
846 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
848 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
850 /* The stores of this instance are at the root of the SLP tree. */
851 slp_tree store = NULL;
852 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
853 store = SLP_INSTANCE_TREE (instance);
855 /* Verify we can sink stores to the vectorized stmt insert location. */
856 stmt_vec_info last_store_info = NULL;
857 if (store)
859 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
860 return false;
862 /* Mark stores in this instance and remember the last one. */
863 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
864 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
865 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
868 bool res = true;
870 /* Verify we can sink loads to the vectorized stmt insert location,
871 special-casing stores of this instance. */
872 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
873 if (! vect_slp_analyze_node_dependences (vinfo, load,
874 store
875 ? SLP_TREE_SCALAR_STMTS (store)
876 : vNULL, last_store_info))
878 res = false;
879 break;
882 /* Unset the visited flag. */
883 if (store)
884 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
885 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
887 return res;
890 /* Record the base alignment guarantee given by DRB, which occurs
891 in STMT_INFO. */
893 static void
894 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
895 innermost_loop_behavior *drb)
897 bool existed;
898 innermost_loop_behavior *&entry
899 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
900 if (!existed || entry->base_alignment < drb->base_alignment)
902 entry = drb;
903 if (dump_enabled_p ())
904 dump_printf_loc (MSG_NOTE, vect_location,
905 "recording new base alignment for %T\n"
906 " alignment: %d\n"
907 " misalignment: %d\n"
908 " based on: %G",
909 drb->base_address,
910 drb->base_alignment,
911 drb->base_misalignment,
912 stmt_info->stmt);
916 /* If the region we're going to vectorize is reached, all unconditional
917 data references occur at least once. We can therefore pool the base
918 alignment guarantees from each unconditional reference. Do this by
919 going through all the data references in VINFO and checking whether
920 the containing statement makes the reference unconditionally. If so,
921 record the alignment of the base address in VINFO so that it can be
922 used for all other references with the same base. */
924 void
925 vect_record_base_alignments (vec_info *vinfo)
927 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
928 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
929 for (data_reference *dr : vinfo->shared->datarefs)
931 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
932 stmt_vec_info stmt_info = dr_info->stmt;
933 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
934 && STMT_VINFO_VECTORIZABLE (stmt_info)
935 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
937 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
939 /* If DR is nested in the loop that is being vectorized, we can also
940 record the alignment of the base wrt the outer loop. */
941 if (loop && nested_in_vect_loop_p (loop, stmt_info))
942 vect_record_base_alignment
943 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
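/* Illustrative example: if a loop contains an unconditional store
   p[i] = ... whose base is known to be 32-byte aligned and a second,
   conditional access p[i + 8] with the same base, the guarantee recorded
   for the unconditional reference is pooled here and reused when
   computing the alignment of the conditional one.  */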
948 /* Return the target alignment for the vectorized form of DR_INFO. */
950 static poly_uint64
951 vect_calculate_target_alignment (dr_vec_info *dr_info)
953 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
954 return targetm.vectorize.preferred_vector_alignment (vectype);
957 /* Function vect_compute_data_ref_alignment
959 Compute the misalignment of the data reference DR_INFO.
961 Output:
962 1. DR_MISALIGNMENT (DR_INFO) is defined.
964 FOR NOW: No analysis is actually performed. Misalignment is calculated
965 only for trivial cases. TODO. */
967 static void
968 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
970 stmt_vec_info stmt_info = dr_info->stmt;
971 vec_base_alignments *base_alignments = &vinfo->base_alignments;
972 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
973 class loop *loop = NULL;
974 tree ref = DR_REF (dr_info->dr);
975 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
977 if (dump_enabled_p ())
978 dump_printf_loc (MSG_NOTE, vect_location,
979 "vect_compute_data_ref_alignment:\n");
981 if (loop_vinfo)
982 loop = LOOP_VINFO_LOOP (loop_vinfo);
984 /* Initialize misalignment to unknown. */
985 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
987 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
988 return;
990 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
991 bool step_preserves_misalignment_p;
993 poly_uint64 vector_alignment
994 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
995 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
997 /* If the main loop has peeled for alignment we have no way of knowing
998 whether the data accesses in the epilogues are aligned. We can't at
999 compile time answer the question whether we have entered the main loop or
1000 not. Fixes PR 92351. */
1001 if (loop_vinfo)
1003 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1004 if (orig_loop_vinfo
1005 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1006 return;
1009 unsigned HOST_WIDE_INT vect_align_c;
1010 if (!vector_alignment.is_constant (&vect_align_c))
1011 return;
1013 /* No step for BB vectorization. */
1014 if (!loop)
1016 gcc_assert (integer_zerop (drb->step));
1017 step_preserves_misalignment_p = true;
1020 /* In case the dataref is in an inner-loop of the loop that is being
1021 vectorized (LOOP), we use the base and misalignment information
1022 relative to the outer-loop (LOOP). This is ok only if the misalignment
1023 stays the same throughout the execution of the inner-loop, which is why
1024 we have to check that the stride of the dataref in the inner-loop evenly
1025 divides by the vector alignment. */
1026 else if (nested_in_vect_loop_p (loop, stmt_info))
1028 step_preserves_misalignment_p
1029 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1031 if (dump_enabled_p ())
1033 if (step_preserves_misalignment_p)
1034 dump_printf_loc (MSG_NOTE, vect_location,
1035 "inner step divides the vector alignment.\n");
1036 else
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "inner step doesn't divide the vector"
1039 " alignment.\n");
1043 /* Similarly we can only use base and misalignment information relative to
1044 an innermost loop if the misalignment stays the same throughout the
1045 execution of the loop. As above, this is the case if the stride of
1046 the dataref evenly divides by the alignment. */
1047 else
1049 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1050 step_preserves_misalignment_p
1051 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1053 if (!step_preserves_misalignment_p && dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "step doesn't divide the vector alignment.\n");
1058 unsigned int base_alignment = drb->base_alignment;
1059 unsigned int base_misalignment = drb->base_misalignment;
1061 /* Calculate the maximum of the pooled base address alignment and the
1062 alignment that we can compute for DR itself. */
1063 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1064 if (entry && base_alignment < (*entry)->base_alignment)
1066 base_alignment = (*entry)->base_alignment;
1067 base_misalignment = (*entry)->base_misalignment;
1070 if (drb->offset_alignment < vect_align_c
1071 || !step_preserves_misalignment_p
1072 /* We need to know whether the step wrt the vectorized loop is
1073 negative when computing the starting misalignment below. */
1074 || TREE_CODE (drb->step) != INTEGER_CST)
1076 if (dump_enabled_p ())
1077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1078 "Unknown alignment for access: %T\n", ref);
1079 return;
1082 if (base_alignment < vect_align_c)
1084 unsigned int max_alignment;
1085 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1086 if (max_alignment < vect_align_c
1087 || !vect_can_force_dr_alignment_p (base,
1088 vect_align_c * BITS_PER_UNIT))
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "can't force alignment of ref: %T\n", ref);
1093 return;
1096 /* Force the alignment of the decl.
1097 NOTE: This is the only change to the code we make during
1098 the analysis phase, before deciding to vectorize the loop. */
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_NOTE, vect_location,
1101 "force alignment of %T\n", ref);
1103 dr_info->base_decl = base;
1104 dr_info->base_misaligned = true;
1105 base_misalignment = 0;
1107 poly_int64 misalignment
1108 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1110 /* If this is a backward running DR then first access in the larger
1111 vectype actually is N-1 elements before the address in the DR.
1112 Adjust misalign accordingly. */
1113 if (tree_int_cst_sgn (drb->step) < 0)
1114 /* PLUS because STEP is negative. */
1115 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1116 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1118 unsigned int const_misalignment;
1119 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 "Non-constant misalignment for access: %T\n", ref);
1124 return;
1127 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "misalign = %d bytes of ref %T\n",
1132 DR_MISALIGNMENT (dr_info), ref);
1134 return;
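/* Worked example (illustrative): with a 16-byte target alignment, a ref
   whose base is 16-byte aligned with base misalignment 0 and DR_INIT 4
   (e.g. &a[1] for 4-byte elements) ends up with

     misalignment = 0 + 4 = 4,  i.e. DR_MISALIGNMENT (dr_info) == 4.

   A base known to be only 8-byte aligned would instead be forced to
   16 bytes via base_decl/base_misaligned above, when the decl permits
   it.  */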
1137 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1138 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1139 is made aligned via peeling. */
1141 static bool
1142 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1143 dr_vec_info *dr_peel_info)
1145 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1146 DR_TARGET_ALIGNMENT (dr_info)))
1148 poly_offset_int diff
1149 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1150 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1151 if (known_eq (diff, 0)
1152 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1153 return true;
1155 return false;
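/* Illustrative example: with a 16-byte target alignment for both refs,
   DR_INIT 20 for the peeled ref and DR_INIT 4 for the other, the
   difference is 16, a multiple of the target alignment, so aligning the
   peeled ref by peeling also aligns the related one.  */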
1158 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1159 aligned via peeling. */
1161 static bool
1162 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1163 dr_vec_info *dr_peel_info)
1165 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1166 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1167 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1168 DR_OFFSET (dr_peel_info->dr), 0)
1169 || !operand_equal_p (DR_STEP (dr_info->dr),
1170 DR_STEP (dr_peel_info->dr), 0))
1171 return false;
1173 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1176 /* Function vect_update_misalignment_for_peel.
1177 Sets DR_INFO's misalignment
1178 - to 0 if it has the same alignment as DR_PEEL_INFO,
1179 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1180 - to -1 (unknown) otherwise.
1182 DR_INFO - the data reference whose misalignment is to be adjusted.
1183 DR_PEEL_INFO - the data reference whose misalignment is being made
1184 zero in the vector loop by the peel.
1185 NPEEL - the number of iterations in the peel loop if the misalignment
1186 of DR_PEEL_INFO is known at compile time. */
1188 static void
1189 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1190 dr_vec_info *dr_peel_info, int npeel)
1192 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1193 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1195 SET_DR_MISALIGNMENT (dr_info, 0);
1196 return;
1199 unsigned HOST_WIDE_INT alignment;
1200 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1201 && known_alignment_for_access_p (dr_info)
1202 && known_alignment_for_access_p (dr_peel_info))
1204 int misal = DR_MISALIGNMENT (dr_info);
1205 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1206 misal &= alignment - 1;
1207 SET_DR_MISALIGNMENT (dr_info, misal);
1208 return;
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1213 "to unknown (-1).\n");
1214 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
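/* Worked arithmetic (illustrative): with a 16-byte target alignment,
   DR_MISALIGNMENT 12, a DR_STEP of 4 and NPEEL 1, the update above gives

     misal = (12 + 1 * 4) & (16 - 1) = 0

   i.e. peeling a single iteration happens to align this reference as
   well.  */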
1217 /* Return true if alignment is relevant for DR_INFO. */
1219 static bool
1220 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1222 stmt_vec_info stmt_info = dr_info->stmt;
1224 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1225 return false;
1227 /* For interleaving, only the alignment of the first access matters. */
1228 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1229 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1230 return false;
1232 /* Scatter-gather and invariant accesses continue to address individual
1233 scalars, so vector-level alignment is irrelevant. */
1234 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1235 || integer_zerop (DR_STEP (dr_info->dr)))
1236 return false;
1238 /* Strided accesses perform only component accesses, alignment is
1239 irrelevant for them. */
1240 if (STMT_VINFO_STRIDED_P (stmt_info)
1241 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 return false;
1244 return true;
1247 /* Given a memory reference EXP return whether its alignment is less
1248 than its size. */
1250 static bool
1251 not_size_aligned (tree exp)
1253 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1254 return true;
1256 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1257 > get_object_alignment (exp));
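/* Illustrative example: a 4-byte int member of a packed struct may only
   be guaranteed 1-byte alignment, so its size exceeds its alignment and
   the check above classifies the access as "packed".  */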
1260 /* Function vector_alignment_reachable_p
1262 Return true if vector alignment for DR_INFO is reachable by peeling
1263 a few loop iterations. Return false otherwise. */
1265 static bool
1266 vector_alignment_reachable_p (dr_vec_info *dr_info)
1268 stmt_vec_info stmt_info = dr_info->stmt;
1269 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1271 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1273 /* For interleaved access we peel only if the number of iterations in
1274 the prolog loop ({VF - misalignment}) is a multiple of the
1275 number of interleaved accesses. */
1276 int elem_size, mis_in_elements;
1278 /* FORNOW: handle only known alignment. */
1279 if (!known_alignment_for_access_p (dr_info))
1280 return false;
1282 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1283 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1284 elem_size = vector_element_size (vector_size, nelements);
1285 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1287 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1288 return false;
1291 /* If misalignment is known at the compile time then allow peeling
1292 only if natural alignment is reachable through peeling. */
1293 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1295 HOST_WIDE_INT elmsize =
1296 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1297 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_NOTE, vect_location,
1300 "data size = %wd. misalignment = %d.\n", elmsize,
1301 DR_MISALIGNMENT (dr_info));
1303 if (DR_MISALIGNMENT (dr_info) % elmsize)
1305 if (dump_enabled_p ())
1306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1307 "data size does not divide the misalignment.\n");
1308 return false;
1312 if (!known_alignment_for_access_p (dr_info))
1314 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1315 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1316 if (dump_enabled_p ())
1317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1318 "Unknown misalignment, %snaturally aligned\n",
1319 is_packed ? "not " : "");
1320 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1323 return true;
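/* Illustrative example: with 4-element vectors, a known misalignment of
   2 elements and an interleaving group of size 2, the prologue needs
   4 - 2 = 2 peeled iterations, a multiple of the group size, so peeling
   can reach alignment; with a group size of 3 it could not.  */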
1327 /* Calculate the cost of the memory access represented by DR_INFO. */
1329 static void
1330 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1331 unsigned int *inside_cost,
1332 unsigned int *outside_cost,
1333 stmt_vector_for_cost *body_cost_vec,
1334 stmt_vector_for_cost *prologue_cost_vec)
1336 stmt_vec_info stmt_info = dr_info->stmt;
1337 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1338 int ncopies;
1340 if (PURE_SLP_STMT (stmt_info))
1341 ncopies = 1;
1342 else
1343 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1345 if (DR_IS_READ (dr_info->dr))
1346 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1347 outside_cost, prologue_cost_vec, body_cost_vec, false);
1348 else
1349 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE, vect_location,
1353 "vect_get_data_access_cost: inside_cost = %d, "
1354 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1358 typedef struct _vect_peel_info
1360 dr_vec_info *dr_info;
1361 int npeel;
1362 unsigned int count;
1363 } *vect_peel_info;
1365 typedef struct _vect_peel_extended_info
1367 vec_info *vinfo;
1368 struct _vect_peel_info peel_info;
1369 unsigned int inside_cost;
1370 unsigned int outside_cost;
1371 } *vect_peel_extended_info;
1374 /* Peeling hashtable helpers. */
1376 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1378 static inline hashval_t hash (const _vect_peel_info *);
1379 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1382 inline hashval_t
1383 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1385 return (hashval_t) peel_info->npeel;
1388 inline bool
1389 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1391 return (a->npeel == b->npeel);
1395 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1397 static void
1398 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1399 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1400 int npeel)
1402 struct _vect_peel_info elem, *slot;
1403 _vect_peel_info **new_slot;
1404 bool supportable_dr_alignment
1405 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1407 elem.npeel = npeel;
1408 slot = peeling_htab->find (&elem);
1409 if (slot)
1410 slot->count++;
1411 else
1413 slot = XNEW (struct _vect_peel_info);
1414 slot->npeel = npeel;
1415 slot->dr_info = dr_info;
1416 slot->count = 1;
1417 new_slot = peeling_htab->find_slot (slot, INSERT);
1418 *new_slot = slot;
1421 if (!supportable_dr_alignment
1422 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1423 slot->count += VECT_MAX_COST;
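/* Illustrative example: if three data refs become aligned by peeling 2
   iterations and one by peeling 5, the table ends up with the entries
   { npeel = 2, count = 3 } and { npeel = 5, count = 1 }.  Under the
   unlimited cost model, a ref whose misaligned access the target cannot
   handle at all gets its entry's count boosted by VECT_MAX_COST so that
   peeling for it is effectively forced.  */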
1427 /* Traverse peeling hash table to find peeling option that aligns maximum
1428 number of data accesses. */
1431 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1432 _vect_peel_extended_info *max)
1434 vect_peel_info elem = *slot;
1436 if (elem->count > max->peel_info.count
1437 || (elem->count == max->peel_info.count
1438 && max->peel_info.npeel > elem->npeel))
1440 max->peel_info.npeel = elem->npeel;
1441 max->peel_info.count = elem->count;
1442 max->peel_info.dr_info = elem->dr_info;
1445 return 1;
1448 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1449 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1450 we assume DR0_INFO's misalignment will be zero after peeling. */
1452 static void
1453 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1454 dr_vec_info *dr0_info,
1455 unsigned int *inside_cost,
1456 unsigned int *outside_cost,
1457 stmt_vector_for_cost *body_cost_vec,
1458 stmt_vector_for_cost *prologue_cost_vec,
1459 unsigned int npeel,
1460 bool unknown_misalignment)
1462 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1464 for (data_reference *dr : datarefs)
1466 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1467 if (!vect_relevant_for_alignment_p (dr_info))
1468 continue;
1470 int save_misalignment;
1471 save_misalignment = DR_MISALIGNMENT (dr_info);
1472 if (npeel == 0)
1474 else if (unknown_misalignment && dr_info == dr0_info)
1475 SET_DR_MISALIGNMENT (dr_info, 0);
1476 else
1477 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1478 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1479 body_cost_vec, prologue_cost_vec);
1480 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1484 /* Traverse peeling hash table and calculate cost for each peeling option.
1485 Find the one with the lowest cost. */
1488 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1489 _vect_peel_extended_info *min)
1491 vect_peel_info elem = *slot;
1492 int dummy;
1493 unsigned int inside_cost = 0, outside_cost = 0;
1494 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1495 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1496 epilogue_cost_vec;
1498 prologue_cost_vec.create (2);
1499 body_cost_vec.create (2);
1500 epilogue_cost_vec.create (2);
1502 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1503 &outside_cost, &body_cost_vec,
1504 &prologue_cost_vec, elem->npeel, false);
1506 body_cost_vec.release ();
1508 outside_cost += vect_get_known_peeling_cost
1509 (loop_vinfo, elem->npeel, &dummy,
1510 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1511 &prologue_cost_vec, &epilogue_cost_vec);
1513 /* Prologue and epilogue costs are added to the target model later.
1514 These costs depend only on the scalar iteration cost, the
1515 number of peeling iterations finally chosen, and the number of
1516 misaligned statements. So discard the information found here. */
1517 prologue_cost_vec.release ();
1518 epilogue_cost_vec.release ();
1520 if (inside_cost < min->inside_cost
1521 || (inside_cost == min->inside_cost
1522 && outside_cost < min->outside_cost))
1524 min->inside_cost = inside_cost;
1525 min->outside_cost = outside_cost;
1526 min->peel_info.dr_info = elem->dr_info;
1527 min->peel_info.npeel = elem->npeel;
1528 min->peel_info.count = elem->count;
1531 return 1;
1535 /* Choose best peeling option by traversing peeling hash table and either
1536 choosing an option with the lowest cost (if cost model is enabled) or the
1537 option that aligns as many accesses as possible. */
1539 static struct _vect_peel_extended_info
1540 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1541 loop_vec_info loop_vinfo)
1543 struct _vect_peel_extended_info res;
1545 res.peel_info.dr_info = NULL;
1546 res.vinfo = loop_vinfo;
1548 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1550 res.inside_cost = INT_MAX;
1551 res.outside_cost = INT_MAX;
1552 peeling_htab->traverse <_vect_peel_extended_info *,
1553 vect_peeling_hash_get_lowest_cost> (&res);
1555 else
1557 res.peel_info.count = 0;
1558 peeling_htab->traverse <_vect_peel_extended_info *,
1559 vect_peeling_hash_get_most_frequent> (&res);
1560 res.inside_cost = 0;
1561 res.outside_cost = 0;
1564 return res;
1567 /* Return true if the new peeling NPEEL is supported. */
1569 static bool
1570 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1571 unsigned npeel)
1573 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1574 enum dr_alignment_support supportable_dr_alignment;
1576 /* Ensure that all data refs can be vectorized after the peel. */
1577 for (data_reference *dr : datarefs)
1579 int save_misalignment;
1581 if (dr == dr0_info->dr)
1582 continue;
1584 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1585 if (!vect_relevant_for_alignment_p (dr_info))
1586 continue;
1588 save_misalignment = DR_MISALIGNMENT (dr_info);
1589 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1590 supportable_dr_alignment
1591 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1592 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1594 if (!supportable_dr_alignment)
1595 return false;
1598 return true;
1601 /* Compare two data-references DRA and DRB to group them into chunks
1602 with related alignment. */
1604 static int
1605 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1607 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1608 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1609 int cmp;
1611 /* Stabilize sort. */
1612 if (dra == drb)
1613 return 0;
1615 /* Ordering of DRs according to base. */
1616 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1617 DR_BASE_ADDRESS (drb));
1618 if (cmp != 0)
1619 return cmp;
1621 /* And according to DR_OFFSET. */
1622 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1623 if (cmp != 0)
1624 return cmp;
1626 /* And after step. */
1627 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1628 if (cmp != 0)
1629 return cmp;
1631 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1632 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1633 if (cmp == 0)
1634 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1635 return cmp;
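/* Illustrative example: for the refs a[i], a[i+1] and b[i] the comparator
   above places the two accesses to A next to each other (same base,
   offset and step, differing only in DR_INIT), so that the caller can
   treat them as one alignment subgroup, while b[i] starts a new one.  */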
1638 /* Function vect_enhance_data_refs_alignment
1640 This pass will use loop versioning and loop peeling in order to enhance
1641 the alignment of data references in the loop.
1643 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1644 original loop is to be vectorized. Any other loops that are created by
1645 the transformations performed in this pass - are not supposed to be
1646 vectorized. This restriction will be relaxed.
1648 This pass will require a cost model to guide it whether to apply peeling
1649 or versioning or a combination of the two. For example, the scheme that
1650 intel uses when given a loop with several memory accesses, is as follows:
1651 choose one memory access ('p') whose alignment you want to force by doing
1652 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1653 other accesses are not necessarily aligned, or (2) use loop versioning to
1654 generate one loop in which all accesses are aligned, and another loop in
1655 which only 'p' is necessarily aligned.
1657 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1658 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1659 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1661 Devising a cost model is the most critical aspect of this work. It will
1662 guide us on which access to peel for, whether to use loop versioning, how
1663 many versions to create, etc. The cost model will probably consist of
1664 generic considerations as well as target specific considerations (on
1665 powerpc for example, misaligned stores are more painful than misaligned
1666 loads).
1668 Here are the general steps involved in alignment enhancements:
1670 -- original loop, before alignment analysis:
1671 for (i=0; i<N; i++){
1672 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1673 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1676 -- After vect_compute_data_refs_alignment:
1677 for (i=0; i<N; i++){
1678 x = q[i]; # DR_MISALIGNMENT(q) = 3
1679 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1682 -- Possibility 1: we do loop versioning:
1683 if (p is aligned) {
1684 for (i=0; i<N; i++){ # loop 1A
1685 x = q[i]; # DR_MISALIGNMENT(q) = 3
1686 p[i] = y; # DR_MISALIGNMENT(p) = 0
1689 else {
1690 for (i=0; i<N; i++){ # loop 1B
1691 x = q[i]; # DR_MISALIGNMENT(q) = 3
1692 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1696 -- Possibility 2: we do loop peeling:
1697 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1698 x = q[i];
1699 p[i] = y;
1701 for (i = 3; i < N; i++){ # loop 2A
1702 x = q[i]; # DR_MISALIGNMENT(q) = 0
1703 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1706 -- Possibility 3: combination of loop peeling and versioning:
1707 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1708 x = q[i];
1709 p[i] = y;
1711 if (p is aligned) {
1712 for (i = 3; i<N; i++){ # loop 3A
1713 x = q[i]; # DR_MISALIGNMENT(q) = 0
1714 p[i] = y; # DR_MISALIGNMENT(p) = 0
1717 else {
1718 for (i = 3; i<N; i++){ # loop 3B
1719 x = q[i]; # DR_MISALIGNMENT(q) = 0
1720 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1724 These loops are later passed to loop_transform to be vectorized. The
1725 vectorizer will use the alignment information to guide the transformation
1726 (whether to generate regular loads/stores, or with special handling for
1727 misalignment). */
1729 opt_result
1730 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1732 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1733 enum dr_alignment_support supportable_dr_alignment;
1734 dr_vec_info *first_store = NULL;
1735 dr_vec_info *dr0_info = NULL;
1736 struct data_reference *dr;
1737 unsigned int i;
1738 bool do_peeling = false;
1739 bool do_versioning = false;
1740 unsigned int npeel = 0;
1741 bool one_misalignment_known = false;
1742 bool one_misalignment_unknown = false;
1743 bool one_dr_unsupportable = false;
1744 dr_vec_info *unsupportable_dr_info = NULL;
1745 unsigned int mis, dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1746 hash_table<peel_info_hasher> peeling_htab (1);
1748 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1750 /* Reset data so we can safely be called multiple times. */
1751 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1752 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1754 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1755 return opt_result::success ();
1757 /* Sort the vector of datarefs so DRs that have the same or dependent
1758 alignment are next to each other. */
1759 auto_vec<data_reference_p> datarefs
1760 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1761 datarefs.qsort (dr_align_group_sort_cmp);
1763 /* For each dataref, compute the number of other DRs that become
1764 aligned when we peel to align that dataref. */
1765 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1766 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1767 unsigned i0;
1768 for (i0 = 0; i0 < datarefs.length (); ++i0)
1769 if (DR_BASE_ADDRESS (datarefs[i0]))
1770 break;
1771 for (i = i0 + 1; i <= datarefs.length (); ++i)
1773 if (i == datarefs.length ()
1774 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1775 DR_BASE_ADDRESS (datarefs[i]), 0)
1776 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1777 DR_OFFSET (datarefs[i]), 0)
1778 || !operand_equal_p (DR_STEP (datarefs[i0]),
1779 DR_STEP (datarefs[i]), 0))
1781 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1782 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1783 will get known misalignment if we align one of the refs
1784 with the largest DR_TARGET_ALIGNMENT. */
1785 for (unsigned j = i0; j < i; ++j)
1787 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1788 for (unsigned k = i0; k < i; ++k)
1790 if (k == j)
1791 continue;
1792 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1793 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1794 dr_infoj))
1795 n_same_align_refs[j]++;
1798 i0 = i;
1802 /* While cost model enhancements are expected in the future, the high level
1803 view of the code at this time is as follows:
1805 A) If there is a misaligned access then see if peeling to align
1806 this access can make all data references satisfy
1807 vect_supportable_dr_alignment. If so, update data structures
1808 as needed and return true.
1810 B) If peeling wasn't possible and there is a data reference with an
1811 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1812 then see if loop versioning checks can be used to make all data
1813 references satisfy vect_supportable_dr_alignment. If so, update
1814 data structures as needed and return true.
1816 C) If neither peeling nor versioning were successful then return false if
1817 any data reference does not satisfy vect_supportable_dr_alignment.
1819 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1821 Note, Possibility 3 above (which is peeling and versioning together) is not
1822 being done at this time. */
1824 /* (1) Peeling to force alignment. */
1826 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1827 Considerations:
1828 + How many accesses will become aligned due to the peeling
1829 - How many accesses will become unaligned due to the peeling,
1830 and the cost of misaligned accesses.
1831 - The cost of peeling (the extra runtime checks, the increase
1832 in code size). */
1834 FOR_EACH_VEC_ELT (datarefs, i, dr)
1836 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1837 if (!vect_relevant_for_alignment_p (dr_info))
1838 continue;
1840 stmt_vec_info stmt_info = dr_info->stmt;
1841 supportable_dr_alignment
1842 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1843 do_peeling = vector_alignment_reachable_p (dr_info);
1844 if (do_peeling)
1846 if (known_alignment_for_access_p (dr_info))
1848 unsigned int npeel_tmp = 0;
1849 bool negative = tree_int_cst_compare (DR_STEP (dr),
1850 size_zero_node) < 0;
1852 /* If known_alignment_for_access_p then we have set
1853 DR_MISALIGNMENT, which is only done if we know it at compile
1854 time, so it is safe to assume target alignment is constant.
1856 unsigned int target_align =
1857 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1858 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1859 mis = (negative
1860 ? DR_MISALIGNMENT (dr_info)
1861 : -DR_MISALIGNMENT (dr_info));
1862 if (DR_MISALIGNMENT (dr_info) != 0)
1863 npeel_tmp = (mis & (target_align - 1)) / dr_size;
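/* Illustrative example (hypothetical numbers, not taken from any particular
   target): with a positive step, DR_MISALIGNMENT = 4, target_align = 16 and
   dr_size = 4, mis = -4 and (mis & 15) = 12, so npeel_tmp = 3; peeling 3
   scalar iterations advances the access by 12 bytes and makes it
   16-byte aligned.  */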
1865 /* For multiple types, it is possible that the bigger type access
1866 will have more than one peeling option. E.g., a loop with two
1867 types: one of size (vector size / 4), and the other one of
1868 size (vector size / 8). The vectorization factor will be 8. If both
1869 accesses are misaligned by 3, the first one needs one scalar
1870 iteration to be aligned, and the second one needs 5. But the
1871 first one will also be aligned by peeling 5 scalar
1872 iterations, and in that case both accesses will be aligned.
1873 Hence, in addition to the immediate peeling amount, we also want
1874 to try adding full vector sizes, as long as we don't exceed the
1875 vectorization factor.
1876 We do this automatically for the cost model, since we calculate
1877 the cost for every peeling option.
1878 poly_uint64 nscalars = npeel_tmp;
1879 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1881 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1882 nscalars = (STMT_SLP_TYPE (stmt_info)
1883 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1886 /* Save info about DR in the hash table. Also include peeling
1887 amounts according to the explanation above. */
1888 while (known_le (npeel_tmp, nscalars))
1890 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1891 dr_info, npeel_tmp);
1892 npeel_tmp += MAX (1, target_align / dr_size);
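/* Continuing the hypothetical example above with an unlimited cost model,
   vf = 8, no SLP and target_align / dr_size = 4: the peeling options
   recorded in the hash table are npeel_tmp = 3 and npeel_tmp = 7
   (11 would exceed nscalars = 8).  */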
1895 one_misalignment_known = true;
1897 else
1899 /* If we don't know any misalignment values, we prefer
1900 peeling for the data-ref that has the maximum number of data-refs
1901 with the same alignment, unless the target prefers to align
1902 stores over loads. */
1903 unsigned same_align_drs = n_same_align_refs[i];
1904 if (!dr0_info
1905 || dr0_same_align_drs < same_align_drs)
1907 dr0_same_align_drs = same_align_drs;
1908 dr0_info = dr_info;
1910 /* For data-refs with the same number of related
1911 accesses prefer the one where the misalign
1912 computation will be invariant in the outermost loop. */
1913 else if (dr0_same_align_drs == same_align_drs)
1915 class loop *ivloop0, *ivloop;
1916 ivloop0 = outermost_invariant_loop_for_expr
1917 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1918 ivloop = outermost_invariant_loop_for_expr
1919 (loop, DR_BASE_ADDRESS (dr));
1920 if ((ivloop && !ivloop0)
1921 || (ivloop && ivloop0
1922 && flow_loop_nested_p (ivloop, ivloop0)))
1923 dr0_info = dr_info;
1926 one_misalignment_unknown = true;
1928 /* Check for data refs with unsupportable alignment that
1929 can be peeled. */
1930 if (!supportable_dr_alignment)
1932 one_dr_unsupportable = true;
1933 unsupportable_dr_info = dr_info;
1936 if (!first_store && DR_IS_WRITE (dr))
1938 first_store = dr_info;
1939 first_store_same_align_drs = same_align_drs;
1943 else
1945 if (!aligned_access_p (dr_info))
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949 "vector alignment may not be reachable\n");
1950 break;
1955 /* Check if we can possibly peel the loop. */
1956 if (!vect_can_advance_ivs_p (loop_vinfo)
1957 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1958 || loop->inner)
1959 do_peeling = false;
1961 struct _vect_peel_extended_info peel_for_known_alignment;
1962 struct _vect_peel_extended_info peel_for_unknown_alignment;
1963 struct _vect_peel_extended_info best_peel;
1965 peel_for_unknown_alignment.inside_cost = INT_MAX;
1966 peel_for_unknown_alignment.outside_cost = INT_MAX;
1967 peel_for_unknown_alignment.peel_info.count = 0;
1969 if (do_peeling
1970 && one_misalignment_unknown)
1972 /* Check whether the target prefers stores over loads, i.e., whether
1973 misaligned stores are more expensive than misaligned loads (taking
1974 DRs with the same alignment into account). */
1975 unsigned int load_inside_cost = 0;
1976 unsigned int load_outside_cost = 0;
1977 unsigned int store_inside_cost = 0;
1978 unsigned int store_outside_cost = 0;
1979 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1981 stmt_vector_for_cost dummy;
1982 dummy.create (2);
1983 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1984 &load_inside_cost,
1985 &load_outside_cost,
1986 &dummy, &dummy, estimated_npeels, true);
1987 dummy.release ();
1989 if (first_store)
1991 dummy.create (2);
1992 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1993 &store_inside_cost,
1994 &store_outside_cost,
1995 &dummy, &dummy,
1996 estimated_npeels, true);
1997 dummy.release ();
1999 else
2001 store_inside_cost = INT_MAX;
2002 store_outside_cost = INT_MAX;
2005 if (load_inside_cost > store_inside_cost
2006 || (load_inside_cost == store_inside_cost
2007 && load_outside_cost > store_outside_cost))
2009 dr0_info = first_store;
2010 dr0_same_align_drs = first_store_same_align_drs;
2011 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2012 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2014 else
2016 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2017 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2020 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2021 prologue_cost_vec.create (2);
2022 epilogue_cost_vec.create (2);
2024 int dummy2;
2025 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2026 (loop_vinfo, estimated_npeels, &dummy2,
2027 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2028 &prologue_cost_vec, &epilogue_cost_vec);
2030 prologue_cost_vec.release ();
2031 epilogue_cost_vec.release ();
2033 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2036 peel_for_unknown_alignment.peel_info.npeel = 0;
2037 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2039 best_peel = peel_for_unknown_alignment;
2041 peel_for_known_alignment.inside_cost = INT_MAX;
2042 peel_for_known_alignment.outside_cost = INT_MAX;
2043 peel_for_known_alignment.peel_info.count = 0;
2044 peel_for_known_alignment.peel_info.dr_info = NULL;
2046 if (do_peeling && one_misalignment_known)
2048 /* Peeling is possible, but no data access requires alignment in order
2049 to be supported. So we try to choose the best possible peeling from
2050 the hash table. */
2051 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2052 (&peeling_htab, loop_vinfo);
2055 /* Compare costs of peeling for known and unknown alignment. */
2056 if (peel_for_known_alignment.peel_info.dr_info != NULL
2057 && peel_for_unknown_alignment.inside_cost
2058 >= peel_for_known_alignment.inside_cost)
2060 best_peel = peel_for_known_alignment;
2062 /* If the best peeling for known alignment has NPEEL == 0, perform no
2063 peeling at all except if there is an unsupportable dr that we can
2064 align. */
2065 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2066 do_peeling = false;
2069 /* If there is an unsupportable data ref, prefer this over all choices so far
2070 since we'd have to discard a chosen peeling except when it accidentally
2071 aligned the unsupportable data ref. */
2072 if (one_dr_unsupportable)
2073 dr0_info = unsupportable_dr_info;
2074 else if (do_peeling)
2076 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2077 TODO: Use nopeel_outside_cost or get rid of it? */
2078 unsigned nopeel_inside_cost = 0;
2079 unsigned nopeel_outside_cost = 0;
2081 stmt_vector_for_cost dummy;
2082 dummy.create (2);
2083 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2084 &nopeel_outside_cost, &dummy, &dummy,
2085 0, false);
2086 dummy.release ();
2088 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2089 costs will be recorded. */
2090 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2091 prologue_cost_vec.create (2);
2092 epilogue_cost_vec.create (2);
2094 int dummy2;
2095 nopeel_outside_cost += vect_get_known_peeling_cost
2096 (loop_vinfo, 0, &dummy2,
2097 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2098 &prologue_cost_vec, &epilogue_cost_vec);
2100 prologue_cost_vec.release ();
2101 epilogue_cost_vec.release ();
2103 npeel = best_peel.peel_info.npeel;
2104 dr0_info = best_peel.peel_info.dr_info;
2106 /* If not peeling at all is no more expensive than the best peeling
2107 we have so far, don't perform any peeling. */
2108 if (nopeel_inside_cost <= best_peel.inside_cost)
2109 do_peeling = false;
2112 if (do_peeling)
2114 stmt_vec_info stmt_info = dr0_info->stmt;
2115 if (known_alignment_for_access_p (dr0_info))
2117 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2118 size_zero_node) < 0;
2119 if (!npeel)
2121 /* Since it's known at compile time, compute the number of
2122 iterations in the peeled loop (the peeling factor) for use in
2123 updating DR_MISALIGNMENT values. The peeling factor is the
2124 target alignment minus the misalignment, expressed as an element
2125 count. */
2126 mis = (negative
2127 ? DR_MISALIGNMENT (dr0_info)
2128 : -DR_MISALIGNMENT (dr0_info));
2129 /* If known_alignment_for_access_p then we have set
2130 DR_MISALIGNMENT, which is only done if we know it at compile
2131 time, so it is safe to assume target alignment is constant.
2133 unsigned int target_align =
2134 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2135 npeel = ((mis & (target_align - 1))
2136 / vect_get_scalar_dr_size (dr0_info));
2139 /* For interleaved data access every iteration accesses all the
2140 members of the group, therefore we divide the number of iterations
2141 by the group size. */
2142 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2143 npeel /= DR_GROUP_SIZE (stmt_info);
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_NOTE, vect_location,
2147 "Try peeling by %d\n", npeel);
2150 /* Ensure that all datarefs can be vectorized after the peel. */
2151 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2152 do_peeling = false;
2154 /* Check if all datarefs are supportable and log. */
2155 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2156 return opt_result::success ();
2158 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2159 if (do_peeling)
2161 unsigned max_allowed_peel
2162 = param_vect_max_peeling_for_alignment;
2163 if (flag_vect_cost_model <= VECT_COST_MODEL_CHEAP)
2164 max_allowed_peel = 0;
2165 if (max_allowed_peel != (unsigned)-1)
2167 unsigned max_peel = npeel;
2168 if (max_peel == 0)
2170 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2171 unsigned HOST_WIDE_INT target_align_c;
2172 if (target_align.is_constant (&target_align_c))
2173 max_peel =
2174 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2175 else
2177 do_peeling = false;
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_NOTE, vect_location,
2180 "Disable peeling, max peels set and vector"
2181 " alignment unknown\n");
2184 if (max_peel > max_allowed_peel)
2186 do_peeling = false;
2187 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_NOTE, vect_location,
2189 "Disable peeling, max peels reached: %d\n", max_peel);
2194 /* Cost model #2 - if peeling may result in a remaining loop not
2195 iterating enough to be vectorized then do not peel. Since this
2196 is a cost heuristic rather than a correctness decision, use the
2197 most likely runtime value for variable vectorization factors. */
2198 if (do_peeling
2199 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2201 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2202 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2203 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2204 < assumed_vf + max_peel)
2205 do_peeling = false;
2208 if (do_peeling)
2210 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2211 If the misalignment of DR_i is identical to that of dr0 then set
2212 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2213 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2214 by the peeling factor times the element size of DR_i (MOD the
2215 vectorization factor times the size). Otherwise, the
2216 misalignment of DR_i must be set to unknown. */
2217 FOR_EACH_VEC_ELT (datarefs, i, dr)
2218 if (dr != dr0_info->dr)
2220 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2221 if (!vect_relevant_for_alignment_p (dr_info))
2222 continue;
2224 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2227 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2228 if (npeel)
2229 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2230 else
2231 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2232 = DR_MISALIGNMENT (dr0_info);
2233 SET_DR_MISALIGNMENT (dr0_info, 0);
2234 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE, vect_location,
2237 "Alignment of access forced using peeling.\n");
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "Peeling for alignment will be applied.\n");
2242 /* The inside-loop cost will be accounted for in vectorizable_load
2243 and vectorizable_store correctly with adjusted alignments.
2244 Drop the body_cst_vec on the floor here. */
2245 return opt_result::success ();
2249 /* (2) Versioning to force alignment. */
2251 /* Try versioning if:
2252 1) optimize loop for speed and the cost-model is not cheap
2253 2) there is at least one unsupported misaligned data ref with an unknown
2254 misalignment, and
2255 3) all misaligned data refs with a known misalignment are supported, and
2256 4) the number of runtime alignment checks is within reason. */
2258 do_versioning
2259 = (optimize_loop_nest_for_speed_p (loop)
2260 && !loop->inner /* FORNOW */
2261 && flag_vect_cost_model > VECT_COST_MODEL_CHEAP);
2263 if (do_versioning)
2265 FOR_EACH_VEC_ELT (datarefs, i, dr)
2267 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2268 if (aligned_access_p (dr_info)
2269 || !vect_relevant_for_alignment_p (dr_info))
2270 continue;
2272 stmt_vec_info stmt_info = dr_info->stmt;
2273 if (STMT_VINFO_STRIDED_P (stmt_info))
2275 do_versioning = false;
2276 break;
2279 supportable_dr_alignment
2280 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2282 if (!supportable_dr_alignment)
2284 int mask;
2285 tree vectype;
2287 if (known_alignment_for_access_p (dr_info)
2288 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2289 >= (unsigned) param_vect_max_version_for_alignment_checks)
2291 do_versioning = false;
2292 break;
2295 vectype = STMT_VINFO_VECTYPE (stmt_info);
2296 gcc_assert (vectype);
2298 /* At present we don't support versioning for alignment
2299 with variable VF, since there's no guarantee that the
2300 VF is a power of two. We could relax this if we added
2301 a way of enforcing a power-of-two size. */
2302 unsigned HOST_WIDE_INT size;
2303 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2305 do_versioning = false;
2306 break;
2309 /* Forcing alignment in the first iteration is no good if
2310 we don't keep it across iterations. For now, just disable
2311 versioning in this case.
2312 ?? We could actually unroll the loop to achieve the required
2313 overall step alignment, and forcing the alignment could be
2314 done by doing some iterations of the non-vectorized loop. */
2315 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2316 * DR_STEP_ALIGNMENT (dr),
2317 DR_TARGET_ALIGNMENT (dr_info)))
2319 do_versioning = false;
2320 break;
2323 /* The rightmost bits of an aligned address must be zeros.
2324 Construct the mask needed for this test. For example,
2325 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2326 mask must be 15 = 0xf. */
2327 mask = size - 1;
2329 /* FORNOW: use the same mask to test all potentially unaligned
2330 references in the loop. */
2331 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2332 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2334 do_versioning = false;
2335 break;
2338 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2339 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2343 /* Versioning requires at least one misaligned data reference. */
2344 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2345 do_versioning = false;
2346 else if (!do_versioning)
2347 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2350 if (do_versioning)
2352 const vec<stmt_vec_info> &may_misalign_stmts
2353 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2354 stmt_vec_info stmt_info;
2356 /* It can now be assumed that the data references in the statements
2357 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2358 of the loop being vectorized. */
2359 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2361 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2362 SET_DR_MISALIGNMENT (dr_info, 0);
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE, vect_location,
2365 "Alignment of access forced using versioning.\n");
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE, vect_location,
2370 "Versioning for alignment will be applied.\n");
2372 /* Peeling and versioning can't be done together at this time. */
2373 gcc_assert (! (do_peeling && do_versioning));
2375 return opt_result::success ();
2378 /* This point is reached if neither peeling nor versioning is being done. */
2379 gcc_assert (! (do_peeling || do_versioning));
2381 return opt_result::success ();
2385 /* Function vect_analyze_data_refs_alignment
2387 Analyze the alignment of the data-references in the loop.
2388 Return FALSE if a data reference is found that cannot be vectorized. */
2390 opt_result
2391 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2393 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2395 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2396 struct data_reference *dr;
2397 unsigned int i;
2399 vect_record_base_alignments (loop_vinfo);
2400 FOR_EACH_VEC_ELT (datarefs, i, dr)
2402 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2403 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2404 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2407 return opt_result::success ();
2411 /* Analyze alignment of DRs of stmts in NODE. */
2413 static bool
2414 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2416 /* We vectorize from the first scalar stmt in the node unless
2417 the node is permuted in which case we start from the first
2418 element in the group. */
2419 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2420 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2421 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2422 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2424 /* We need to commit to a vector type for the group now. */
2425 if (is_a <bb_vec_info> (vinfo)
2426 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "desired vector type conflicts with earlier one "
2431 "for %G", first_stmt_info->stmt);
2432 return false;
2435 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2436 vect_compute_data_ref_alignment (vinfo, dr_info);
2437 /* In several places we need alignment of the first element anyway. */
2438 if (dr_info != first_dr_info)
2439 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2441 /* For creating the data-ref pointer we need alignment of the
2442 first element as well. */
2443 first_stmt_info
2444 = vect_stmt_to_vectorize (vect_find_first_scalar_stmt_in_slp (node));
2445 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2447 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2448 if (dr_info != first_dr_info)
2449 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2452 return true;
2455 /* Function vect_slp_analyze_instance_alignment
2457 Analyze the alignment of the data-references in the SLP instance.
2458 Return FALSE if a data reference is found that cannot be vectorized. */
2460 bool
2461 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2462 slp_instance instance)
2464 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2466 slp_tree node;
2467 unsigned i;
2468 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2469 if (! vect_slp_analyze_node_alignment (vinfo, node))
2470 return false;
2472 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2473 && ! vect_slp_analyze_node_alignment
2474 (vinfo, SLP_INSTANCE_TREE (instance)))
2475 return false;
2477 return true;
2481 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2482 accesses of legal size, step, etc. Detect gaps, single element
2483 interleaving, and other special cases. Set grouped access info.
2484 Collect groups of strided stores for further use in SLP analysis.
2485 Worker for vect_analyze_group_access. */
2487 static bool
2488 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2490 data_reference *dr = dr_info->dr;
2491 tree step = DR_STEP (dr);
2492 tree scalar_type = TREE_TYPE (DR_REF (dr));
2493 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2494 stmt_vec_info stmt_info = dr_info->stmt;
2495 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2496 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2497 HOST_WIDE_INT dr_step = -1;
2498 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2499 bool slp_impossible = false;
2501 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2502 size of the interleaving group (including gaps). */
2503 if (tree_fits_shwi_p (step))
2505 dr_step = tree_to_shwi (step);
2506 /* Check that STEP is a multiple of type size. Otherwise there is
2507 a non-element-sized gap at the end of the group which we
2508 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2509 ??? As we can handle non-constant step fine here we should
2510 simply remove uses of DR_GROUP_GAP between the last and first
2511 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2512 simply not include that gap. */
2513 if ((dr_step % type_size) != 0)
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location,
2517 "Step %T is not a multiple of the element size"
2518 " for %T\n",
2519 step, DR_REF (dr));
2520 return false;
2522 groupsize = absu_hwi (dr_step) / type_size;
2524 else
2525 groupsize = 0;
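/* For illustration (made-up numbers): with scalar_type int (type_size = 4)
   and a constant DR_STEP of 32 bytes, groupsize = 8, i.e. each scalar
   iteration steps over a group of 8 elements, possibly with gaps.  If such
   a DR turns out not to be part of a detected group and is a read, it is
   treated below as single-element interleaving with DR_GROUP_SIZE = 8 and
   DR_GROUP_GAP = 7.  */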
2527 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2528 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2530 /* Check if this DR is part of an interleaving group, and is a single
2531 element of the group that is accessed in the loop. */
2533 /* Gaps are supported only for loads. STEP must be a multiple of the type
2534 size. */
2535 if (DR_IS_READ (dr)
2536 && (dr_step % type_size) == 0
2537 && groupsize > 0
2538 /* This could be UINT_MAX but as we are generating code in a very
2539 inefficient way we have to cap earlier.
2540 See PR91403 for example. */
2541 && groupsize <= 4096)
2543 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2544 DR_GROUP_SIZE (stmt_info) = groupsize;
2545 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_NOTE, vect_location,
2548 "Detected single element interleaving %T"
2549 " step %T\n",
2550 DR_REF (dr), step);
2552 return true;
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2557 "not consecutive access %G", stmt_info->stmt);
2559 if (bb_vinfo)
2561 /* Mark the statement as unvectorizable. */
2562 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2563 return true;
2566 if (dump_enabled_p ())
2567 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2568 STMT_VINFO_STRIDED_P (stmt_info) = true;
2569 return true;
2572 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2574 /* First stmt in the interleaving chain. Check the chain. */
2575 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2576 struct data_reference *data_ref = dr;
2577 unsigned int count = 1;
2578 tree prev_init = DR_INIT (data_ref);
2579 HOST_WIDE_INT diff, gaps = 0;
2581 /* By construction, all group members have INTEGER_CST DR_INITs. */
2582 while (next)
2584 /* We never have the same DR multiple times. */
2585 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2586 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2588 data_ref = STMT_VINFO_DATA_REF (next);
2590 /* All group members have the same STEP by construction. */
2591 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2593 /* Check that the distance between two accesses is equal to the type
2594 size. Otherwise, we have gaps. */
2595 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2596 - TREE_INT_CST_LOW (prev_init)) / type_size;
2597 if (diff != 1)
2599 /* FORNOW: SLP of accesses with gaps is not supported. */
2600 slp_impossible = true;
2601 if (DR_IS_WRITE (data_ref))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2605 "interleaved store with gaps\n");
2606 return false;
2609 gaps += diff - 1;
2612 last_accessed_element += diff;
2614 /* Store the gap from the previous member of the group. If there is no
2615 gap in the access, DR_GROUP_GAP is always 1. */
2616 DR_GROUP_GAP (next) = diff;
2618 prev_init = DR_INIT (data_ref);
2619 next = DR_GROUP_NEXT_ELEMENT (next);
2620 /* Count the number of data-refs in the chain. */
2621 count++;
2624 if (groupsize == 0)
2625 groupsize = count + gaps;
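/* A hypothetical example: a load group whose members have DR_INIT 0, 4
   and 12, with type_size = 4 and a non-constant DR_STEP, gives diffs of
   1 and 2, so count = 3, gaps = 1 and groupsize becomes 3 + 1 = 4 here;
   DR_GROUP_GAP of the third member is 2 (one missing element) and, since
   last_accessed_element ends up as 4, the trailing DR_GROUP_GAP computed
   below is 0.  */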
2627 /* This could be UINT_MAX but as we are generating code in a very
2628 inefficient way we have to cap earlier. See PR78699 for example. */
2629 if (groupsize > 4096)
2631 if (dump_enabled_p ())
2632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2633 "group is too large\n");
2634 return false;
2637 /* Check that the size of the interleaving is equal to count for stores,
2638 i.e., that there are no gaps. */
2639 if (groupsize != count
2640 && !DR_IS_READ (dr))
2642 groupsize = count;
2643 STMT_VINFO_STRIDED_P (stmt_info) = true;
2646 /* If there is a gap after the last load in the group it is the
2647 difference between the groupsize and the last accessed
2648 element.
2649 When there is no gap, this difference should be 0. */
2650 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2652 DR_GROUP_SIZE (stmt_info) = groupsize;
2653 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE, vect_location,
2656 "Detected interleaving ");
2657 if (DR_IS_READ (dr))
2658 dump_printf (MSG_NOTE, "load ");
2659 else if (STMT_VINFO_STRIDED_P (stmt_info))
2660 dump_printf (MSG_NOTE, "strided store ");
2661 else
2662 dump_printf (MSG_NOTE, "store ");
2663 dump_printf (MSG_NOTE, "of size %u\n",
2664 (unsigned)groupsize);
2665 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2666 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2667 while (next)
2669 if (DR_GROUP_GAP (next) != 1)
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "\t<gap of %d elements>\n",
2672 DR_GROUP_GAP (next) - 1);
2673 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2674 next = DR_GROUP_NEXT_ELEMENT (next);
2676 if (DR_GROUP_GAP (stmt_info) != 0)
2677 dump_printf_loc (MSG_NOTE, vect_location,
2678 "\t<gap of %d elements>\n",
2679 DR_GROUP_GAP (stmt_info));
2682 /* SLP: create an SLP data structure for every interleaving group of
2683 stores for further analysis in vect_analyze_slp. */
2684 if (DR_IS_WRITE (dr) && !slp_impossible)
2686 if (loop_vinfo)
2687 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2688 if (bb_vinfo)
2689 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2693 return true;
2696 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2697 accesses of legal size, step, etc. Detect gaps, single element
2698 interleaving, and other special cases. Set grouped access info.
2699 Collect groups of strided stores for further use in SLP analysis. */
2701 static bool
2702 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2704 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2706 /* Dissolve the group if present. */
2707 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2708 while (stmt_info)
2710 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2711 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2712 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2713 stmt_info = next;
2715 return false;
2717 return true;
2720 /* Analyze the access pattern of the data-reference DR_INFO.
2721 In case of non-consecutive accesses call vect_analyze_group_access() to
2722 analyze groups of accesses. */
2724 static bool
2725 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2727 data_reference *dr = dr_info->dr;
2728 tree step = DR_STEP (dr);
2729 tree scalar_type = TREE_TYPE (DR_REF (dr));
2730 stmt_vec_info stmt_info = dr_info->stmt;
2731 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2732 class loop *loop = NULL;
2734 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2735 return true;
2737 if (loop_vinfo)
2738 loop = LOOP_VINFO_LOOP (loop_vinfo);
2740 if (loop_vinfo && !step)
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744 "bad data-ref access in loop\n");
2745 return false;
2748 /* Allow loads with zero step in inner-loop vectorization. */
2749 if (loop_vinfo && integer_zerop (step))
2751 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2752 if (!nested_in_vect_loop_p (loop, stmt_info))
2753 return DR_IS_READ (dr);
2754 /* Allow references with zero step for outer loops marked
2755 with pragma omp simd only - it guarantees absence of
2756 loop-carried dependencies between inner loop iterations. */
2757 if (loop->safelen < 2)
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_NOTE, vect_location,
2761 "zero step in inner loop of nest\n");
2762 return false;
2766 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2768 /* Interleaved accesses are not yet supported within outer-loop
2769 vectorization for references in the inner-loop. */
2770 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2772 /* For the rest of the analysis we use the outer-loop step. */
2773 step = STMT_VINFO_DR_STEP (stmt_info);
2774 if (integer_zerop (step))
2776 if (dump_enabled_p ())
2777 dump_printf_loc (MSG_NOTE, vect_location,
2778 "zero step in outer loop.\n");
2779 return DR_IS_READ (dr);
2783 /* Consecutive? */
2784 if (TREE_CODE (step) == INTEGER_CST)
2786 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2787 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2788 || (dr_step < 0
2789 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2791 /* Mark that it is not interleaving. */
2792 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2793 return true;
2797 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_NOTE, vect_location,
2801 "grouped access in outer loop.\n");
2802 return false;
2806 /* Assume this is a DR handled by the non-constant strided load case. */
2807 if (TREE_CODE (step) != INTEGER_CST)
2808 return (STMT_VINFO_STRIDED_P (stmt_info)
2809 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2810 || vect_analyze_group_access (vinfo, dr_info)));
2812 /* Not a consecutive access - check if it's part of an interleaving group. */
2813 return vect_analyze_group_access (vinfo, dr_info);
2816 typedef std::pair<data_reference_p, int> data_ref_pair;
2818 /* Comparison function used to sort data-references DRA and DRB into
2819 chunks suitable for grouping. */
2821 static int
2822 dr_group_sort_cmp (const void *dra_, const void *drb_)
2824 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2825 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2826 data_reference_p dra = dra_pair.first;
2827 data_reference_p drb = drb_pair.first;
2828 int cmp;
2830 /* Stabilize sort. */
2831 if (dra == drb)
2832 return 0;
2834 /* DRs with different group IDs never belong to the same group. */
2835 if (dra_pair.second != drb_pair.second)
2836 return dra_pair.second < drb_pair.second ? -1 : 1;
2838 /* Ordering of DRs according to base. */
2839 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2840 DR_BASE_ADDRESS (drb));
2841 if (cmp != 0)
2842 return cmp;
2844 /* And according to DR_OFFSET. */
2845 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2846 if (cmp != 0)
2847 return cmp;
2849 /* Put reads before writes. */
2850 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2851 return DR_IS_READ (dra) ? -1 : 1;
2853 /* Then sort by access size. */
2854 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2855 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2856 if (cmp != 0)
2857 return cmp;
2859 /* And by step. */
2860 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2861 if (cmp != 0)
2862 return cmp;
2864 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2865 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2866 if (cmp == 0)
2867 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2868 return cmp;
2871 /* If OP is the result of a conversion, return the unconverted value,
2872 otherwise return null. */
2874 static tree
2875 strip_conversion (tree op)
2877 if (TREE_CODE (op) != SSA_NAME)
2878 return NULL_TREE;
2879 gimple *stmt = SSA_NAME_DEF_STMT (op);
2880 if (!is_gimple_assign (stmt)
2881 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2882 return NULL_TREE;
2883 return gimple_assign_rhs1 (stmt);
2886 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2887 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2888 be grouped in SLP mode. */
2890 static bool
2891 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2892 bool allow_slp_p)
2894 if (gimple_assign_single_p (stmt1_info->stmt))
2895 return gimple_assign_single_p (stmt2_info->stmt);
2897 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2898 if (call1 && gimple_call_internal_p (call1))
2900 /* Check for two masked loads or two masked stores. */
2901 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2902 if (!call2 || !gimple_call_internal_p (call2))
2903 return false;
2904 internal_fn ifn = gimple_call_internal_fn (call1);
2905 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2906 return false;
2907 if (ifn != gimple_call_internal_fn (call2))
2908 return false;
2910 /* Check that the masks are the same. Cope with casts of masks,
2911 like those created by build_mask_conversion. */
2912 tree mask1 = gimple_call_arg (call1, 2);
2913 tree mask2 = gimple_call_arg (call2, 2);
2914 if (!operand_equal_p (mask1, mask2, 0)
2915 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2917 mask1 = strip_conversion (mask1);
2918 if (!mask1)
2919 return false;
2920 mask2 = strip_conversion (mask2);
2921 if (!mask2)
2922 return false;
2923 if (!operand_equal_p (mask1, mask2, 0))
2924 return false;
2926 return true;
2929 return false;
2932 /* Function vect_analyze_data_ref_accesses.
2934 Analyze the access pattern of all the data references in the loop.
2936 FORNOW: the only access pattern that is considered vectorizable is a
2937 simple step 1 (consecutive) access.
2939 FORNOW: handle only arrays and pointer accesses. */
2941 opt_result
2942 vect_analyze_data_ref_accesses (vec_info *vinfo,
2943 vec<int> *dataref_groups)
2945 unsigned int i;
2946 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2948 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2950 if (datarefs.is_empty ())
2951 return opt_result::success ();
2953 /* Sort the array of datarefs to make building the interleaving chains
2954 linear. Don't modify the original vector's order, it is needed for
2955 determining what dependencies are reversed. */
2956 vec<data_ref_pair> datarefs_copy;
2957 datarefs_copy.create (datarefs.length ());
2958 for (unsigned i = 0; i < datarefs.length (); i++)
2960 int group_id;
2961 /* If the caller computed DR grouping use that, otherwise group by
2962 basic blocks. */
2963 if (dataref_groups)
2964 group_id = (*dataref_groups)[i];
2965 else
2966 group_id = gimple_bb (DR_STMT (datarefs[i]))->index;
2967 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2969 datarefs_copy.qsort (dr_group_sort_cmp);
2970 hash_set<stmt_vec_info> to_fixup;
2972 /* Build the interleaving chains. */
2973 for (i = 0; i < datarefs_copy.length () - 1;)
2975 data_reference_p dra = datarefs_copy[i].first;
2976 int dra_group_id = datarefs_copy[i].second;
2977 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2978 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2979 stmt_vec_info lastinfo = NULL;
2980 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2981 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2983 ++i;
2984 continue;
2986 for (i = i + 1; i < datarefs_copy.length (); ++i)
2988 data_reference_p drb = datarefs_copy[i].first;
2989 int drb_group_id = datarefs_copy[i].second;
2990 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2991 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2992 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2993 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2994 break;
2996 /* ??? Imperfect sorting (non-compatible types, non-modulo
2997 accesses, same accesses) can lead to a group being artificially
2998 split here as we don't just skip over those. If it really
2999 matters we can push those to a worklist and re-iterate
3000 over them. Then we can just skip ahead to the next DR here. */
3002 /* DRs in a different DR group should not be put into the same
3003 interleaving group. */
3004 if (dra_group_id != drb_group_id)
3005 break;
3007 /* Check that the data-refs have the same first location (except init)
3008 and that they are both either stores or loads (not a load and a
3009 store, and not mismatched masked loads or stores). */
3010 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3011 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3012 DR_BASE_ADDRESS (drb)) != 0
3013 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3014 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3015 break;
3017 /* Check that the data-refs have the same constant size. */
3018 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3019 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3020 if (!tree_fits_uhwi_p (sza)
3021 || !tree_fits_uhwi_p (szb)
3022 || !tree_int_cst_equal (sza, szb))
3023 break;
3025 /* Check that the data-refs have the same step. */
3026 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3027 break;
3029 /* Check the types are compatible.
3030 ??? We don't distinguish this during sorting. */
3031 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3032 TREE_TYPE (DR_REF (drb))))
3033 break;
3035 /* Check that the DR_INITs are compile-time constants. */
3036 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3037 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3038 break;
3040 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3041 just hold extra information. */
3042 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3043 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3044 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3045 break;
3047 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3048 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3049 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3050 HOST_WIDE_INT init_prev
3051 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3052 gcc_assert (init_a <= init_b
3053 && init_a <= init_prev
3054 && init_prev <= init_b);
3056 /* Do not place the same access in the interleaving chain twice. */
3057 if (init_b == init_prev)
3059 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3060 < gimple_uid (DR_STMT (drb)));
3061 /* Simply link in duplicates and fix up the chain below. */
3063 else
3065 /* If init_b == init_a + the size of the type * k, we have an
3066 interleaving, and DRA is accessed before DRB. */
3067 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3068 if (type_size_a == 0
3069 || (init_b - init_a) % type_size_a != 0)
3070 break;
3072 /* If we have a store, the accesses are adjacent. This splits
3073 groups into chunks we support (we don't support vectorization
3074 of stores with gaps). */
3075 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3076 break;
3078 /* If the step (when nonzero and constant) is smaller than the
3079 difference between the data-refs' inits, this splits groups into
3080 suitable sizes. */
3081 if (tree_fits_shwi_p (DR_STEP (dra)))
3083 unsigned HOST_WIDE_INT step
3084 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3085 if (step != 0
3086 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3087 break;
3091 if (dump_enabled_p ())
3092 dump_printf_loc (MSG_NOTE, vect_location,
3093 DR_IS_READ (dra)
3094 ? "Detected interleaving load %T and %T\n"
3095 : "Detected interleaving store %T and %T\n",
3096 DR_REF (dra), DR_REF (drb));
3098 /* Link the found element into the group list. */
3099 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3101 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3102 lastinfo = stmtinfo_a;
3104 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3105 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3106 lastinfo = stmtinfo_b;
3108 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3109 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3111 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "Load suitable for SLP vectorization only.\n");
3115 if (init_b == init_prev
3116 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3117 && dump_enabled_p ())
3118 dump_printf_loc (MSG_NOTE, vect_location,
3119 "Queuing group with duplicate access for fixup\n");
3123 /* Fix up groups with duplicate entries by splitting them. */
3124 while (1)
3126 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3127 if (!(it != to_fixup.end ()))
3128 break;
3129 stmt_vec_info grp = *it;
3130 to_fixup.remove (grp);
3132 /* Find the earliest duplicate group member. */
3133 unsigned first_duplicate = -1u;
3134 stmt_vec_info next, g = grp;
3135 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3137 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3138 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3139 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3140 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3141 g = next;
3143 if (first_duplicate == -1U)
3144 continue;
3146 /* Then move all stmts after the first duplicate to a new group.
3147 Note this is a heuristic but one with the property that *it
3148 is fixed up completely. */
3149 g = grp;
3150 stmt_vec_info newgroup = NULL, ng = grp;
3151 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3153 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3155 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3156 if (!newgroup)
3157 newgroup = next;
3158 else
3159 DR_GROUP_NEXT_ELEMENT (ng) = next;
3160 ng = next;
3161 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3163 else
3164 g = DR_GROUP_NEXT_ELEMENT (g);
3166 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3168 /* Fixup the new group which still may contain duplicates. */
3169 to_fixup.add (newgroup);
3172 data_ref_pair *dr_pair;
3173 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3175 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3176 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3177 && !vect_analyze_data_ref_access (vinfo, dr_info))
3179 if (dump_enabled_p ())
3180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3181 "not vectorized: complicated access pattern.\n");
3183 if (is_a <bb_vec_info> (vinfo))
3185 /* Mark the statement as not vectorizable. */
3186 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3187 continue;
3189 else
3191 datarefs_copy.release ();
3192 return opt_result::failure_at (dr_info->stmt->stmt,
3193 "not vectorized:"
3194 " complicated access pattern.\n");
3199 datarefs_copy.release ();
3200 return opt_result::success ();
3203 /* Function vect_vfa_segment_size.
3205 Input:
3206 DR_INFO: The data reference.
3207 LENGTH_FACTOR: segment length to consider.
3209 Return a value suitable for the dr_with_seg_len::seg_len field.
3210 This is the "distance travelled" by the pointer from the first
3211 iteration in the segment to the last. Note that it does not include
3212 the size of the access; in effect it only describes the first byte. */
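/* For example (purely illustrative numbers): with DR_STEP = 4 bytes and
   LENGTH_FACTOR = 8 iterations, the segment length is (8 - 1) * 4 = 28
   bytes; the bytes of the access itself are accounted for separately by
   vect_vfa_access_size.  */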
3214 static tree
3215 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3217 length_factor = size_binop (MINUS_EXPR,
3218 fold_convert (sizetype, length_factor),
3219 size_one_node);
3220 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3221 length_factor);
3224 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3225 gives the worst-case number of bytes covered by the segment. */
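/* For example (hypothetical numbers): for an int reference (4 bytes) that
   leads a group with DR_GROUP_SIZE = 4 and DR_GROUP_GAP = 1, the access
   size is 4 * (4 - 1) = 12 bytes; if the access additionally uses
   dr_explicit_realign_optimized with a 16-byte vector type, another
   16 - 4 = 12 bytes are added.  */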
3227 static unsigned HOST_WIDE_INT
3228 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3230 stmt_vec_info stmt_vinfo = dr_info->stmt;
3231 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3232 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3233 unsigned HOST_WIDE_INT access_size = ref_size;
3234 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3236 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3237 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3239 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3240 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3241 == dr_explicit_realign_optimized))
3243 /* We might access a full vector's worth. */
3244 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3245 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3247 return access_size;
3250 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3251 describes. */
3253 static unsigned int
3254 vect_vfa_align (dr_vec_info *dr_info)
3256 return dr_alignment (dr_info->dr);
3259 /* Function vect_no_alias_p.
3261 Given data references A and B with equal base and offset, see whether
3262 the alias relation can be decided at compilation time. Return 1 if
3263 it can and the references alias, 0 if it can and the references do
3264 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3265 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3266 of dr_with_seg_len::{seg_len,access_size} for A and B. */
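/* A sketch with made-up values: for positive steps, DR_INITs 0 and 8,
   segment lengths of 28 bytes and access sizes of 4 bytes, the byte
   ranges [0, 32) and [8, 40) are known to overlap, so 1 is returned;
   had the second DR_INIT been 64, the ranges [0, 32) and [64, 96)
   could not overlap and 0 would be returned.  */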
3268 static int
3269 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3270 tree segment_length_a, tree segment_length_b,
3271 unsigned HOST_WIDE_INT access_size_a,
3272 unsigned HOST_WIDE_INT access_size_b)
3274 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3275 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3276 poly_uint64 const_length_a;
3277 poly_uint64 const_length_b;
3279 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3280 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3281 [a, a+12) */
3282 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3284 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3285 offset_a -= const_length_a;
3287 else
3288 const_length_a = tree_to_poly_uint64 (segment_length_a);
3289 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3291 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3292 offset_b -= const_length_b;
3294 else
3295 const_length_b = tree_to_poly_uint64 (segment_length_b);
3297 const_length_a += access_size_a;
3298 const_length_b += access_size_b;
3300 if (ranges_known_overlap_p (offset_a, const_length_a,
3301 offset_b, const_length_b))
3302 return 1;
3304 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3305 offset_b, const_length_b))
3306 return 0;
3308 return -1;
3311 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3312 in DDR is >= VF. */
3314 static bool
3315 dependence_distance_ge_vf (data_dependence_relation *ddr,
3316 unsigned int loop_depth, poly_uint64 vf)
3318 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3319 || DDR_NUM_DIST_VECTS (ddr) == 0)
3320 return false;
3322 /* If the dependence is exact, we should have limited the VF instead. */
3323 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3325 unsigned int i;
3326 lambda_vector dist_v;
3327 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3329 HOST_WIDE_INT dist = dist_v[loop_depth];
3330 if (dist != 0
3331 && !(dist > 0 && DDR_REVERSED_P (ddr))
3332 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3333 return false;
3336 if (dump_enabled_p ())
3337 dump_printf_loc (MSG_NOTE, vect_location,
3338 "dependence distance between %T and %T is >= VF\n",
3339 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3341 return true;
3344 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3346 static void
3347 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3349 dump_printf (dump_kind, "%s (%T) >= ",
3350 lower_bound.unsigned_p ? "unsigned" : "abs",
3351 lower_bound.expr);
3352 dump_dec (dump_kind, lower_bound.min_value);
3355 /* Record that the vectorized loop requires the vec_lower_bound described
3356 by EXPR, UNSIGNED_P and MIN_VALUE. */
3358 static void
3359 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3360 poly_uint64 min_value)
3362 vec<vec_lower_bound> &lower_bounds
3363 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3364 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3365 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3367 unsigned_p &= lower_bounds[i].unsigned_p;
3368 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3369 if (lower_bounds[i].unsigned_p != unsigned_p
3370 || maybe_lt (lower_bounds[i].min_value, min_value))
3372 lower_bounds[i].unsigned_p = unsigned_p;
3373 lower_bounds[i].min_value = min_value;
3374 if (dump_enabled_p ())
3376 dump_printf_loc (MSG_NOTE, vect_location,
3377 "updating run-time check to ");
3378 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3379 dump_printf (MSG_NOTE, "\n");
3382 return;
3385 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3386 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3389 dump_lower_bound (MSG_NOTE, lower_bound);
3390 dump_printf (MSG_NOTE, "\n");
3392 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3395 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3396 will span fewer than GAP bytes. */
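/* For instance (hypothetical values): with an estimated vectorization
   factor of 4, a group size of 2 and a scalar access size of 4 bytes,
   gaps of up to 4 * 2 * 4 = 32 bytes are considered small.  */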
3398 static bool
3399 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3400 poly_int64 gap)
3402 stmt_vec_info stmt_info = dr_info->stmt;
3403 HOST_WIDE_INT count
3404 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3405 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3406 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3407 return (estimated_poly_value (gap)
3408 <= count * vect_get_scalar_dr_size (dr_info));
3411 /* Return true if we know that there is no alias between DR_INFO_A and
3412 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3413 When returning true, set *LOWER_BOUND_OUT to this N. */
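/* For example (illustrative only): two 4-byte accesses to the same base
   at byte offsets 0 and 8 give N = 8 + 4 - 0 = 12, i.e. the accesses
   cannot alias once abs (DR_STEP) >= 12.  */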
3415 static bool
3416 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3417 poly_uint64 *lower_bound_out)
3419 /* Check that there is a constant gap of known sign between DR_A
3420 and DR_B. */
3421 data_reference *dr_a = dr_info_a->dr;
3422 data_reference *dr_b = dr_info_b->dr;
3423 poly_int64 init_a, init_b;
3424 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3425 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3426 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3427 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3428 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3429 || !ordered_p (init_a, init_b))
3430 return false;
3432 /* Sort DR_A and DR_B by the address they access. */
3433 if (maybe_lt (init_b, init_a))
3435 std::swap (init_a, init_b);
3436 std::swap (dr_info_a, dr_info_b);
3437 std::swap (dr_a, dr_b);
3440 /* If the two accesses could be dependent within a scalar iteration,
3441 make sure that we'd retain their order. */
3442 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3443 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3444 return false;
3446 /* There is no alias if abs (DR_STEP) is greater than or equal to
3447 the bytes spanned by the combination of the two accesses. */
3448 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3449 return true;
3452 /* Function vect_prune_runtime_alias_test_list.
3454 Prune a list of ddrs to be tested at run-time by versioning for alias.
3455 Merge several alias checks into one if possible.
3456 Return FALSE if the resulting list of ddrs is longer than allowed by
3457 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3459 opt_result
3460 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3462 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3463 hash_set <tree_pair_hash> compared_objects;
3465 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3466 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3467 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3468 const vec<vec_object_pair> &check_unequal_addrs
3469 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3470 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3471 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3473 ddr_p ddr;
3474 unsigned int i;
3475 tree length_factor;
3477 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3479 /* Step values are irrelevant for aliasing if the number of vector
3480 iterations is equal to the number of scalar iterations (which can
3481 happen for fully-SLP loops). */
3482 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3484 if (!vf_one_p)
3486 /* Convert the checks for nonzero steps into bound tests. */
3487 tree value;
3488 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3489 vect_check_lower_bound (loop_vinfo, value, true, 1);
3492 if (may_alias_ddrs.is_empty ())
3493 return opt_result::success ();
3495 comp_alias_ddrs.create (may_alias_ddrs.length ());
3497 unsigned int loop_depth
3498 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3499 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3501 /* First, we collect all data ref pairs for aliasing checks. */
3502 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3504 poly_uint64 lower_bound;
3505 tree segment_length_a, segment_length_b;
3506 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3507 unsigned int align_a, align_b;
3509 /* Ignore the alias if the VF we chose ended up being no greater
3510 than the dependence distance. */
3511 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3512 continue;
3514 if (DDR_OBJECT_A (ddr))
3516 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3517 if (!compared_objects.add (new_pair))
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "checking that %T and %T"
3522 " have different addresses\n",
3523 new_pair.first, new_pair.second);
3524 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3526 continue;
3529 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3530 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3532 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3533 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3535 bool preserves_scalar_order_p
3536 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3537 bool ignore_step_p
3538 = (vf_one_p
3539 && (preserves_scalar_order_p
3540 || operand_equal_p (DR_STEP (dr_info_a->dr),
3541 DR_STEP (dr_info_b->dr))));
3543 /* Skip the pair if inter-iteration dependencies are irrelevant
3544 and intra-iteration dependencies are guaranteed to be honored. */
3545 if (ignore_step_p
3546 && (preserves_scalar_order_p
3547 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3548 &lower_bound)))
3550 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "no need for alias check between "
3553 "%T and %T when VF is 1\n",
3554 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3555 continue;
3558 /* See whether we can handle the alias using a bounds check on
3559 the step, and whether that's likely to be the best approach.
3560 (It might not be, for example, if the minimum step is much larger
3561 than the number of bytes handled by one vector iteration.) */
3562 if (!ignore_step_p
3563 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3564 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3565 &lower_bound)
3566 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3567 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3569 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3570 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3573 "%T and %T when the step %T is outside ",
3574 DR_REF (dr_info_a->dr),
3575 DR_REF (dr_info_b->dr),
3576 DR_STEP (dr_info_a->dr));
3577 if (unsigned_p)
3578 dump_printf (MSG_NOTE, "[0");
3579 else
3581 dump_printf (MSG_NOTE, "(");
3582 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3584 dump_printf (MSG_NOTE, ", ");
3585 dump_dec (MSG_NOTE, lower_bound);
3586 dump_printf (MSG_NOTE, ")\n");
3588 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3589 unsigned_p, lower_bound);
3590 continue;
3593 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3594 if (dr_group_first_a)
3596 stmt_info_a = dr_group_first_a;
3597 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3600 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3601 if (dr_group_first_b)
3603 stmt_info_b = dr_group_first_b;
3604 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3607 if (ignore_step_p)
3609 segment_length_a = size_zero_node;
3610 segment_length_b = size_zero_node;
3612 else
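/* Choose how many scalar iterations' worth of accesses each segment
should cover: the vectorization factor is enough when both references
have the same step, otherwise be conservative and use the full scalar
iteration count. */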
3614 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3615 DR_STEP (dr_info_b->dr), 0))
3616 length_factor = scalar_loop_iters;
3617 else
3618 length_factor = size_int (vect_factor);
3619 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3620 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3622 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3623 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3624 align_a = vect_vfa_align (dr_info_a);
3625 align_b = vect_vfa_align (dr_info_b);
3627 /* See whether the alias is known at compilation time. */
3628 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3629 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3630 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3631 DR_OFFSET (dr_info_b->dr), 0)
3632 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3633 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3634 && poly_int_tree_p (segment_length_a)
3635 && poly_int_tree_p (segment_length_b))
3637 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3638 segment_length_a,
3639 segment_length_b,
3640 access_size_a,
3641 access_size_b);
3642 if (res >= 0 && dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "can tell at compile time that %T and %T",
3646 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3647 if (res == 0)
3648 dump_printf (MSG_NOTE, " do not alias\n");
3649 else
3650 dump_printf (MSG_NOTE, " alias\n");
3653 if (res == 0)
3654 continue;
3656 if (res == 1)
3657 return opt_result::failure_at (stmt_info_b->stmt,
3658 "not vectorized:"
3659 " compilation time alias: %G%G",
3660 stmt_info_a->stmt,
3661 stmt_info_b->stmt);
3664 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3665 access_size_a, align_a);
3666 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3667 access_size_b, align_b);
3668 /* Canonicalize the order to be the one that's needed for accurate
3669 RAW, WAR and WAW flags, in cases where the data references are
3670 well-ordered. The order doesn't really matter otherwise,
3671 but we might as well be consistent. */
3672 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3673 std::swap (dr_a, dr_b);
3675 dr_with_seg_len_pair_t dr_with_seg_len_pair
3676 (dr_a, dr_b, (preserves_scalar_order_p
3677 ? dr_with_seg_len_pair_t::WELL_ORDERED
3678 : dr_with_seg_len_pair_t::REORDERED));
3680 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3683 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3685 unsigned int count = (comp_alias_ddrs.length ()
3686 + check_unequal_addrs.length ());
3688 if (count && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
3689 return opt_result::failure_at
3690 (vect_location, "would need a runtime alias check\n");
3692 if (dump_enabled_p ())
3693 dump_printf_loc (MSG_NOTE, vect_location,
3694 "improved number of alias checks from %d to %d\n",
3695 may_alias_ddrs.length (), count);
3696 unsigned limit = param_vect_max_version_for_alias_checks;
3697 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3698 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3699 if (count > limit)
3700 return opt_result::failure_at
3701 (vect_location,
3702 "number of versioning for alias run-time tests exceeds %d "
3703 "(--param vect-max-version-for-alias-checks)\n", limit);
3705 return opt_result::success ();
3708 /* Check whether we can use an internal function for a gather load
3709 or scatter store. READ_P is true for loads and false for stores.
3710 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3711 the type of the memory elements being loaded or stored. OFFSET_TYPE
3712 is the type of the offset that is being applied to the invariant
3713 base address. SCALE is the amount by which the offset should
3714 be multiplied *after* it has been converted to address width.
3716 Return true if the function is supported, storing the function id in
3717 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3719 bool
3720 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3721 tree vectype, tree memory_type, tree offset_type,
3722 int scale, internal_fn *ifn_out,
3723 tree *offset_vectype_out)
3725 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3726 unsigned int element_bits = vector_element_bits (vectype);
3727 if (element_bits != memory_bits)
3728 /* For now the vector elements must be the same width as the
3729 memory elements. */
3730 return false;
3732 /* Work out which function we need. */
3733 internal_fn ifn, alt_ifn;
3734 if (read_p)
3736 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3737 alt_ifn = IFN_MASK_GATHER_LOAD;
3739 else
3741 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3742 alt_ifn = IFN_MASK_SCATTER_STORE;
3745 for (;;)
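/* Try successively wider offset types until the target supports the
gather/scatter (an unmasked operation may also fall back to the masked
variant), giving up once the offset is at least as wide as a pointer
and as the data elements. */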
3747 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3748 if (!offset_vectype)
3749 return false;
3751 /* Test whether the target supports this combination. */
3752 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3753 offset_vectype, scale))
3755 *ifn_out = ifn;
3756 *offset_vectype_out = offset_vectype;
3757 return true;
3759 else if (!masked_p
3760 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3761 memory_type,
3762 offset_vectype,
3763 scale))
3765 *ifn_out = alt_ifn;
3766 *offset_vectype_out = offset_vectype;
3767 return true;
3770 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3771 && TYPE_PRECISION (offset_type) >= element_bits)
3772 return false;
3774 offset_type = build_nonstandard_integer_type
3775 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3779 /* STMT_INFO is a call to an internal gather load or scatter store function.
3780 Describe the operation in INFO. */
3782 static void
3783 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3784 gather_scatter_info *info)
3786 gcall *call = as_a <gcall *> (stmt_info->stmt);
3787 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3788 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3790 info->ifn = gimple_call_internal_fn (call);
3791 info->decl = NULL_TREE;
3792 info->base = gimple_call_arg (call, 0);
3793 info->offset = gimple_call_arg (call, 1);
3794 info->offset_dt = vect_unknown_def_type;
3795 info->offset_vectype = NULL_TREE;
3796 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3797 info->element_type = TREE_TYPE (vectype);
3798 info->memory_type = TREE_TYPE (DR_REF (dr));
3801 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3802 gather load or scatter store. Describe the operation in *INFO if so. */
3804 bool
3805 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3806 gather_scatter_info *info)
3808 HOST_WIDE_INT scale = 1;
3809 poly_int64 pbitpos, pbitsize;
3810 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3811 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3812 tree offtype = NULL_TREE;
3813 tree decl = NULL_TREE, base, off;
3814 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3815 tree memory_type = TREE_TYPE (DR_REF (dr));
3816 machine_mode pmode;
3817 int punsignedp, reversep, pvolatilep = 0;
3818 internal_fn ifn;
3819 tree offset_vectype;
3820 bool masked_p = false;
3822 /* See whether this is already a call to a gather/scatter internal function.
3823 If not, see whether it's a masked load or store. */
3824 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3825 if (call && gimple_call_internal_p (call))
3827 ifn = gimple_call_internal_fn (call);
3828 if (internal_gather_scatter_fn_p (ifn))
3830 vect_describe_gather_scatter_call (stmt_info, info);
3831 return true;
3833 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3836 /* True if we should aim to use internal functions rather than
3837 built-in functions. */
3838 bool use_ifn_p = (DR_IS_READ (dr)
3839 ? supports_vec_gather_load_p ()
3840 : supports_vec_scatter_store_p ());
3842 base = DR_REF (dr);
3843 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3844 see if we can use the def stmt of the address. */
3845 if (masked_p
3846 && TREE_CODE (base) == MEM_REF
3847 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3848 && integer_zerop (TREE_OPERAND (base, 1))
3849 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3851 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3852 if (is_gimple_assign (def_stmt)
3853 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3854 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3857 /* The gather and scatter builtins need address of the form
3858 loop_invariant + vector * {1, 2, 4, 8}
3860 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3861 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3862 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3863 multiplications and additions in it. To get a vector, we need
3864 a single SSA_NAME that will be defined in the loop and will
3865 contain everything that is not loop invariant and that can be
3866 vectorized. The following code attempts to find such a preexisting
3867 SSA_NAME OFF and put the loop invariants into a tree BASE
3868 that can be gimplified before the loop. */
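/* For instance, for a gather like a[b[i]] with 4-byte elements this
typically results in BASE == &a, OFF being the SSA_NAME that holds
b[i] (after any widening conversion is stripped) and SCALE == 4. */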
3869 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3870 &punsignedp, &reversep, &pvolatilep);
3871 if (reversep)
3872 return false;
3874 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3876 if (TREE_CODE (base) == MEM_REF)
3878 if (!integer_zerop (TREE_OPERAND (base, 1)))
3880 if (off == NULL_TREE)
3881 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3882 else
3883 off = size_binop (PLUS_EXPR, off,
3884 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3886 base = TREE_OPERAND (base, 0);
3888 else
3889 base = build_fold_addr_expr (base);
3891 if (off == NULL_TREE)
3892 off = size_zero_node;
3894 /* If base is not loop invariant then, provided off is 0, we start with
3895 just the constant offset in the loop invariant BASE and continue with
3896 base as OFF; otherwise give up.
3897 We could handle that case by gimplifying the addition of base + off
3898 into some SSA_NAME and using that as off, but for now punt. */
3899 if (!expr_invariant_in_loop_p (loop, base))
3901 if (!integer_zerop (off))
3902 return false;
3903 off = base;
3904 base = size_int (pbytepos);
3906 /* Otherwise put base + constant offset into the loop invariant BASE
3907 and continue with OFF. */
3908 else
3910 base = fold_convert (sizetype, base);
3911 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3914 /* OFF at this point may be either a SSA_NAME or some tree expression
3915 from get_inner_reference. Try to peel off loop invariants from it
3916 into BASE as long as possible. */
3917 STRIP_NOPS (off);
3918 while (offtype == NULL_TREE)
3920 enum tree_code code;
3921 tree op0, op1, add = NULL_TREE;
3923 if (TREE_CODE (off) == SSA_NAME)
3925 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3927 if (expr_invariant_in_loop_p (loop, off))
3928 return false;
3930 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3931 break;
3933 op0 = gimple_assign_rhs1 (def_stmt);
3934 code = gimple_assign_rhs_code (def_stmt);
3935 op1 = gimple_assign_rhs2 (def_stmt);
3937 else
3939 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3940 return false;
3941 code = TREE_CODE (off);
3942 extract_ops_from_tree (off, &code, &op0, &op1);
3944 switch (code)
3946 case POINTER_PLUS_EXPR:
3947 case PLUS_EXPR:
3948 if (expr_invariant_in_loop_p (loop, op0))
3950 add = op0;
3951 off = op1;
3952 do_add:
3953 add = fold_convert (sizetype, add);
3954 if (scale != 1)
3955 add = size_binop (MULT_EXPR, add, size_int (scale));
3956 base = size_binop (PLUS_EXPR, base, add);
3957 continue;
3959 if (expr_invariant_in_loop_p (loop, op1))
3961 add = op1;
3962 off = op0;
3963 goto do_add;
3965 break;
3966 case MINUS_EXPR:
3967 if (expr_invariant_in_loop_p (loop, op1))
3969 add = fold_convert (sizetype, op1);
3970 add = size_binop (MINUS_EXPR, size_zero_node, add);
3971 off = op0;
3972 goto do_add;
3974 break;
3975 case MULT_EXPR:
3976 if (scale == 1 && tree_fits_shwi_p (op1))
3978 int new_scale = tree_to_shwi (op1);
3979 /* Only treat this as a scaling operation if the target
3980 supports it for at least some offset type. */
3981 if (use_ifn_p
3982 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3983 masked_p, vectype, memory_type,
3984 signed_char_type_node,
3985 new_scale, &ifn,
3986 &offset_vectype)
3987 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3988 masked_p, vectype, memory_type,
3989 unsigned_char_type_node,
3990 new_scale, &ifn,
3991 &offset_vectype))
3992 break;
3993 scale = new_scale;
3994 off = op0;
3995 continue;
3997 break;
3998 case SSA_NAME:
3999 off = op0;
4000 continue;
4001 CASE_CONVERT:
4002 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4003 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4004 break;
4006 /* Don't include the conversion if the target is happy with
4007 the current offset type. */
4008 if (use_ifn_p
4009 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4010 masked_p, vectype, memory_type,
4011 TREE_TYPE (off), scale, &ifn,
4012 &offset_vectype))
4013 break;
4015 if (TYPE_PRECISION (TREE_TYPE (op0))
4016 == TYPE_PRECISION (TREE_TYPE (off)))
4018 off = op0;
4019 continue;
4022 /* Include the conversion if it is widening and either we're using
4023 the IFN path, or the target can handle the converted-from offset
4024 type, or the current offset size is not already the same as the
4025 data vector element size. */
4026 if ((TYPE_PRECISION (TREE_TYPE (op0))
4027 < TYPE_PRECISION (TREE_TYPE (off)))
4028 && (use_ifn_p
4029 || (DR_IS_READ (dr)
4030 ? (targetm.vectorize.builtin_gather
4031 && targetm.vectorize.builtin_gather (vectype,
4032 TREE_TYPE (op0),
4033 scale))
4034 : (targetm.vectorize.builtin_scatter
4035 && targetm.vectorize.builtin_scatter (vectype,
4036 TREE_TYPE (op0),
4037 scale)))
4038 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4039 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4041 off = op0;
4042 offtype = TREE_TYPE (off);
4043 STRIP_NOPS (off);
4044 continue;
4046 break;
4047 default:
4048 break;
4050 break;
4053 /* If at the end OFF still isn't a SSA_NAME or isn't
4054 defined in the loop, punt. */
4055 if (TREE_CODE (off) != SSA_NAME
4056 || expr_invariant_in_loop_p (loop, off))
4057 return false;
4059 if (offtype == NULL_TREE)
4060 offtype = TREE_TYPE (off);
4062 if (use_ifn_p)
4064 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4065 vectype, memory_type, offtype, scale,
4066 &ifn, &offset_vectype))
4067 ifn = IFN_LAST;
4068 decl = NULL_TREE;
4070 else
4072 if (DR_IS_READ (dr))
4074 if (targetm.vectorize.builtin_gather)
4075 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4077 else
4079 if (targetm.vectorize.builtin_scatter)
4080 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4082 ifn = IFN_LAST;
4083 /* The offset vector type will be read from DECL when needed. */
4084 offset_vectype = NULL_TREE;
4087 info->ifn = ifn;
4088 info->decl = decl;
4089 info->base = base;
4090 info->offset = off;
4091 info->offset_dt = vect_unknown_def_type;
4092 info->offset_vectype = offset_vectype;
4093 info->scale = scale;
4094 info->element_type = TREE_TYPE (vectype);
4095 info->memory_type = memory_type;
4096 return true;
4099 /* Find the data references in STMT, analyze them with respect to LOOP and
4100 append them to DATAREFS. Return false if datarefs in this stmt cannot
4101 be handled. */
4103 opt_result
4104 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4105 vec<data_reference_p> *datarefs,
4106 vec<int> *dataref_groups, int group_id)
4108 /* We can ignore clobbers for dataref analysis - they are removed during
4109 loop vectorization and BB vectorization checks dependences with a
4110 stmt walk. */
4111 if (gimple_clobber_p (stmt))
4112 return opt_result::success ();
4114 if (gimple_has_volatile_ops (stmt))
4115 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4116 stmt);
4118 if (stmt_can_throw_internal (cfun, stmt))
4119 return opt_result::failure_at (stmt,
4120 "not vectorized:"
4121 " statement can throw an exception: %G",
4122 stmt);
4124 auto_vec<data_reference_p, 2> refs;
4125 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4126 if (!res)
4127 return res;
4129 if (refs.is_empty ())
4130 return opt_result::success ();
4132 if (refs.length () > 1)
4134 while (!refs.is_empty ())
4135 free_data_ref (refs.pop ());
4136 return opt_result::failure_at (stmt,
4137 "not vectorized: more than one "
4138 "data ref in stmt: %G", stmt);
4141 data_reference_p dr = refs.pop ();
4142 if (gcall *call = dyn_cast <gcall *> (stmt))
4143 if (!gimple_call_internal_p (call)
4144 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4145 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4147 free_data_ref (dr);
4148 return opt_result::failure_at (stmt,
4149 "not vectorized: dr in a call %G", stmt);
4152 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4153 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4155 free_data_ref (dr);
4156 return opt_result::failure_at (stmt,
4157 "not vectorized:"
4158 " statement is bitfield access %G", stmt);
4161 if (DR_BASE_ADDRESS (dr)
4162 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4164 free_data_ref (dr);
4165 return opt_result::failure_at (stmt,
4166 "not vectorized:"
4167 " base addr of dr is a constant\n");
4170 /* Check whether this may be a SIMD lane access and adjust the
4171 DR to make it easier for us to handle it. */
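/* For example, a reference such as array[GOMP_SIMD_LANE (simduid, ...)]
to the per-lane array of an "omp simd" loop is rebuilt below with the
array base, a zero offset and a step of one element, and is then
marked as a simd-lane access. */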
4172 if (loop
4173 && loop->simduid
4174 && (!DR_BASE_ADDRESS (dr)
4175 || !DR_OFFSET (dr)
4176 || !DR_INIT (dr)
4177 || !DR_STEP (dr)))
4179 struct data_reference *newdr
4180 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4181 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4182 if (DR_BASE_ADDRESS (newdr)
4183 && DR_OFFSET (newdr)
4184 && DR_INIT (newdr)
4185 && DR_STEP (newdr)
4186 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4187 && integer_zerop (DR_STEP (newdr)))
4189 tree base_address = DR_BASE_ADDRESS (newdr);
4190 tree off = DR_OFFSET (newdr);
4191 tree step = ssize_int (1);
4192 if (integer_zerop (off)
4193 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4195 off = TREE_OPERAND (base_address, 1);
4196 base_address = TREE_OPERAND (base_address, 0);
4198 STRIP_NOPS (off);
4199 if (TREE_CODE (off) == MULT_EXPR
4200 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4202 step = TREE_OPERAND (off, 1);
4203 off = TREE_OPERAND (off, 0);
4204 STRIP_NOPS (off);
4206 if (CONVERT_EXPR_P (off)
4207 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4208 < TYPE_PRECISION (TREE_TYPE (off))))
4209 off = TREE_OPERAND (off, 0);
4210 if (TREE_CODE (off) == SSA_NAME)
4212 gimple *def = SSA_NAME_DEF_STMT (off);
4213 /* Look through widening conversion. */
4214 if (is_gimple_assign (def)
4215 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4217 tree rhs1 = gimple_assign_rhs1 (def);
4218 if (TREE_CODE (rhs1) == SSA_NAME
4219 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4220 && (TYPE_PRECISION (TREE_TYPE (off))
4221 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4222 def = SSA_NAME_DEF_STMT (rhs1);
4224 if (is_gimple_call (def)
4225 && gimple_call_internal_p (def)
4226 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4228 tree arg = gimple_call_arg (def, 0);
4229 tree reft = TREE_TYPE (DR_REF (newdr));
4230 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4231 arg = SSA_NAME_VAR (arg);
4232 if (arg == loop->simduid
4233 /* For now. */
4234 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4236 DR_BASE_ADDRESS (newdr) = base_address;
4237 DR_OFFSET (newdr) = ssize_int (0);
4238 DR_STEP (newdr) = step;
4239 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4240 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4241 /* Mark as simd-lane access. */
4242 tree arg2 = gimple_call_arg (def, 1);
4243 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4244 free_data_ref (dr);
4245 datarefs->safe_push (newdr);
4246 if (dataref_groups)
4247 dataref_groups->safe_push (group_id);
4248 return opt_result::success ();
4253 free_data_ref (newdr);
4256 datarefs->safe_push (dr);
4257 if (dataref_groups)
4258 dataref_groups->safe_push (group_id);
4259 return opt_result::success ();
4262 /* Function vect_analyze_data_refs.
4264 Find all the data references in the loop or basic block.
4266 The general structure of the analysis of data refs in the vectorizer is as
4267 follows:
4268 1- vect_analyze_data_refs(loop/bb): call
4269 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4270 in the loop/bb and their dependences.
4271 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4272 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4273 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4277 opt_result
4278 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4280 class loop *loop = NULL;
4281 unsigned int i;
4282 struct data_reference *dr;
4283 tree scalar_type;
4285 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4287 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4288 loop = LOOP_VINFO_LOOP (loop_vinfo);
4290 /* Go through the data-refs, check that the analysis succeeded. Update
4291 pointer from stmt_vec_info struct to DR and vectype. */
4293 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4294 FOR_EACH_VEC_ELT (datarefs, i, dr)
4296 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4297 poly_uint64 vf;
4299 gcc_assert (DR_REF (dr));
4300 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4301 gcc_assert (!stmt_info->dr_aux.dr);
4302 stmt_info->dr_aux.dr = dr;
4303 stmt_info->dr_aux.stmt = stmt_info;
4305 /* Check that analysis of the data-ref succeeded. */
4306 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4307 || !DR_STEP (dr))
4309 bool maybe_gather
4310 = DR_IS_READ (dr)
4311 && !TREE_THIS_VOLATILE (DR_REF (dr));
4312 bool maybe_scatter
4313 = DR_IS_WRITE (dr)
4314 && !TREE_THIS_VOLATILE (DR_REF (dr))
4315 && (targetm.vectorize.builtin_scatter != NULL
4316 || supports_vec_scatter_store_p ());
4318 /* If target supports vector gather loads or scatter stores,
4319 see if they can be used. */
4320 if (is_a <loop_vec_info> (vinfo)
4321 && !nested_in_vect_loop_p (loop, stmt_info))
4323 if (maybe_gather || maybe_scatter)
4325 if (maybe_gather)
4326 gatherscatter = GATHER;
4327 else
4328 gatherscatter = SCATTER;
4332 if (gatherscatter == SG_NONE)
4334 if (dump_enabled_p ())
4335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4336 "not vectorized: data ref analysis "
4337 "failed %G", stmt_info->stmt);
4338 if (is_a <bb_vec_info> (vinfo))
4340 /* In BB vectorization the ref can still participate
4341 in dependence analysis, we just can't vectorize it. */
4342 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4343 continue;
4345 return opt_result::failure_at (stmt_info->stmt,
4346 "not vectorized:"
4347 " data ref analysis failed: %G",
4348 stmt_info->stmt);
4352 /* See if this was detected as SIMD lane access. */
4353 if (dr->aux == (void *)-1
4354 || dr->aux == (void *)-2
4355 || dr->aux == (void *)-3
4356 || dr->aux == (void *)-4)
4358 if (nested_in_vect_loop_p (loop, stmt_info))
4359 return opt_result::failure_at (stmt_info->stmt,
4360 "not vectorized:"
4361 " data ref analysis failed: %G",
4362 stmt_info->stmt);
4363 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4364 = -(uintptr_t) dr->aux;
4367 tree base = get_base_address (DR_REF (dr));
4368 if (base && VAR_P (base) && DECL_NONALIASED (base))
4370 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4372 "not vectorized: base object not addressable "
4373 "for stmt: %G", stmt_info->stmt);
4374 if (is_a <bb_vec_info> (vinfo))
4376 /* In BB vectorization the ref can still participate
4377 in dependence analysis, we just can't vectorize it. */
4378 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4379 continue;
4381 return opt_result::failure_at (stmt_info->stmt,
4382 "not vectorized: base object not"
4383 " addressable for stmt: %G",
4384 stmt_info->stmt);
4387 if (is_a <loop_vec_info> (vinfo)
4388 && DR_STEP (dr)
4389 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4391 if (nested_in_vect_loop_p (loop, stmt_info))
4392 return opt_result::failure_at (stmt_info->stmt,
4393 "not vectorized: "
4394 "not suitable for strided load %G",
4395 stmt_info->stmt);
4396 STMT_VINFO_STRIDED_P (stmt_info) = true;
4399 /* Update DR field in stmt_vec_info struct. */
4401 /* If the dataref is in an inner-loop of the loop that is considered for
4402 vectorization, we also want to analyze the access relative to
4403 the outer-loop (DR contains information only relative to the
4404 inner-most enclosing loop). We do that by building a reference to the
4405 first location accessed by the inner-loop, and analyze it relative to
4406 the outer-loop. */
4407 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4409 /* Build a reference to the first location accessed by the
4410 inner loop: *(BASE + INIT + OFFSET). By construction,
4411 this address must be invariant in the inner loop, so we
4412 can consider it as being used in the outer loop. */
4413 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4414 tree offset = unshare_expr (DR_OFFSET (dr));
4415 tree init = unshare_expr (DR_INIT (dr));
4416 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4417 init, offset);
4418 tree init_addr = fold_build_pointer_plus (base, init_offset);
4419 tree init_ref = build_fold_indirect_ref (init_addr);
4421 if (dump_enabled_p ())
4422 dump_printf_loc (MSG_NOTE, vect_location,
4423 "analyze in outer loop: %T\n", init_ref);
4425 opt_result res
4426 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4427 init_ref, loop, stmt_info->stmt);
4428 if (!res)
4429 /* dr_analyze_innermost already explained the failure. */
4430 return res;
4432 if (dump_enabled_p ())
4433 dump_printf_loc (MSG_NOTE, vect_location,
4434 "\touter base_address: %T\n"
4435 "\touter offset from base address: %T\n"
4436 "\touter constant offset from base address: %T\n"
4437 "\touter step: %T\n"
4438 "\touter base alignment: %d\n\n"
4439 "\touter base misalignment: %d\n"
4440 "\touter offset alignment: %d\n"
4441 "\touter step alignment: %d\n",
4442 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4443 STMT_VINFO_DR_OFFSET (stmt_info),
4444 STMT_VINFO_DR_INIT (stmt_info),
4445 STMT_VINFO_DR_STEP (stmt_info),
4446 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4447 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4448 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4449 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4452 /* Set vectype for STMT. */
4453 scalar_type = TREE_TYPE (DR_REF (dr));
4454 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4455 if (!vectype)
4457 if (dump_enabled_p ())
4459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4460 "not vectorized: no vectype for stmt: %G",
4461 stmt_info->stmt);
4462 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4463 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4464 scalar_type);
4465 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4468 if (is_a <bb_vec_info> (vinfo))
4470 /* No vector type is fine, the ref can still participate
4471 in dependence analysis, we just can't vectorize it. */
4472 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4473 continue;
4475 if (fatal)
4476 *fatal = false;
4477 return opt_result::failure_at (stmt_info->stmt,
4478 "not vectorized:"
4479 " no vectype for stmt: %G"
4480 " scalar_type: %T\n",
4481 stmt_info->stmt, scalar_type);
4483 else
4485 if (dump_enabled_p ())
4486 dump_printf_loc (MSG_NOTE, vect_location,
4487 "got vectype for stmt: %G%T\n",
4488 stmt_info->stmt, vectype);
4491 /* Adjust the minimal vectorization factor according to the
4492 vector type. */
4493 vf = TYPE_VECTOR_SUBPARTS (vectype);
4494 *min_vf = upper_bound (*min_vf, vf);
4496 /* Leave the BB vectorizer to pick the vector type later, based on
4497 the final dataref group size and SLP node size. */
4498 if (is_a <loop_vec_info> (vinfo))
4499 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4501 if (gatherscatter != SG_NONE)
4503 gather_scatter_info gs_info;
4504 if (!vect_check_gather_scatter (stmt_info,
4505 as_a <loop_vec_info> (vinfo),
4506 &gs_info)
4507 || !get_vectype_for_scalar_type (vinfo,
4508 TREE_TYPE (gs_info.offset)))
4510 if (fatal)
4511 *fatal = false;
4512 return opt_result::failure_at
4513 (stmt_info->stmt,
4514 (gatherscatter == GATHER)
4515 ? "not vectorized: not suitable for gather load %G"
4516 : "not vectorized: not suitable for scatter store %G",
4517 stmt_info->stmt);
4519 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4523 /* We used to stop processing and prune the list here. Verify we no
4524 longer need to. */
4525 gcc_assert (i == datarefs.length ());
4527 return opt_result::success ();
4531 /* Function vect_get_new_vect_var.
4533 Returns a name for a new variable. The current naming scheme uses the
4534 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4535 VAR_KIND), and appends "_" followed by NAME to the prefix if NAME is
4536 provided. */
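/* For example, with VAR_KIND == vect_pointer_var and NAME == "x" the
new variable is named "vectp_x". */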
4538 tree
4539 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4541 const char *prefix;
4542 tree new_vect_var;
4544 switch (var_kind)
4546 case vect_simple_var:
4547 prefix = "vect";
4548 break;
4549 case vect_scalar_var:
4550 prefix = "stmp";
4551 break;
4552 case vect_mask_var:
4553 prefix = "mask";
4554 break;
4555 case vect_pointer_var:
4556 prefix = "vectp";
4557 break;
4558 default:
4559 gcc_unreachable ();
4562 if (name)
4564 char* tmp = concat (prefix, "_", name, NULL);
4565 new_vect_var = create_tmp_reg (type, tmp);
4566 free (tmp);
4568 else
4569 new_vect_var = create_tmp_reg (type, prefix);
4571 return new_vect_var;
4574 /* Like vect_get_new_vect_var but return an SSA name. */
4576 tree
4577 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4579 const char *prefix;
4580 tree new_vect_var;
4582 switch (var_kind)
4584 case vect_simple_var:
4585 prefix = "vect";
4586 break;
4587 case vect_scalar_var:
4588 prefix = "stmp";
4589 break;
4590 case vect_pointer_var:
4591 prefix = "vectp";
4592 break;
4593 default:
4594 gcc_unreachable ();
4597 if (name)
4599 char* tmp = concat (prefix, "_", name, NULL);
4600 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4601 free (tmp);
4603 else
4604 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4606 return new_vect_var;
4609 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4611 static void
4612 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4614 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4615 int misalign = DR_MISALIGNMENT (dr_info);
4616 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4617 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4618 else
4619 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4620 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4621 misalign);
4624 /* Function vect_create_addr_base_for_vector_ref.
4626 Create an expression that computes the address of the first memory location
4627 that will be accessed for a data reference.
4629 Input:
4630 STMT_INFO: The statement containing the data reference.
4631 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4632 OFFSET: Optional. If supplied, it is added to the initial address.
4633 LOOP: Specify relative to which loop-nest the address should be computed.
4634 For example, when the dataref is in an inner-loop nested in an
4635 outer-loop that is now being vectorized, LOOP can be either the
4636 outer-loop, or the inner-loop. The first memory location accessed
4637 by the following dataref ('in' points to short):
4639 for (i=0; i<N; i++)
4640 for (j=0; j<M; j++)
4641 s += in[i+j]
4643 is as follows:
4644 if LOOP=i_loop: &in (relative to i_loop)
4645 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4646 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4647 initial address. Unlike OFFSET, which is a number of elements to
4648 be added, BYTE_OFFSET is measured in bytes.
4650 Output:
4651 1. Return an SSA_NAME whose value is the address of the memory location of
4652 the first vector of the data reference.
4653 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4654 these statement(s) which define the returned SSA_NAME.
4656 FORNOW: We are only handling array accesses with step 1. */
4658 tree
4659 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4660 gimple_seq *new_stmt_list,
4661 tree offset,
4662 tree byte_offset)
4664 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4665 struct data_reference *dr = dr_info->dr;
4666 const char *base_name;
4667 tree addr_base;
4668 tree dest;
4669 gimple_seq seq = NULL;
4670 tree vect_ptr_type;
4671 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4672 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4673 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4675 tree data_ref_base = unshare_expr (drb->base_address);
4676 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4677 tree init = unshare_expr (drb->init);
4679 if (loop_vinfo)
4680 base_name = get_name (data_ref_base);
4681 else
4683 base_offset = ssize_int (0);
4684 init = ssize_int (0);
4685 base_name = get_name (DR_REF (dr));
4688 /* Create base_offset */
4689 base_offset = size_binop (PLUS_EXPR,
4690 fold_convert (sizetype, base_offset),
4691 fold_convert (sizetype, init));
4693 if (offset)
4695 offset = fold_build2 (MULT_EXPR, sizetype,
4696 fold_convert (sizetype, offset), step);
4697 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4698 base_offset, offset);
4700 if (byte_offset)
4702 byte_offset = fold_convert (sizetype, byte_offset);
4703 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4704 base_offset, byte_offset);
4707 /* base + base_offset */
4708 if (loop_vinfo)
4709 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4710 else
4712 addr_base = build1 (ADDR_EXPR,
4713 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4714 unshare_expr (DR_REF (dr)));
4717 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4718 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4719 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4720 gimple_seq_add_seq (new_stmt_list, seq);
4722 if (DR_PTR_INFO (dr)
4723 && TREE_CODE (addr_base) == SSA_NAME
4724 && !SSA_NAME_PTR_INFO (addr_base))
4726 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4727 if (offset || byte_offset)
4728 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4731 if (dump_enabled_p ())
4732 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4734 return addr_base;
4738 /* Function vect_create_data_ref_ptr.
4740 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4741 location accessed in the loop by STMT_INFO, along with the def-use update
4742 chain to appropriately advance the pointer through the loop iterations.
4743 Also set aliasing information for the pointer. This pointer is used by
4744 the callers to this function to create a memory reference expression for
4745 vector load/store access.
4747 Input:
4748 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4749 GIMPLE_ASSIGN <name, data-ref> or
4750 GIMPLE_ASSIGN <data-ref, name>.
4751 2. AGGR_TYPE: the type of the reference, which should be either a vector
4752 or an array.
4753 3. AT_LOOP: the loop where the vector memref is to be created.
4754 4. OFFSET (optional): an offset to be added to the initial address accessed
4755 by the data-ref in STMT_INFO.
4756 5. BSI: location where the new stmts are to be placed if there is no loop
4757 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4758 pointing to the initial address.
4759 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4760 to the initial address accessed by the data-ref in STMT_INFO. This is
4761 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4762 in bytes.
4763 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4764 to the IV during each iteration of the loop. NULL says to move
4765 by one copy of AGGR_TYPE up or down, depending on the step of the
4766 data reference.
4768 Output:
4769 1. Declare a new ptr to vector_type, and have it point to the base of the
4770 data reference (initial address accessed by the data reference).
4771 For example, for vector of type V8HI, the following code is generated:
4773 v8hi *ap;
4774 ap = (v8hi *)initial_address;
4776 if OFFSET is not supplied:
4777 initial_address = &a[init];
4778 if OFFSET is supplied:
4779 initial_address = &a[init + OFFSET];
4780 if BYTE_OFFSET is supplied:
4781 initial_address = &a[init] + BYTE_OFFSET;
4783 Return the initial_address in INITIAL_ADDRESS.
4785 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4786 update the pointer in each iteration of the loop.
4788 Return the increment stmt that updates the pointer in PTR_INCR.
4790 3. Return the pointer. */
4792 tree
4793 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4794 tree aggr_type, class loop *at_loop, tree offset,
4795 tree *initial_address, gimple_stmt_iterator *gsi,
4796 gimple **ptr_incr, bool only_init,
4797 tree byte_offset, tree iv_step)
4799 const char *base_name;
4800 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4801 class loop *loop = NULL;
4802 bool nested_in_vect_loop = false;
4803 class loop *containing_loop = NULL;
4804 tree aggr_ptr_type;
4805 tree aggr_ptr;
4806 tree new_temp;
4807 gimple_seq new_stmt_list = NULL;
4808 edge pe = NULL;
4809 basic_block new_bb;
4810 tree aggr_ptr_init;
4811 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4812 struct data_reference *dr = dr_info->dr;
4813 tree aptr;
4814 gimple_stmt_iterator incr_gsi;
4815 bool insert_after;
4816 tree indx_before_incr, indx_after_incr;
4817 gimple *incr;
4818 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4820 gcc_assert (iv_step != NULL_TREE
4821 || TREE_CODE (aggr_type) == ARRAY_TYPE
4822 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4824 if (loop_vinfo)
4826 loop = LOOP_VINFO_LOOP (loop_vinfo);
4827 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4828 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4829 pe = loop_preheader_edge (loop);
4831 else
4833 gcc_assert (bb_vinfo);
4834 only_init = true;
4835 *ptr_incr = NULL;
4838 /* Create an expression for the first address accessed by this load
4839 in LOOP. */
4840 base_name = get_name (DR_BASE_ADDRESS (dr));
4842 if (dump_enabled_p ())
4844 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4845 dump_printf_loc (MSG_NOTE, vect_location,
4846 "create %s-pointer variable to type: %T",
4847 get_tree_code_name (TREE_CODE (aggr_type)),
4848 aggr_type);
4849 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4850 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4851 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4852 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4853 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4854 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4855 else
4856 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4857 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4860 /* (1) Create the new aggregate-pointer variable.
4861 Vector and array types inherit the alias set of their component
4862 type by default so we need to use a ref-all pointer if the data
4863 reference does not conflict with the created aggregated data
4864 reference because it is not addressable. */
4865 bool need_ref_all = false;
4866 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4867 get_alias_set (DR_REF (dr))))
4868 need_ref_all = true;
4869 /* Likewise for any of the data references in the stmt group. */
4870 else if (DR_GROUP_SIZE (stmt_info) > 1)
4872 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4875 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4876 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4877 get_alias_set (DR_REF (sdr))))
4879 need_ref_all = true;
4880 break;
4882 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4884 while (sinfo);
4886 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4887 need_ref_all);
4888 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4891 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4892 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4893 def-use update cycles for the pointer: one relative to the outer-loop
4894 (LOOP), which is what steps (3) and (4) below do. The other is relative
4895 to the inner-loop (which is the inner-most loop containing the dataref),
4896 and this is done by step (5) below.
4898 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4899 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4900 redundant. Steps (3),(4) create the following:
4902 vp0 = &base_addr;
4903 LOOP: vp1 = phi(vp0,vp2)
4906 vp2 = vp1 + step
4907 goto LOOP
4909 If there is an inner-loop nested in loop, then step (5) will also be
4910 applied, and an additional update in the inner-loop will be created:
4912 vp0 = &base_addr;
4913 LOOP: vp1 = phi(vp0,vp2)
4915 inner: vp3 = phi(vp1,vp4)
4916 vp4 = vp3 + inner_step
4917 if () goto inner
4919 vp2 = vp1 + step
4920 if () goto LOOP */
4922 /* (2) Calculate the initial address of the aggregate-pointer, and set
4923 the aggregate-pointer to point to it before the loop. */
4925 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4927 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4928 stmt_info, &new_stmt_list,
4929 offset, byte_offset);
4930 if (new_stmt_list)
4932 if (pe)
4934 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4935 gcc_assert (!new_bb);
4937 else
4938 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4941 *initial_address = new_temp;
4942 aggr_ptr_init = new_temp;
4944 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4945 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4946 inner-loop nested in LOOP (during outer-loop vectorization). */
4948 /* No update in loop is required. */
4949 if (only_init && (!loop_vinfo || at_loop == loop))
4950 aptr = aggr_ptr_init;
4951 else
4953 /* Accesses to invariant addresses should be handled specially
4954 by the caller. */
4955 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4956 gcc_assert (!integer_zerop (step));
4958 if (iv_step == NULL_TREE)
4960 /* The step of the aggregate pointer is the type size,
4961 negated for downward accesses. */
4962 iv_step = TYPE_SIZE_UNIT (aggr_type);
4963 if (tree_int_cst_sgn (step) == -1)
4964 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4967 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4969 create_iv (aggr_ptr_init,
4970 fold_convert (aggr_ptr_type, iv_step),
4971 aggr_ptr, loop, &incr_gsi, insert_after,
4972 &indx_before_incr, &indx_after_incr);
4973 incr = gsi_stmt (incr_gsi);
4975 /* Copy the points-to information if it exists. */
4976 if (DR_PTR_INFO (dr))
4978 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4979 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4981 if (ptr_incr)
4982 *ptr_incr = incr;
4984 aptr = indx_before_incr;
4987 if (!nested_in_vect_loop || only_init)
4988 return aptr;
4991 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4992 nested in LOOP, if exists. */
4994 gcc_assert (nested_in_vect_loop);
4995 if (!only_init)
4997 standard_iv_increment_position (containing_loop, &incr_gsi,
4998 &insert_after);
4999 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5000 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5001 &indx_after_incr);
5002 incr = gsi_stmt (incr_gsi);
5004 /* Copy the points-to information if it exists. */
5005 if (DR_PTR_INFO (dr))
5007 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5008 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5010 if (ptr_incr)
5011 *ptr_incr = incr;
5013 return indx_before_incr;
5015 else
5016 gcc_unreachable ();
5020 /* Function bump_vector_ptr
5022 Increment a pointer (to a vector type) by vector-size. If requested,
5023 i.e. if PTR-INCR is given, then also connect the new increment stmt
5024 to the existing def-use update-chain of the pointer, by modifying
5025 the PTR_INCR as illustrated below:
5027 The pointer def-use update-chain before this function:
5028 DATAREF_PTR = phi (p_0, p_2)
5029 ....
5030 PTR_INCR: p_2 = DATAREF_PTR + step
5032 The pointer def-use update-chain after this function:
5033 DATAREF_PTR = phi (p_0, p_2)
5034 ....
5035 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5036 ....
5037 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5039 Input:
5040 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5041 in the loop.
5042 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5043 the loop. The increment amount across iterations is expected
5044 to be vector_size.
5045 BSI - location where the new update stmt is to be placed.
5046 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5047 BUMP - optional. The offset by which to bump the pointer. If not given,
5048 the offset is assumed to be vector_size.
5050 Output: Return NEW_DATAREF_PTR as illustrated above.
5054 tree
5055 bump_vector_ptr (vec_info *vinfo,
5056 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5057 stmt_vec_info stmt_info, tree bump)
5059 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5060 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5061 tree update = TYPE_SIZE_UNIT (vectype);
5062 gimple *incr_stmt;
5063 ssa_op_iter iter;
5064 use_operand_p use_p;
5065 tree new_dataref_ptr;
5067 if (bump)
5068 update = bump;
5070 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5071 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5072 else
5073 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5074 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5075 dataref_ptr, update);
5076 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5077 /* Fold the increment, avoiding excessive use-def chains that would
5078 otherwise lead to compile-time issues for passes until the next
5079 forwprop pass, which would do this as well. */
5080 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5081 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5083 incr_stmt = gsi_stmt (fold_gsi);
5084 update_stmt (incr_stmt);
5087 /* Copy the points-to information if it exists. */
5088 if (DR_PTR_INFO (dr))
5090 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5091 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5094 if (!ptr_incr)
5095 return new_dataref_ptr;
5097 /* Update the vector-pointer's cross-iteration increment. */
5098 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5100 tree use = USE_FROM_PTR (use_p);
5102 if (use == dataref_ptr)
5103 SET_USE (use_p, new_dataref_ptr);
5104 else
5105 gcc_assert (operand_equal_p (use, update, 0));
5108 return new_dataref_ptr;
5112 /* Copy memory reference info such as base/clique from the SRC reference
5113 to the DEST MEM_REF. */
5115 void
5116 vect_copy_ref_info (tree dest, tree src)
5118 if (TREE_CODE (dest) != MEM_REF)
5119 return;
5121 tree src_base = src;
5122 while (handled_component_p (src_base))
5123 src_base = TREE_OPERAND (src_base, 0);
5124 if (TREE_CODE (src_base) != MEM_REF
5125 && TREE_CODE (src_base) != TARGET_MEM_REF)
5126 return;
5128 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5129 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5133 /* Function vect_create_destination_var.
5135 Create a new temporary of type VECTYPE. */
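/* For example, for a scalar SSA name with base variable "sum" and
version 5, this creates a vector destination named "vect_sum_5"
(or "stmp_sum_5" when VECTYPE is NULL). */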
5137 tree
5138 vect_create_destination_var (tree scalar_dest, tree vectype)
5140 tree vec_dest;
5141 const char *name;
5142 char *new_name;
5143 tree type;
5144 enum vect_var_kind kind;
5146 kind = vectype
5147 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5148 ? vect_mask_var
5149 : vect_simple_var
5150 : vect_scalar_var;
5151 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5153 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5155 name = get_name (scalar_dest);
5156 if (name)
5157 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5158 else
5159 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5160 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5161 free (new_name);
5163 return vec_dest;
5166 /* Function vect_grouped_store_supported.
5168 Returns TRUE if interleave high and interleave low permutations
5169 are supported, and FALSE otherwise. */
5171 bool
5172 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5174 machine_mode mode = TYPE_MODE (vectype);
5176 /* vect_permute_store_chain requires the group size to be equal to 3 or
5177 be a power of two. */
5178 if (count != 3 && exact_log2 (count) == -1)
5180 if (dump_enabled_p ())
5181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5182 "the size of the group of accesses"
5183 " is not a power of 2 or not eqaul to 3\n");
5184 return false;
5187 /* Check that the permutation is supported. */
5188 if (VECTOR_MODE_P (mode))
5190 unsigned int i;
5191 if (count == 3)
5193 unsigned int j0 = 0, j1 = 0, j2 = 0;
5194 unsigned int i, j;
5196 unsigned int nelt;
5197 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5199 if (dump_enabled_p ())
5200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5201 "cannot handle groups of 3 stores for"
5202 " variable-length vectors\n");
5203 return false;
5206 vec_perm_builder sel (nelt, nelt, 1);
5207 sel.quick_grow (nelt);
5208 vec_perm_indices indices;
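/* Check the two permutations that vect_permute_store_chain generates
for each of the three output vectors when interleaving a group of
three stores. */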
5209 for (j = 0; j < 3; j++)
5211 int nelt0 = ((3 - j) * nelt) % 3;
5212 int nelt1 = ((3 - j) * nelt + 1) % 3;
5213 int nelt2 = ((3 - j) * nelt + 2) % 3;
5214 for (i = 0; i < nelt; i++)
5216 if (3 * i + nelt0 < nelt)
5217 sel[3 * i + nelt0] = j0++;
5218 if (3 * i + nelt1 < nelt)
5219 sel[3 * i + nelt1] = nelt + j1++;
5220 if (3 * i + nelt2 < nelt)
5221 sel[3 * i + nelt2] = 0;
5223 indices.new_vector (sel, 2, nelt);
5224 if (!can_vec_perm_const_p (mode, indices))
5226 if (dump_enabled_p ())
5227 dump_printf (MSG_MISSED_OPTIMIZATION,
5228 "permutation op not supported by target.\n");
5229 return false;
5232 for (i = 0; i < nelt; i++)
5234 if (3 * i + nelt0 < nelt)
5235 sel[3 * i + nelt0] = 3 * i + nelt0;
5236 if (3 * i + nelt1 < nelt)
5237 sel[3 * i + nelt1] = 3 * i + nelt1;
5238 if (3 * i + nelt2 < nelt)
5239 sel[3 * i + nelt2] = nelt + j2++;
5241 indices.new_vector (sel, 2, nelt);
5242 if (!can_vec_perm_const_p (mode, indices))
5244 if (dump_enabled_p ())
5245 dump_printf (MSG_MISSED_OPTIMIZATION,
5246 "permutation op not supported by target.\n");
5247 return false;
5250 return true;
5252 else
5254 /* If the length is not equal to 3, then only a power of 2 is supported. */
5255 gcc_assert (pow2p_hwi (count));
5256 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5258 /* The encoding has 2 interleaved stepped patterns. */
5259 vec_perm_builder sel (nelt, 2, 3);
5260 sel.quick_grow (6);
5261 for (i = 0; i < 3; i++)
5263 sel[i * 2] = i;
5264 sel[i * 2 + 1] = i + nelt;
5266 vec_perm_indices indices (sel, 2, nelt);
5267 if (can_vec_perm_const_p (mode, indices))
5269 for (i = 0; i < 6; i++)
5270 sel[i] += exact_div (nelt, 2);
5271 indices.new_vector (sel, 2, nelt);
5272 if (can_vec_perm_const_p (mode, indices))
5273 return true;
5278 if (dump_enabled_p ())
5279 dump_printf (MSG_MISSED_OPTIMIZATION,
5280 "permutation op not supported by target.\n");
5281 return false;
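/* Editor's sketch (not part of the original file): the index patterns whose
   target support vect_grouped_store_supported checks above, spelled out with
   plain unsigned ints instead of GCC's vec_perm_builder/poly_uint64.  The
   fixed bound of 64 lanes and the function names are illustrative assumptions
   of the sketch only.  */

/* Interleave-high selector for a power-of-2 group: { 0, nelt, 1, nelt + 1,
   ... }.  Adding nelt / 2 to every element gives the interleave-low
   selector, exactly as the code above does for its 6 encoded elements.  */
static void
sketch_interleave_high_sel (unsigned int nelt, unsigned int *sel)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel[2 * i] = i;                /* lane i of the first input vector  */
      sel[2 * i + 1] = nelt + i;     /* lane i of the second input vector */
    }
}

/* The three "low" selectors used for a group of 3 stores; j0/j1 are carried
   across the three permutations just as in the loop above, and lanes written
   as 0 are don't-care lanes filled by the second ("high") permutation.  */
static void
sketch_store3_low_sels (unsigned int nelt, unsigned int sel[3][64])
{
  unsigned int j0 = 0, j1 = 0;
  for (unsigned int j = 0; j < 3; j++)
    {
      unsigned int nelt0 = (3 - j) * nelt % 3;
      unsigned int nelt1 = ((3 - j) * nelt + 1) % 3;
      unsigned int nelt2 = ((3 - j) * nelt + 2) % 3;
      for (unsigned int i = 0; i < nelt; i++)
        {
          if (3 * i + nelt0 < nelt)
            sel[j][3 * i + nelt0] = j0++;          /* from the first input  */
          if (3 * i + nelt1 < nelt)
            sel[j][3 * i + nelt1] = nelt + j1++;   /* from the second input */
          if (3 * i + nelt2 < nelt)
            sel[j][3 * i + nelt2] = 0;             /* don't-care lane       */
        }
    }
}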
5285 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5286 type VECTYPE. MASKED_P says whether the masked form is needed. */
5288 bool
5289 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5290 bool masked_p)
5292 if (masked_p)
5293 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5294 vec_mask_store_lanes_optab,
5295 vectype, count);
5296 else
5297 return vect_lanes_optab_supported_p ("vec_store_lanes",
5298 vec_store_lanes_optab,
5299 vectype, count);
5303 /* Function vect_permute_store_chain.
5305 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5306 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5307 the data correctly for the stores. Return the final references for stores
5308 in RESULT_CHAIN.
5310 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5311 The input is 4 vectors each containing 8 elements. We assign a number to
5312 each element, the input sequence is:
5314 1st vec: 0 1 2 3 4 5 6 7
5315 2nd vec: 8 9 10 11 12 13 14 15
5316 3rd vec: 16 17 18 19 20 21 22 23
5317 4th vec: 24 25 26 27 28 29 30 31
5319 The output sequence should be:
5321 1st vec: 0 8 16 24 1 9 17 25
5322 2nd vec: 2 10 18 26 3 11 19 27
5323 3rd vec: 4 12 20 28 5 13 21 29
5324 4th vec: 6 14 22 30 7 15 23 31
5326 i.e., we interleave the contents of the four vectors in their order.
5328 We use interleave_high/low instructions to create such output. The input of
5329 each interleave_high/low operation is two vectors:
5330 1st vec 2nd vec
5331 0 1 2 3 4 5 6 7
5332 the even elements of the result vector are obtained left-to-right from the
5333 high/low elements of the first vector. The odd elements of the result are
5334 obtained left-to-right from the high/low elements of the second vector.
5335 The output of interleave_high will be: 0 4 1 5
5336 and of interleave_low: 2 6 3 7
5339 The permutation is done in log LENGTH stages. In each stage interleave_high
5340 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5341 where the first argument is taken from the first half of DR_CHAIN and the
5342 second argument from its second half.
5343 In our example,
5345 I1: interleave_high (1st vec, 3rd vec)
5346 I2: interleave_low (1st vec, 3rd vec)
5347 I3: interleave_high (2nd vec, 4th vec)
5348 I4: interleave_low (2nd vec, 4th vec)
5350 The output for the first stage is:
5352 I1: 0 16 1 17 2 18 3 19
5353 I2: 4 20 5 21 6 22 7 23
5354 I3: 8 24 9 25 10 26 11 27
5355 I4: 12 28 13 29 14 30 15 31
5357 The output of the second stage, i.e. the final result is:
5359 I1: 0 8 16 24 1 9 17 25
5360 I2: 2 10 18 26 3 11 19 27
5361 I3: 4 12 20 28 5 13 21 29
5362 I4: 6 14 22 30 7 15 23 31. */
5364 void
5365 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5366 unsigned int length,
5367 stmt_vec_info stmt_info,
5368 gimple_stmt_iterator *gsi,
5369 vec<tree> *result_chain)
5371 tree vect1, vect2, high, low;
5372 gimple *perm_stmt;
5373 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5374 tree perm_mask_low, perm_mask_high;
5375 tree data_ref;
5376 tree perm3_mask_low, perm3_mask_high;
5377 unsigned int i, j, n, log_length = exact_log2 (length);
5379 result_chain->quick_grow (length);
5380 memcpy (result_chain->address (), dr_chain.address (),
5381 length * sizeof (tree));
5383 if (length == 3)
5385 /* vect_grouped_store_supported ensures that this is constant. */
5386 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5387 unsigned int j0 = 0, j1 = 0, j2 = 0;
5389 vec_perm_builder sel (nelt, nelt, 1);
5390 sel.quick_grow (nelt);
5391 vec_perm_indices indices;
5392 for (j = 0; j < 3; j++)
5394 int nelt0 = ((3 - j) * nelt) % 3;
5395 int nelt1 = ((3 - j) * nelt + 1) % 3;
5396 int nelt2 = ((3 - j) * nelt + 2) % 3;
5398 for (i = 0; i < nelt; i++)
5400 if (3 * i + nelt0 < nelt)
5401 sel[3 * i + nelt0] = j0++;
5402 if (3 * i + nelt1 < nelt)
5403 sel[3 * i + nelt1] = nelt + j1++;
5404 if (3 * i + nelt2 < nelt)
5405 sel[3 * i + nelt2] = 0;
5407 indices.new_vector (sel, 2, nelt);
5408 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5410 for (i = 0; i < nelt; i++)
5412 if (3 * i + nelt0 < nelt)
5413 sel[3 * i + nelt0] = 3 * i + nelt0;
5414 if (3 * i + nelt1 < nelt)
5415 sel[3 * i + nelt1] = 3 * i + nelt1;
5416 if (3 * i + nelt2 < nelt)
5417 sel[3 * i + nelt2] = nelt + j2++;
5419 indices.new_vector (sel, 2, nelt);
5420 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5422 vect1 = dr_chain[0];
5423 vect2 = dr_chain[1];
5425 /* Create interleaving stmt:
5426 low = VEC_PERM_EXPR <vect1, vect2,
5427 {j, nelt, *, j + 1, nelt + j + 1, *,
5428 j + 2, nelt + j + 2, *, ...}> */
5429 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5430 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5431 vect2, perm3_mask_low);
5432 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5434 vect1 = data_ref;
5435 vect2 = dr_chain[2];
5436 /* Create interleaving stmt:
5437 low = VEC_PERM_EXPR <vect1, vect2,
5438 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5439 6, 7, nelt + j + 2, ...}> */
5440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5442 vect2, perm3_mask_high);
5443 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5444 (*result_chain)[j] = data_ref;
5447 else
5449 /* If length is not equal to 3 then only power of 2 is supported. */
5450 gcc_assert (pow2p_hwi (length));
5452 /* The encoding has 2 interleaved stepped patterns. */
5453 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5454 vec_perm_builder sel (nelt, 2, 3);
5455 sel.quick_grow (6);
5456 for (i = 0; i < 3; i++)
5458 sel[i * 2] = i;
5459 sel[i * 2 + 1] = i + nelt;
5461 vec_perm_indices indices (sel, 2, nelt);
5462 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5464 for (i = 0; i < 6; i++)
5465 sel[i] += exact_div (nelt, 2);
5466 indices.new_vector (sel, 2, nelt);
5467 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5469 for (i = 0, n = log_length; i < n; i++)
5471 for (j = 0; j < length/2; j++)
5473 vect1 = dr_chain[j];
5474 vect2 = dr_chain[j+length/2];
5476 /* Create interleaving stmt:
5477 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5478 ...}> */
5479 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5480 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5481 vect2, perm_mask_high);
5482 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5483 (*result_chain)[2*j] = high;
5485 /* Create interleaving stmt:
5486 low = VEC_PERM_EXPR <vect1, vect2,
5487 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5488 ...}> */
5489 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5490 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5491 vect2, perm_mask_low);
5492 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5493 (*result_chain)[2*j+1] = low;
5495 memcpy (dr_chain.address (), result_chain->address (),
5496 length * sizeof (tree));
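/* Editor's sketch (not part of the original file): a standalone scalar model
   of the interleave stages built above, so the 4-vector/8-element example in
   the function comment can be checked by hand.  Plain int arrays stand in
   for vector registers and VEC_PERM_EXPR; the fixed buffer sizes (nelt <= 64,
   length * nelt <= 256) and the function names are assumptions of the sketch,
   not GCC interfaces.  */
#include <string.h>

/* Model of VEC_PERM_EXPR <v1, v2, sel> on nelt-element int vectors.  */
static void
sketch_vec_perm (unsigned int nelt, const int *v1, const int *v2,
                 const unsigned int *sel, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    out[i] = sel[i] < nelt ? v1[sel[i]] : v2[sel[i] - nelt];
}

/* The power-of-2 store permutation: log2 (length) stages of interleave
   high/low over a chain of length vectors, vector j living at
   chain + j * nelt.  Running it on the vectors 0..31 from the comment
   reproduces the documented output (and confirms element 29, not 30, in
   the third result vector).  */
static void
sketch_permute_store_chain (unsigned int nelt, unsigned int length, int *chain)
{
  unsigned int sel_hi[64], sel_lo[64];
  int result[256];
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel_hi[2 * i] = i;
      sel_hi[2 * i + 1] = nelt + i;
      sel_lo[2 * i] = nelt / 2 + i;
      sel_lo[2 * i + 1] = nelt + nelt / 2 + i;
    }
  for (unsigned int stage = 0; (length >> stage) > 1; stage++)
    {
      for (unsigned int j = 0; j < length / 2; j++)
        {
          const int *v1 = chain + j * nelt;
          const int *v2 = chain + (j + length / 2) * nelt;
          sketch_vec_perm (nelt, v1, v2, sel_hi, result + 2 * j * nelt);
          sketch_vec_perm (nelt, v1, v2, sel_lo, result + (2 * j + 1) * nelt);
        }
      memcpy (chain, result, length * nelt * sizeof (int));
    }
}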
5501 /* Function vect_setup_realignment
5503 This function is called when vectorizing an unaligned load using
5504 the dr_explicit_realign[_optimized] scheme.
5505 This function generates the following code at the loop prolog:
5507 p = initial_addr;
5508 x msq_init = *(floor(p)); # prolog load
5509 realignment_token = call target_builtin;
5510 loop:
5511 x msq = phi (msq_init, ---)
5513 The stmts marked with x are generated only for the case of
5514 dr_explicit_realign_optimized.
5516 The code above sets up a new (vector) pointer, pointing to the first
5517 location accessed by STMT_INFO, and a "floor-aligned" load using that
5518 pointer. It also generates code to compute the "realignment-token"
5519 (if the relevant target hook was defined), and creates a phi-node at the
5520 loop-header bb whose arguments are the result of the prolog-load (created
5521 by this function) and the result of a load that takes place in the loop
5522 (to be created by the caller to this function).
5524 For the case of dr_explicit_realign_optimized:
5525 The caller to this function uses the phi-result (msq) to create the
5526 realignment code inside the loop, and sets up the missing phi argument,
5527 as follows:
5528 loop:
5529 msq = phi (msq_init, lsq)
5530 lsq = *(floor(p')); # load in loop
5531 result = realign_load (msq, lsq, realignment_token);
5533 For the case of dr_explicit_realign:
5534 loop:
5535 msq = *(floor(p)); # load in loop
5536 p' = p + (VS-1);
5537 lsq = *(floor(p')); # load in loop
5538 result = realign_load (msq, lsq, realignment_token);
5540 Input:
5541 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5542 a memory location that may be unaligned.
5543 BSI - place where new code is to be inserted.
5544 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5545 is used.
5547 Output:
5548 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5549 target hook, if defined.
5550 Return value - the result of the loop-header phi node. */
5552 tree
5553 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5554 gimple_stmt_iterator *gsi, tree *realignment_token,
5555 enum dr_alignment_support alignment_support_scheme,
5556 tree init_addr,
5557 class loop **at_loop)
5559 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5560 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5561 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5562 struct data_reference *dr = dr_info->dr;
5563 class loop *loop = NULL;
5564 edge pe = NULL;
5565 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5566 tree vec_dest;
5567 gimple *inc;
5568 tree ptr;
5569 tree data_ref;
5570 basic_block new_bb;
5571 tree msq_init = NULL_TREE;
5572 tree new_temp;
5573 gphi *phi_stmt;
5574 tree msq = NULL_TREE;
5575 gimple_seq stmts = NULL;
5576 bool compute_in_loop = false;
5577 bool nested_in_vect_loop = false;
5578 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5579 class loop *loop_for_initial_load = NULL;
5581 if (loop_vinfo)
5583 loop = LOOP_VINFO_LOOP (loop_vinfo);
5584 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5587 gcc_assert (alignment_support_scheme == dr_explicit_realign
5588 || alignment_support_scheme == dr_explicit_realign_optimized);
5590 /* We need to generate three things:
5591 1. the misalignment computation
5592 2. the extra vector load (for the optimized realignment scheme).
5593 3. the phi node for the two vectors from which the realignment is
5594 done (for the optimized realignment scheme). */
5596 /* 1. Determine where to generate the misalignment computation.
5598 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5599 calculation will be generated by this function, outside the loop (in the
5600 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5601 caller, inside the loop.
5603 Background: If the misalignment remains fixed throughout the iterations of
5604 the loop, then both realignment schemes are applicable, and also the
5605 misalignment computation can be done outside LOOP. This is because we are
5606 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5607 are a multiple of VS (the Vector Size), and therefore the misalignment in
5608 different vectorized LOOP iterations is always the same.
5609 The problem arises only if the memory access is in an inner-loop nested
5610 inside LOOP, which is now being vectorized using outer-loop vectorization.
5611 This is the only case when the misalignment of the memory access may not
5612 remain fixed throughout the iterations of the inner-loop (as explained in
5613 detail in vect_supportable_dr_alignment). In this case, not only is the
5614 optimized realignment scheme not applicable, but also the misalignment
5615 computation (and generation of the realignment token that is passed to
5616 REALIGN_LOAD) have to be done inside the loop.
5618 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5619 or not, which in turn determines if the misalignment is computed inside
5620 the inner-loop, or outside LOOP. */
5622 if (init_addr != NULL_TREE || !loop_vinfo)
5624 compute_in_loop = true;
5625 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5629 /* 2. Determine where to generate the extra vector load.
5631 For the optimized realignment scheme, instead of generating two vector
5632 loads in each iteration, we generate a single extra vector load in the
5633 preheader of the loop, and in each iteration reuse the result of the
5634 vector load from the previous iteration. In case the memory access is in
5635 an inner-loop nested inside LOOP, which is now being vectorized using
5636 outer-loop vectorization, we need to determine whether this initial vector
5637 load should be generated at the preheader of the inner-loop, or can be
5638 generated at the preheader of LOOP. If the memory access has no evolution
5639 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5640 to be generated inside LOOP (in the preheader of the inner-loop). */
5642 if (nested_in_vect_loop)
5644 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5645 bool invariant_in_outerloop =
5646 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5647 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5649 else
5650 loop_for_initial_load = loop;
5651 if (at_loop)
5652 *at_loop = loop_for_initial_load;
5654 if (loop_for_initial_load)
5655 pe = loop_preheader_edge (loop_for_initial_load);
5657 /* 3. For the case of the optimized realignment, create the first vector
5658 load at the loop preheader. */
5660 if (alignment_support_scheme == dr_explicit_realign_optimized)
5662 /* Create msq_init = *(floor(p1)) in the loop preheader */
5663 gassign *new_stmt;
5665 gcc_assert (!compute_in_loop);
5666 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5667 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5668 loop_for_initial_load, NULL_TREE,
5669 &init_addr, NULL, &inc, true);
5670 if (TREE_CODE (ptr) == SSA_NAME)
5671 new_temp = copy_ssa_name (ptr);
5672 else
5673 new_temp = make_ssa_name (TREE_TYPE (ptr));
5674 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5675 tree type = TREE_TYPE (ptr);
5676 new_stmt = gimple_build_assign
5677 (new_temp, BIT_AND_EXPR, ptr,
5678 fold_build2 (MINUS_EXPR, type,
5679 build_int_cst (type, 0),
5680 build_int_cst (type, align)));
5681 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5682 gcc_assert (!new_bb);
5683 data_ref
5684 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5685 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5686 vect_copy_ref_info (data_ref, DR_REF (dr));
5687 new_stmt = gimple_build_assign (vec_dest, data_ref);
5688 new_temp = make_ssa_name (vec_dest, new_stmt);
5689 gimple_assign_set_lhs (new_stmt, new_temp);
5690 if (pe)
5692 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5693 gcc_assert (!new_bb);
5695 else
5696 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5698 msq_init = gimple_assign_lhs (new_stmt);
5701 /* 4. Create realignment token using a target builtin, if available.
5702 It is done either inside the containing loop, or before LOOP (as
5703 determined above). */
5705 if (targetm.vectorize.builtin_mask_for_load)
5707 gcall *new_stmt;
5708 tree builtin_decl;
5710 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5711 if (!init_addr)
5713 /* Generate the INIT_ADDR computation outside LOOP. */
5714 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5715 stmt_info, &stmts,
5716 NULL_TREE);
5717 if (loop)
5719 pe = loop_preheader_edge (loop);
5720 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5721 gcc_assert (!new_bb);
5723 else
5724 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5727 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5728 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5729 vec_dest =
5730 vect_create_destination_var (scalar_dest,
5731 gimple_call_return_type (new_stmt));
5732 new_temp = make_ssa_name (vec_dest, new_stmt);
5733 gimple_call_set_lhs (new_stmt, new_temp);
5735 if (compute_in_loop)
5736 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5737 else
5739 /* Generate the misalignment computation outside LOOP. */
5740 pe = loop_preheader_edge (loop);
5741 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5742 gcc_assert (!new_bb);
5745 *realignment_token = gimple_call_lhs (new_stmt);
5747 /* The result of the CALL_EXPR to this builtin is determined from
5748 the value of the parameter and no global variables are touched
5749 which makes the builtin a "const" function. Requiring the
5750 builtin to have the "const" attribute makes it unnecessary
5751 to call mark_call_clobbered. */
5752 gcc_assert (TREE_READONLY (builtin_decl));
5755 if (alignment_support_scheme == dr_explicit_realign)
5756 return msq;
5758 gcc_assert (!compute_in_loop);
5759 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5762 /* 5. Create msq = phi <msq_init, lsq> in loop */
5764 pe = loop_preheader_edge (containing_loop);
5765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5766 msq = make_ssa_name (vec_dest);
5767 phi_stmt = create_phi_node (msq, containing_loop->header);
5768 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5770 return msq;
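/* Editor's sketch (not part of the original file): what the realignment
   scheme set up above computes, modelled on byte arrays.  floor (p) is p
   rounded down to the vector alignment (mirroring the BIT_AND_EXPR with
   -align built above), msq/lsq are the two aligned loads, and the
   "realignment token" is taken to be the byte misalignment of p; real
   targets encode the token differently (e.g. as a permute mask), so this is
   an illustration of the data flow only.  Assumes vs is a power of 2.  */
#include <stdint.h>

/* Model of result = REALIGN_LOAD <msq, lsq, token> for vs-byte vectors.  */
static void
sketch_realign_load (unsigned int vs, const uint8_t *msq, const uint8_t *lsq,
                     unsigned int token, uint8_t *result)
{
  for (unsigned int i = 0; i < vs; i++)
    result[i] = i + token < vs ? msq[i + token] : lsq[i + token - vs];
}

/* An unaligned vs-byte load at p built from the two aligned loads
   msq = *(floor (p)) and lsq = *(floor (p) + vs); lsq is only read when p
   is actually misaligned.  */
static void
sketch_unaligned_load (unsigned int vs, const uint8_t *p, uint8_t *result)
{
  uintptr_t addr = (uintptr_t) p;
  const uint8_t *floor_p = (const uint8_t *) (addr & -(uintptr_t) vs);
  unsigned int token = (unsigned int) (addr - (uintptr_t) floor_p);
  sketch_realign_load (vs, floor_p, floor_p + vs, token, result);
}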
5774 /* Function vect_grouped_load_supported.
5776 COUNT is the size of the load group (the number of statements plus the
5777 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5778 only one statement, with a gap of COUNT - 1.
5780 Returns true if a suitable permute exists. */
5782 bool
5783 vect_grouped_load_supported (tree vectype, bool single_element_p,
5784 unsigned HOST_WIDE_INT count)
5786 machine_mode mode = TYPE_MODE (vectype);
5788 /* If this is single-element interleaving with an element distance
5789 that leaves unused vector loads around punt - we at least create
5790 very sub-optimal code in that case (and blow up memory,
5791 see PR65518). */
5792 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5794 if (dump_enabled_p ())
5795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5796 "single-element interleaving not supported "
5797 "for not adjacent vector loads\n");
5798 return false;
5801 /* vect_permute_load_chain requires the group size to be equal to 3 or
5802 be a power of two. */
5803 if (count != 3 && exact_log2 (count) == -1)
5805 if (dump_enabled_p ())
5806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5807 "the size of the group of accesses"
5808 " is not a power of 2 or not equal to 3\n");
5809 return false;
5812 /* Check that the permutation is supported. */
5813 if (VECTOR_MODE_P (mode))
5815 unsigned int i, j;
5816 if (count == 3)
5818 unsigned int nelt;
5819 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5821 if (dump_enabled_p ())
5822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5823 "cannot handle groups of 3 loads for"
5824 " variable-length vectors\n");
5825 return false;
5828 vec_perm_builder sel (nelt, nelt, 1);
5829 sel.quick_grow (nelt);
5830 vec_perm_indices indices;
5831 unsigned int k;
5832 for (k = 0; k < 3; k++)
5834 for (i = 0; i < nelt; i++)
5835 if (3 * i + k < 2 * nelt)
5836 sel[i] = 3 * i + k;
5837 else
5838 sel[i] = 0;
5839 indices.new_vector (sel, 2, nelt);
5840 if (!can_vec_perm_const_p (mode, indices))
5842 if (dump_enabled_p ())
5843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5844 "shuffle of 3 loads is not supported by"
5845 " target\n");
5846 return false;
5848 for (i = 0, j = 0; i < nelt; i++)
5849 if (3 * i + k < 2 * nelt)
5850 sel[i] = i;
5851 else
5852 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5853 indices.new_vector (sel, 2, nelt);
5854 if (!can_vec_perm_const_p (mode, indices))
5856 if (dump_enabled_p ())
5857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5858 "shuffle of 3 loads is not supported by"
5859 " target\n");
5860 return false;
5863 return true;
5865 else
5867 /* If length is not equal to 3 then only power of 2 is supported. */
5868 gcc_assert (pow2p_hwi (count));
5869 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5871 /* The encoding has a single stepped pattern. */
5872 vec_perm_builder sel (nelt, 1, 3);
5873 sel.quick_grow (3);
5874 for (i = 0; i < 3; i++)
5875 sel[i] = i * 2;
5876 vec_perm_indices indices (sel, 2, nelt);
5877 if (can_vec_perm_const_p (mode, indices))
5879 for (i = 0; i < 3; i++)
5880 sel[i] = i * 2 + 1;
5881 indices.new_vector (sel, 2, nelt);
5882 if (can_vec_perm_const_p (mode, indices))
5883 return true;
5888 if (dump_enabled_p ())
5889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5890 "extract even/odd not supported by target\n");
5891 return false;
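/* Editor's sketch (not part of the original file): the two selectors tested
   above for a group of 3 loads, for output index k, using plain unsigned
   ints in place of vec_perm_builder.  For nelt == 8 and k == 0 this gives
   { 0, 3, 6, 9, 12, 15, *, * } (gather every third element from the first
   two vectors, * = don't care) followed by { 0, 1, 2, 3, 4, 5, 10, 13 }
   (keep those lanes and fill the tail from the third vector), i.e. the
   scalar elements 0 3 6 9 12 15 18 21.  Names are illustrative only.  */
static void
sketch_load3_sels (unsigned int nelt, unsigned int k,
                   unsigned int *sel_low, unsigned int *sel_high)
{
  unsigned int j = 0;
  for (unsigned int i = 0; i < nelt; i++)
    {
      sel_low[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
      sel_high[i] = 3 * i + k < 2 * nelt
                    ? i
                    : nelt + (nelt + k) % 3 + 3 * j++;
    }
}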
5894 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5895 type VECTYPE. MASKED_P says whether the masked form is needed. */
5897 bool
5898 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5899 bool masked_p)
5901 if (masked_p)
5902 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5903 vec_mask_load_lanes_optab,
5904 vectype, count);
5905 else
5906 return vect_lanes_optab_supported_p ("vec_load_lanes",
5907 vec_load_lanes_optab,
5908 vectype, count);
5911 /* Function vect_permute_load_chain.
5913 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5914 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5915 the input data correctly. Return the final references for loads in
5916 RESULT_CHAIN.
5918 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5919 The input is 4 vectors each containing 8 elements. We assign a number to each
5920 element, the input sequence is:
5922 1st vec: 0 1 2 3 4 5 6 7
5923 2nd vec: 8 9 10 11 12 13 14 15
5924 3rd vec: 16 17 18 19 20 21 22 23
5925 4th vec: 24 25 26 27 28 29 30 31
5927 The output sequence should be:
5929 1st vec: 0 4 8 12 16 20 24 28
5930 2nd vec: 1 5 9 13 17 21 25 29
5931 3rd vec: 2 6 10 14 18 22 26 30
5932 4th vec: 3 7 11 15 19 23 27 31
5934 i.e., the first output vector should contain the first elements of each
5935 interleaving group, etc.
5937 We use extract_even/odd instructions to create such output. The input of
5938 each extract_even/odd operation is two vectors
5939 1st vec 2nd vec
5940 0 1 2 3 4 5 6 7
5942 and the output is the vector of extracted even/odd elements. The output of
5943 extract_even will be: 0 2 4 6
5944 and of extract_odd: 1 3 5 7
5947 The permutation is done in log LENGTH stages. In each stage extract_even
5948 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5949 their order. In our example,
5951 E1: extract_even (1st vec, 2nd vec)
5952 E2: extract_odd (1st vec, 2nd vec)
5953 E3: extract_even (3rd vec, 4th vec)
5954 E4: extract_odd (3rd vec, 4th vec)
5956 The output for the first stage will be:
5958 E1: 0 2 4 6 8 10 12 14
5959 E2: 1 3 5 7 9 11 13 15
5960 E3: 16 18 20 22 24 26 28 30
5961 E4: 17 19 21 23 25 27 29 31
5963 In order to proceed and create the correct sequence for the next stage (or
5964 for the correct output, if the second stage is the last one, as in our
5965 example), we first put the output of extract_even operation and then the
5966 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5967 The input for the second stage is:
5969 1st vec (E1): 0 2 4 6 8 10 12 14
5970 2nd vec (E3): 16 18 20 22 24 26 28 30
5971 3rd vec (E2): 1 3 5 7 9 11 13 15
5972 4th vec (E4): 17 19 21 23 25 27 29 31
5974 The output of the second stage:
5976 E1: 0 4 8 12 16 20 24 28
5977 E2: 2 6 10 14 18 22 26 30
5978 E3: 1 5 9 13 17 21 25 29
5979 E4: 3 7 11 15 19 23 27 31
5981 And RESULT_CHAIN after reordering:
5983 1st vec (E1): 0 4 8 12 16 20 24 28
5984 2nd vec (E3): 1 5 9 13 17 21 25 29
5985 3rd vec (E2): 2 6 10 14 18 22 26 30
5986 4th vec (E4): 3 7 11 15 19 23 27 31. */
5988 static void
5989 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5990 unsigned int length,
5991 stmt_vec_info stmt_info,
5992 gimple_stmt_iterator *gsi,
5993 vec<tree> *result_chain)
5995 tree data_ref, first_vect, second_vect;
5996 tree perm_mask_even, perm_mask_odd;
5997 tree perm3_mask_low, perm3_mask_high;
5998 gimple *perm_stmt;
5999 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6000 unsigned int i, j, log_length = exact_log2 (length);
6002 result_chain->quick_grow (length);
6003 memcpy (result_chain->address (), dr_chain.address (),
6004 length * sizeof (tree));
6006 if (length == 3)
6008 /* vect_grouped_load_supported ensures that this is constant. */
6009 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6010 unsigned int k;
6012 vec_perm_builder sel (nelt, nelt, 1);
6013 sel.quick_grow (nelt);
6014 vec_perm_indices indices;
6015 for (k = 0; k < 3; k++)
6017 for (i = 0; i < nelt; i++)
6018 if (3 * i + k < 2 * nelt)
6019 sel[i] = 3 * i + k;
6020 else
6021 sel[i] = 0;
6022 indices.new_vector (sel, 2, nelt);
6023 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6025 for (i = 0, j = 0; i < nelt; i++)
6026 if (3 * i + k < 2 * nelt)
6027 sel[i] = i;
6028 else
6029 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6030 indices.new_vector (sel, 2, nelt);
6031 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6033 first_vect = dr_chain[0];
6034 second_vect = dr_chain[1];
6036 /* Create interleaving stmt (low part of):
6037 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6038 ...}> */
6039 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6040 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6041 second_vect, perm3_mask_low);
6042 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6044 /* Create interleaving stmt (high part of):
6045 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6046 ...}> */
6047 first_vect = data_ref;
6048 second_vect = dr_chain[2];
6049 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6050 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6051 second_vect, perm3_mask_high);
6052 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6053 (*result_chain)[k] = data_ref;
6056 else
6058 /* If length is not equal to 3 then only power of 2 is supported. */
6059 gcc_assert (pow2p_hwi (length));
6061 /* The encoding has a single stepped pattern. */
6062 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6063 vec_perm_builder sel (nelt, 1, 3);
6064 sel.quick_grow (3);
6065 for (i = 0; i < 3; ++i)
6066 sel[i] = i * 2;
6067 vec_perm_indices indices (sel, 2, nelt);
6068 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6070 for (i = 0; i < 3; ++i)
6071 sel[i] = i * 2 + 1;
6072 indices.new_vector (sel, 2, nelt);
6073 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6075 for (i = 0; i < log_length; i++)
6077 for (j = 0; j < length; j += 2)
6079 first_vect = dr_chain[j];
6080 second_vect = dr_chain[j+1];
6082 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6083 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6084 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6085 first_vect, second_vect,
6086 perm_mask_even);
6087 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6088 (*result_chain)[j/2] = data_ref;
6090 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6091 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6092 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6093 first_vect, second_vect,
6094 perm_mask_odd);
6095 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6096 (*result_chain)[j/2+length/2] = data_ref;
6098 memcpy (dr_chain.address (), result_chain->address (),
6099 length * sizeof (tree));
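/* Editor's sketch (not part of the original file): a standalone scalar model
   of the extract_even/odd stages above, matching the 4-vector/8-element
   example in the function comment.  Plain int arrays stand in for vector
   registers; the fixed buffer size (length * nelt <= 256) and the function
   name are assumptions of the sketch.  */
#include <string.h>

static void
sketch_permute_load_chain (unsigned int nelt, unsigned int length, int *chain)
{
  int result[256];
  for (unsigned int stage = 0; (length >> stage) > 1; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          const int *v1 = chain + j * nelt;
          const int *v2 = chain + (j + 1) * nelt;
          /* Even results go to the first half of the chain, odd results to
             the second half, exactly as (*result_chain)[j/2] and
             (*result_chain)[j/2 + length/2] above.  */
          int *even = result + (j / 2) * nelt;
          int *odd = result + (j / 2 + length / 2) * nelt;
          for (unsigned int i = 0; i < nelt; i++)
            {
              even[i] = 2 * i < nelt ? v1[2 * i] : v2[2 * i - nelt];
              odd[i] = 2 * i + 1 < nelt ? v1[2 * i + 1] : v2[2 * i + 1 - nelt];
            }
        }
      memcpy (chain, result, length * nelt * sizeof (int));
    }
}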
6104 /* Function vect_shift_permute_load_chain.
6106 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6107 sequence of stmts to reorder the input data accordingly.
6108 Return the final references for loads in RESULT_CHAIN.
6109 Return true if successful, false otherwise.
6111 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6112 The input is 3 vectors each containing 8 elements. We assign a
6113 number to each element, the input sequence is:
6115 1st vec: 0 1 2 3 4 5 6 7
6116 2nd vec: 8 9 10 11 12 13 14 15
6117 3rd vec: 16 17 18 19 20 21 22 23
6119 The output sequence should be:
6121 1st vec: 0 3 6 9 12 15 18 21
6122 2nd vec: 1 4 7 10 13 16 19 22
6123 3rd vec: 2 5 8 11 14 17 20 23
6125 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6127 First we shuffle all 3 vectors to get correct elements order:
6129 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6130 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6131 3rd vec: (16 19 22) (17 20 23) (18 21)
6133 Next we unite and shift the vectors 3 times:
6135 1st step:
6136 shift right by 6 the concatenation of:
6137 "1st vec" and "2nd vec"
6138 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6139 "2nd vec" and "3rd vec"
6140 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6141 "3rd vec" and "1st vec"
6142 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6143 | New vectors |
6145 So that now new vectors are:
6147 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6148 2nd vec: (10 13) (16 19 22) (17 20 23)
6149 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6151 2nd step:
6152 shift right by 5 the concatenation of:
6153 "1st vec" and "3rd vec"
6154 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6155 "2nd vec" and "1st vec"
6156 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6157 "3rd vec" and "2nd vec"
6158 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6159 | New vectors |
6161 So that now new vectors are:
6163 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6164 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6165 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6167 3rd step:
6168 shift right by 5 the concatenation of:
6169 "1st vec" and "1st vec"
6170 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6171 shift right by 3 the concatenation of:
6172 "2nd vec" and "2nd vec"
6173 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6174 | New vectors |
6176 So that now all vectors are READY:
6177 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6178 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6179 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6181 This algorithm is faster than the one in vect_permute_load_chain if:
6182 1. "shift of a concatenation" is faster than general permutation.
6183 This is usually so.
6184 2. The TARGET machine can't execute vector instructions in parallel.
6185 This is because each step of the algorithm depends on the previous one.
6186 The algorithm in vect_permute_load_chain is much more parallel.
6188 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6191 static bool
6192 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6193 unsigned int length,
6194 stmt_vec_info stmt_info,
6195 gimple_stmt_iterator *gsi,
6196 vec<tree> *result_chain)
6198 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6199 tree perm2_mask1, perm2_mask2, perm3_mask;
6200 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6201 gimple *perm_stmt;
6203 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6204 unsigned int i;
6205 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6207 unsigned HOST_WIDE_INT nelt, vf;
6208 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6209 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6210 /* Not supported for variable-length vectors. */
6211 return false;
6213 vec_perm_builder sel (nelt, nelt, 1);
6214 sel.quick_grow (nelt);
6216 result_chain->quick_grow (length);
6217 memcpy (result_chain->address (), dr_chain.address (),
6218 length * sizeof (tree));
6220 if (pow2p_hwi (length) && vf > 4)
6222 unsigned int j, log_length = exact_log2 (length);
6223 for (i = 0; i < nelt / 2; ++i)
6224 sel[i] = i * 2;
6225 for (i = 0; i < nelt / 2; ++i)
6226 sel[nelt / 2 + i] = i * 2 + 1;
6227 vec_perm_indices indices (sel, 2, nelt);
6228 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "shuffle of 2 fields structure is not \
6233 supported by target\n");
6234 return false;
6236 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6238 for (i = 0; i < nelt / 2; ++i)
6239 sel[i] = i * 2 + 1;
6240 for (i = 0; i < nelt / 2; ++i)
6241 sel[nelt / 2 + i] = i * 2;
6242 indices.new_vector (sel, 2, nelt);
6243 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6245 if (dump_enabled_p ())
6246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6247 "shuffle of 2 fields structure is not \
6248 supported by target\n");
6249 return false;
6251 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6253 /* Generating permutation constant to shift all elements.
6254 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6255 for (i = 0; i < nelt; i++)
6256 sel[i] = nelt / 2 + i;
6257 indices.new_vector (sel, 2, nelt);
6258 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6260 if (dump_enabled_p ())
6261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262 "shift permutation is not supported by target\n");
6263 return false;
6265 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6267 /* Generating permutation constant to select vector from 2.
6268 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6269 for (i = 0; i < nelt / 2; i++)
6270 sel[i] = i;
6271 for (i = nelt / 2; i < nelt; i++)
6272 sel[i] = nelt + i;
6273 indices.new_vector (sel, 2, nelt);
6274 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6276 if (dump_enabled_p ())
6277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6278 "select is not supported by target\n");
6279 return false;
6281 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6283 for (i = 0; i < log_length; i++)
6285 for (j = 0; j < length; j += 2)
6287 first_vect = dr_chain[j];
6288 second_vect = dr_chain[j + 1];
6290 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6291 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6292 first_vect, first_vect,
6293 perm2_mask1);
6294 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6295 vect[0] = data_ref;
6297 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6298 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6299 second_vect, second_vect,
6300 perm2_mask2);
6301 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6302 vect[1] = data_ref;
6304 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6305 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6306 vect[0], vect[1], shift1_mask);
6307 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6308 (*result_chain)[j/2 + length/2] = data_ref;
6310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6312 vect[0], vect[1], select_mask);
6313 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6314 (*result_chain)[j/2] = data_ref;
6316 memcpy (dr_chain.address (), result_chain->address (),
6317 length * sizeof (tree));
6319 return true;
6321 if (length == 3 && vf > 2)
6323 unsigned int k = 0, l = 0;
6325 /* Generating permutation constant to get all elements in the right order.
6326 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6327 for (i = 0; i < nelt; i++)
6329 if (3 * k + (l % 3) >= nelt)
6331 k = 0;
6332 l += (3 - (nelt % 3));
6334 sel[i] = 3 * k + (l % 3);
6335 k++;
6337 vec_perm_indices indices (sel, 2, nelt);
6338 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6340 if (dump_enabled_p ())
6341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6342 "shuffle of 3 fields structure is not \
6343 supported by target\n");
6344 return false;
6346 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6348 /* Generating permutation constant to shift all elements.
6349 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6350 for (i = 0; i < nelt; i++)
6351 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6352 indices.new_vector (sel, 2, nelt);
6353 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "shift permutation is not supported by target\n");
6358 return false;
6360 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6362 /* Generating permutation constant to shift all elements.
6363 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6364 for (i = 0; i < nelt; i++)
6365 sel[i] = 2 * (nelt / 3) + 1 + i;
6366 indices.new_vector (sel, 2, nelt);
6367 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6369 if (dump_enabled_p ())
6370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6371 "shift permutation is not supported by target\n");
6372 return false;
6374 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6376 /* Generating permutation constant to shift all elements.
6377 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6378 for (i = 0; i < nelt; i++)
6379 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6380 indices.new_vector (sel, 2, nelt);
6381 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6383 if (dump_enabled_p ())
6384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6385 "shift permutation is not supported by target\n");
6386 return false;
6388 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6390 /* Generating permutation constant to shift all elements.
6391 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6392 for (i = 0; i < nelt; i++)
6393 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6394 indices.new_vector (sel, 2, nelt);
6395 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6397 if (dump_enabled_p ())
6398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6399 "shift permutation is not supported by target\n");
6400 return false;
6402 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6404 for (k = 0; k < 3; k++)
6406 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6407 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6408 dr_chain[k], dr_chain[k],
6409 perm3_mask);
6410 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6411 vect[k] = data_ref;
6414 for (k = 0; k < 3; k++)
6416 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6417 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6418 vect[k % 3], vect[(k + 1) % 3],
6419 shift1_mask);
6420 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6421 vect_shift[k] = data_ref;
6424 for (k = 0; k < 3; k++)
6426 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6427 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6428 vect_shift[(4 - k) % 3],
6429 vect_shift[(3 - k) % 3],
6430 shift2_mask);
6431 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6432 vect[k] = data_ref;
6435 (*result_chain)[3 - (nelt % 3)] = vect[2];
6437 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6438 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6439 vect[0], shift3_mask);
6440 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6441 (*result_chain)[nelt % 3] = data_ref;
6443 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6444 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6445 vect[1], shift4_mask);
6446 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6447 (*result_chain)[0] = data_ref;
6448 return true;
6450 return false;
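/* Editor's sketch (not part of the original file): the two key masks of the
   length == 3 path above, computed with plain unsigned ints.  For nelt == 8
   it reproduces the values quoted in the comments: the in-vector shuffle
   { 0 3 6 1 4 7 2 5 } and the first shift mask { 6 7 8 9 10 11 12 13 },
   which selects a window of nelt contiguous elements from the concatenation
   of two vectors.  Names are illustrative only.  */
static void
sketch_shift3_masks (unsigned int nelt, unsigned int *perm3,
                     unsigned int *shift1)
{
  unsigned int k = 0, l = 0;
  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Regroup one loaded vector as (0 3 6 ..) (1 4 7 ..) (2 5 ..).  */
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      perm3[i] = 3 * k + (l % 3);
      k++;
      /* Shift mask: lanes 2*(nelt/3) + nelt%3 .. of the concatenation.  */
      shift1[i] = 2 * (nelt / 3) + (nelt % 3) + i;
    }
}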
6453 /* Function vect_transform_grouped_load.
6455 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6456 to perform their permutation and ascribe the result vectorized statements to
6457 the scalar statements.
6460 void
6461 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6462 vec<tree> dr_chain,
6463 int size, gimple_stmt_iterator *gsi)
6465 machine_mode mode;
6466 vec<tree> result_chain = vNULL;
6468 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6469 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6470 vectors, that are ready for vector computation. */
6471 result_chain.create (size);
6473 /* If the reassociation width for the vector type is 2 or greater, the target
6474 machine can execute 2 or more vector instructions in parallel. Otherwise try
6475 to get the chain for the load group using vect_shift_permute_load_chain. */
6476 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6477 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6478 || pow2p_hwi (size)
6479 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6480 gsi, &result_chain))
6481 vect_permute_load_chain (vinfo, dr_chain,
6482 size, stmt_info, gsi, &result_chain);
6483 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6484 result_chain.release ();
6487 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6488 generated as part of the vectorization of STMT_INFO. Assign the statement
6489 for each vector to the associated scalar statement. */
6491 void
6492 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6493 vec<tree> result_chain)
6495 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6496 unsigned int i, gap_count;
6497 tree tmp_data_ref;
6499 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6500 Since we scan the chain starting from its first node, their order
6501 corresponds to the order of data-refs in RESULT_CHAIN. */
6502 stmt_vec_info next_stmt_info = first_stmt_info;
6503 gap_count = 1;
6504 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6506 if (!next_stmt_info)
6507 break;
6509 /* Skip the gaps. Loads created for the gaps will be removed by dead
6510 code elimination pass later. No need to check for the first stmt in
6511 the group, since it always exists.
6512 DR_GROUP_GAP is the number of steps in elements from the previous
6513 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6514 correspond to the gaps. */
6515 if (next_stmt_info != first_stmt_info
6516 && gap_count < DR_GROUP_GAP (next_stmt_info))
6518 gap_count++;
6519 continue;
6522 /* ??? The following needs cleanup after the removal of
6523 DR_GROUP_SAME_DR_STMT. */
6524 if (next_stmt_info)
6526 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6527 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6528 copies, and we put the new vector statement last. */
6529 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6531 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6532 gap_count = 1;
6537 /* Function vect_can_force_dr_alignment_p.
6539 Returns whether the alignment of a DECL can be forced to be aligned
6540 on ALIGNMENT bit boundary. */
6542 bool
6543 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6545 if (!VAR_P (decl))
6546 return false;
6548 if (decl_in_symtab_p (decl)
6549 && !symtab_node::get (decl)->can_increase_alignment_p ())
6550 return false;
6552 if (TREE_STATIC (decl))
6553 return (known_le (alignment,
6554 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6555 else
6556 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6560 /* Return whether the data reference DR_INFO is supported with respect to its
6561 alignment.
6562 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6563 it is aligned, i.e., check if it is possible to vectorize it with different
6564 alignment. */
6566 enum dr_alignment_support
6567 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6568 bool check_aligned_accesses)
6570 data_reference *dr = dr_info->dr;
6571 stmt_vec_info stmt_info = dr_info->stmt;
6572 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6573 machine_mode mode = TYPE_MODE (vectype);
6574 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6575 class loop *vect_loop = NULL;
6576 bool nested_in_vect_loop = false;
6578 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6579 return dr_aligned;
6581 /* For now assume all conditional loads/stores support unaligned
6582 access without any special code. */
6583 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6584 if (gimple_call_internal_p (stmt)
6585 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6586 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6587 return dr_unaligned_supported;
6589 if (loop_vinfo)
6591 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6592 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6595 /* Possibly unaligned access. */
6597 /* We can choose between using the implicit realignment scheme (generating
6598 a misaligned_move stmt) and the explicit realignment scheme (generating
6599 aligned loads with a REALIGN_LOAD). There are two variants to the
6600 explicit realignment scheme: optimized, and unoptimized.
6601 We can optimize the realignment only if the step between consecutive
6602 vector loads is equal to the vector size. Since the vector memory
6603 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6604 is guaranteed that the misalignment amount remains the same throughout the
6605 execution of the vectorized loop. Therefore, we can create the
6606 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6607 at the loop preheader.
6609 However, in the case of outer-loop vectorization, when vectorizing a
6610 memory access in the inner-loop nested within the LOOP that is now being
6611 vectorized, while it is guaranteed that the misalignment of the
6612 vectorized memory access will remain the same in different outer-loop
6613 iterations, it is *not* guaranteed that it will remain the same throughout
6614 the execution of the inner-loop. This is because the inner-loop advances
6615 with the original scalar step (and not in steps of VS). If the inner-loop
6616 step happens to be a multiple of VS, then the misalignment remains fixed
6617 and we can use the optimized realignment scheme. For example:
6619 for (i=0; i<N; i++)
6620 for (j=0; j<M; j++)
6621 s += a[i+j];
6623 When vectorizing the i-loop in the above example, the step between
6624 consecutive vector loads is 1, and so the misalignment does not remain
6625 fixed across the execution of the inner-loop, and the realignment cannot
6626 be optimized (as illustrated in the following pseudo vectorized loop):
6628 for (i=0; i<N; i+=4)
6629 for (j=0; j<M; j++){
6630 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6631 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6632 // (assuming that we start from an aligned address).
6635 We therefore have to use the unoptimized realignment scheme:
6637 for (i=0; i<N; i+=4)
6638 for (j=k; j<M; j+=4)
6639 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6640 // that the misalignment of the initial address is
6641 // 0).
6643 The loop can then be vectorized as follows:
6645 for (k=0; k<4; k++){
6646 rt = get_realignment_token (&vp[k]);
6647 for (i=0; i<N; i+=4){
6648 v1 = vp[i+k];
6649 for (j=k; j<M; j+=4){
6650 v2 = vp[i+j+VS-1];
6651 va = REALIGN_LOAD <v1,v2,rt>;
6652 vs += va;
6653 v1 = v2;
6656 } */
6658 if (DR_IS_READ (dr))
6660 bool is_packed = false;
6661 tree type = (TREE_TYPE (DR_REF (dr)));
6663 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6664 && (!targetm.vectorize.builtin_mask_for_load
6665 || targetm.vectorize.builtin_mask_for_load ()))
6667 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6669 /* If we are doing SLP then the accesses need not have the
6670 same alignment; instead it depends on the SLP group size. */
6671 if (loop_vinfo
6672 && STMT_SLP_TYPE (stmt_info)
6673 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6674 * (DR_GROUP_SIZE
6675 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6676 TYPE_VECTOR_SUBPARTS (vectype)))
6678 else if (!loop_vinfo
6679 || (nested_in_vect_loop
6680 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6681 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6682 return dr_explicit_realign;
6683 else
6684 return dr_explicit_realign_optimized;
6686 if (!known_alignment_for_access_p (dr_info))
6687 is_packed = not_size_aligned (DR_REF (dr));
6689 if (targetm.vectorize.support_vector_misalignment
6690 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6691 /* Can't software pipeline the loads, but can at least do them. */
6692 return dr_unaligned_supported;
6694 else
6696 bool is_packed = false;
6697 tree type = (TREE_TYPE (DR_REF (dr)));
6699 if (!known_alignment_for_access_p (dr_info))
6700 is_packed = not_size_aligned (DR_REF (dr));
6702 if (targetm.vectorize.support_vector_misalignment
6703 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6704 return dr_unaligned_supported;
6707 /* Unsupported. */
6708 return dr_unaligned_unsupported;