altivec.md (altivec_mov<mode>, [...]): Change the RTL attribute "length" from "4...
[official-gcc.git] / gcc / tree-vect-data-refs.c
blobcf9cee5deb8b33a8f22b6e297cc8dd02ec095516
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
155 /* gimple_expr_type already picked the type of the loaded
156 or stored data. */
157 i = ~0U;
158 else if (internal_fn_mask_index (ifn) == 0)
159 i = 1;
161 if (i < gimple_call_num_args (call))
163 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
164 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
166 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
167 if (rhs < lhs)
168 scalar_type = rhs_type;
173 *lhs_size_unit = lhs;
174 *rhs_size_unit = rhs;
175 return scalar_type;
179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
180 tested at run-time. Return TRUE if DDR was successfully inserted.
181 Return false if versioning is not supported. */
183 static opt_result
184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
188 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
189 return opt_result::failure_at (vect_location,
190 "will not create alias checks, as"
191 " --param vect-max-version-for-alias-checks"
192 " == 0\n");
194 opt_result res
195 = runtime_alias_check_p (ddr, loop,
196 optimize_loop_nest_for_speed_p (loop));
197 if (!res)
198 return res;
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return opt_result::success ();
204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
206 static void
207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
209 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
210 for (unsigned int i = 0; i < checks.length(); ++i)
211 if (checks[i] == value)
212 return;
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location,
216 "need run-time check that %T is nonzero\n",
217 value);
218 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
221 /* Return true if we know that the order of vectorized DR_INFO_A and
222 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
223 DR_INFO_B. At least one of the accesses is a write. */
225 static bool
226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
228 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
229 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
231 /* Single statements are always kept in their original order. */
232 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
233 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
234 return true;
236 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
237 SLP group are emitted at the position of the last scalar load and
238 all loads in an interleaving group are emitted at the position
239 of the first scalar load.
240 Stores in a group are emitted at the position of the last scalar store.
241 Compute that position and check whether the resulting order matches
242 the current one.
243 We have not yet decided between SLP and interleaving so we have
244 to conservatively assume both. */
245 stmt_vec_info il_a;
246 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
247 if (last_a)
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 last_a = get_later_stmt (last_a, s);
252 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
254 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
255 s = DR_GROUP_NEXT_ELEMENT (s))
256 if (get_later_stmt (il_a, s) == il_a)
257 il_a = s;
259 else
260 il_a = last_a;
262 else
263 last_a = il_a = stmtinfo_a;
264 stmt_vec_info il_b;
265 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
266 if (last_b)
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 last_b = get_later_stmt (last_b, s);
271 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
273 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
274 s = DR_GROUP_NEXT_ELEMENT (s))
275 if (get_later_stmt (il_b, s) == il_b)
276 il_b = s;
278 else
279 il_b = last_b;
281 else
282 last_b = il_b = stmtinfo_b;
283 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
284 return (/* SLP */
285 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
286 /* Interleaving */
287 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
288 /* Mixed */
289 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
290 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
293 /* A subroutine of vect_analyze_data_ref_dependence. Handle
294 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
295 distances. These distances are conservatively correct but they don't
296 reflect a guaranteed dependence.
298 Return true if this function does all the work necessary to avoid
299 an alias or false if the caller should use the dependence distances
300 to limit the vectorization factor in the usual way. LOOP_DEPTH is
301 the depth of the loop described by LOOP_VINFO and the other arguments
302 are as for vect_analyze_data_ref_dependence. */
304 static bool
305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
306 loop_vec_info loop_vinfo,
307 int loop_depth, unsigned int *max_vf)
309 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
310 lambda_vector dist_v;
311 unsigned int i;
312 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
314 int dist = dist_v[loop_depth];
315 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
317 /* If the user asserted safelen >= DIST consecutive iterations
318 can be executed concurrently, assume independence.
320 ??? An alternative would be to add the alias check even
321 in this case, and vectorize the fallback loop with the
322 maximum VF set to safelen. However, if the user has
323 explicitly given a length, it's less likely that that
324 would be a win. */
325 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
327 if ((unsigned int) loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 continue;
333 /* For dependence distances of 2 or more, we have the option
334 of limiting VF or checking for an alias at runtime.
335 Prefer to check at runtime if we can, to avoid limiting
336 the VF unnecessarily when the bases are in fact independent.
338 Note that the alias checks will be removed if the VF ends up
339 being small enough. */
340 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
341 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
342 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
343 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
344 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
347 return true;
351 /* Function vect_analyze_data_ref_dependence.
353 FIXME: I needed to change the sense of the returned flag.
355 Return FALSE if there (might) exist a dependence between a memory-reference
356 DRA and a memory-reference DRB. When versioning for alias may check a
357 dependence at run-time, return TRUE. Adjust *MAX_VF according to
358 the data dependence. */
360 static opt_result
361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
362 loop_vec_info loop_vinfo,
363 unsigned int *max_vf)
365 unsigned int i;
366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
367 struct data_reference *dra = DDR_A (ddr);
368 struct data_reference *drb = DDR_B (ddr);
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
371 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
372 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
373 lambda_vector dist_v;
374 unsigned int loop_depth;
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379 gcc_unreachable ();
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383 return opt_result::success ();
385 if (dra == drb
386 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387 return opt_result::success ();
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
391 group size. */
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396 return opt_result::success ();
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 get_alias_set (DR_REF (drb))))
408 return opt_result::success ();
410 /* Unknown data dependence. */
411 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
413 /* If user asserted safelen consecutive iterations can be
414 executed concurrently, assume independence. */
415 if (loop->safelen >= 2)
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return opt_result::success ();
423 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
424 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
425 return opt_result::failure_at
426 (stmtinfo_a->stmt,
427 "versioning for alias not supported for: "
428 "can't determine dependence between %T and %T\n",
429 DR_REF (dra), DR_REF (drb));
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
433 "versioning for alias required: "
434 "can't determine dependence between %T and %T\n",
435 DR_REF (dra), DR_REF (drb));
437 /* Add to list of ddrs that need to be tested at run-time. */
438 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
441 /* Known data dependence. */
442 if (DDR_NUM_DIST_VECTS (ddr) == 0)
444 /* If user asserted safelen consecutive iterations can be
445 executed concurrently, assume independence. */
446 if (loop->safelen >= 2)
448 if ((unsigned int) loop->safelen < *max_vf)
449 *max_vf = loop->safelen;
450 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
451 return opt_result::success ();
454 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
455 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 return opt_result::failure_at
457 (stmtinfo_a->stmt,
458 "versioning for alias not supported for: "
459 "bad dist vector for %T and %T\n",
460 DR_REF (dra), DR_REF (drb));
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
464 "versioning for alias required: "
465 "bad dist vector for %T and %T\n",
466 DR_REF (dra), DR_REF (drb));
467 /* Add to list of ddrs that need to be tested at run-time. */
468 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
471 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
473 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
474 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
475 loop_depth, max_vf))
476 return opt_result::success ();
478 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
480 int dist = dist_v[loop_depth];
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "dependence distance = %d.\n", dist);
486 if (dist == 0)
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "dependence distance == 0 between %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
493 /* When we perform grouped accesses and perform implicit CSE
494 by detecting equal accesses and doing disambiguation with
495 runtime alias tests like for
496 .. = a[i];
497 .. = a[i+1];
498 a[i] = ..;
499 a[i+1] = ..;
500 *p = ..;
501 .. = a[i];
502 .. = a[i+1];
503 where we will end up loading { a[i], a[i+1] } once, make
504 sure that inserting group loads before the first load and
505 stores after the last store will do the right thing.
506 Similar for groups like
507 a[i] = ...;
508 ... = a[i];
509 a[i+1] = ...;
510 where loads from the group interleave with the store. */
511 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
512 return opt_result::failure_at (stmtinfo_a->stmt,
513 "READ_WRITE dependence"
514 " in interleaving.\n");
516 if (loop->safelen < 2)
518 tree indicator = dr_zero_step_indicator (dra);
519 if (!indicator || integer_zerop (indicator))
520 return opt_result::failure_at (stmtinfo_a->stmt,
521 "access also has a zero step\n");
522 else if (TREE_CODE (indicator) != INTEGER_CST)
523 vect_check_nonzero_value (loop_vinfo, indicator);
525 continue;
528 if (dist > 0 && DDR_REVERSED_P (ddr))
530 /* If DDR_REVERSED_P the order of the data-refs in DDR was
531 reversed (to make distance vector positive), and the actual
532 distance is negative. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance negative.\n");
536 /* When doing outer loop vectorization, we need to check if there is
537 a backward dependence at the inner loop level if the dependence
538 at the outer loop is reversed. See PR81740. */
539 if (nested_in_vect_loop_p (loop, stmtinfo_a)
540 || nested_in_vect_loop_p (loop, stmtinfo_b))
542 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
543 DDR_LOOP_NEST (ddr));
544 if (dist_v[inner_depth] < 0)
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "not vectorized, dependence "
547 "between data-refs %T and %T\n",
548 DR_REF (dra), DR_REF (drb));
550 /* Record a negative dependence distance to later limit the
551 amount of stmt copying / unrolling we can perform.
552 Only need to handle read-after-write dependence. */
553 if (DR_IS_READ (drb)
554 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
555 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
556 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
557 continue;
560 unsigned int abs_dist = abs (dist);
561 if (abs_dist >= 2 && abs_dist < *max_vf)
563 /* The dependence distance requires reduction of the maximal
564 vectorization factor. */
565 *max_vf = abs_dist;
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "adjusting maximal vectorization factor to %i\n",
569 *max_vf);
572 if (abs_dist >= *max_vf)
574 /* Dependence distance does not create dependence, as far as
575 vectorization is concerned, in this case. */
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_NOTE, vect_location,
578 "dependence distance >= VF.\n");
579 continue;
582 return opt_result::failure_at (stmtinfo_a->stmt,
583 "not vectorized, possible dependence "
584 "between data-refs %T and %T\n",
585 DR_REF (dra), DR_REF (drb));
588 return opt_result::success ();
591 /* Function vect_analyze_data_ref_dependences.
593 Examine all the data references in the loop, and make sure there do not
594 exist any data dependences between them. Set *MAX_VF according to
595 the maximum vectorization factor the data dependences allow. */
597 opt_result
598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
599 unsigned int *max_vf)
601 unsigned int i;
602 struct data_dependence_relation *ddr;
604 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
606 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
608 LOOP_VINFO_DDRS (loop_vinfo)
609 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
610 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
611 /* We need read-read dependences to compute
612 STMT_VINFO_SAME_ALIGN_REFS. */
613 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
614 &LOOP_VINFO_DDRS (loop_vinfo),
615 LOOP_VINFO_LOOP_NEST (loop_vinfo),
616 true);
617 gcc_assert (res);
620 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
622 /* For epilogues we either have no aliases or alias versioning
623 was applied to original loop. Therefore we may just get max_vf
624 using VF of original loop. */
625 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
626 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
627 else
628 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
630 opt_result res
631 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
632 if (!res)
633 return res;
636 return opt_result::success ();
640 /* Function vect_slp_analyze_data_ref_dependence.
642 Return TRUE if there (might) exist a dependence between a memory-reference
643 DRA and a memory-reference DRB for VINFO. When versioning for alias
644 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
645 according to the data dependence. */
647 static bool
648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
649 struct data_dependence_relation *ddr)
651 struct data_reference *dra = DDR_A (ddr);
652 struct data_reference *drb = DDR_B (ddr);
653 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
654 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
656 /* We need to check dependences of statements marked as unvectorizable
657 as well, they still can prohibit vectorization. */
659 /* Independent data accesses. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
661 return false;
663 if (dra == drb)
664 return false;
666 /* Read-read is OK. */
667 if (DR_IS_READ (dra) && DR_IS_READ (drb))
668 return false;
670 /* If dra and drb are part of the same interleaving chain consider
671 them independent. */
672 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
673 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
674 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
675 return false;
677 /* Unknown data dependence. */
678 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "can't determine dependence between %T and %T\n",
683 DR_REF (dra), DR_REF (drb));
685 else if (dump_enabled_p ())
686 dump_printf_loc (MSG_NOTE, vect_location,
687 "determined dependence between %T and %T\n",
688 DR_REF (dra), DR_REF (drb));
690 return true;
694 /* Analyze dependences involved in the transform of SLP NODE. STORES
695 contain the vector of scalar stores of this instance if we are
696 disambiguating the loads. */
698 static bool
699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700 vec<stmt_vec_info> stores,
701 stmt_vec_info last_store_info)
703 /* This walks over all stmts involved in the SLP load/store done
704 in NODE verifying we can sink them up to the last stmt in the
705 group. */
706 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
707 vec_info *vinfo = last_access_info->vinfo;
708 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
710 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
711 if (access_info == last_access_info)
712 continue;
713 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
714 ao_ref ref;
715 bool ref_initialized_p = false;
716 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
717 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
719 gimple *stmt = gsi_stmt (gsi);
720 if (! gimple_vuse (stmt)
721 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
722 continue;
724 /* If we couldn't record a (single) data reference for this
725 stmt we have to resort to the alias oracle. */
726 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
727 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
728 if (!dr_b)
730 /* We are moving a store or sinking a load - this means
731 we cannot use TBAA for disambiguation. */
732 if (!ref_initialized_p)
733 ao_ref_init (&ref, DR_REF (dr_a));
734 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
735 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
736 return false;
737 continue;
740 bool dependent = false;
741 /* If we run into a store of this same instance (we've just
742 marked those) then delay dependence checking until we run
743 into the last store because this is where it will have
744 been sunk to (and we verify if we can do that as well). */
745 if (gimple_visited_p (stmt))
747 if (stmt_info != last_store_info)
748 continue;
749 unsigned i;
750 stmt_vec_info store_info;
751 FOR_EACH_VEC_ELT (stores, i, store_info)
753 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
754 ddr_p ddr = initialize_data_dependence_relation
755 (dr_a, store_dr, vNULL);
756 dependent
757 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
758 free_dependence_relation (ddr);
759 if (dependent)
760 break;
763 else
765 ddr_p ddr = initialize_data_dependence_relation (dr_a,
766 dr_b, vNULL);
767 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
768 free_dependence_relation (ddr);
770 if (dependent)
771 return false;
774 return true;
778 /* Function vect_analyze_data_ref_dependences.
780 Examine all the data references in the basic-block, and make sure there
781 do not exist any data dependences between them. Set *MAX_VF according to
782 the maximum vectorization factor the data dependences allow. */
784 bool
785 vect_slp_analyze_instance_dependence (slp_instance instance)
787 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
789 /* The stores of this instance are at the root of the SLP tree. */
790 slp_tree store = SLP_INSTANCE_TREE (instance);
791 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
792 store = NULL;
794 /* Verify we can sink stores to the vectorized stmt insert location. */
795 stmt_vec_info last_store_info = NULL;
796 if (store)
798 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
799 return false;
801 /* Mark stores in this instance and remember the last one. */
802 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
803 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
804 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
807 bool res = true;
809 /* Verify we can sink loads to the vectorized stmt insert location,
810 special-casing stores of this instance. */
811 slp_tree load;
812 unsigned int i;
813 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
814 if (! vect_slp_analyze_node_dependences (instance, load,
815 store
816 ? SLP_TREE_SCALAR_STMTS (store)
817 : vNULL, last_store_info))
819 res = false;
820 break;
823 /* Unset the visited flag. */
824 if (store)
825 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
826 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
828 return res;
831 /* Record the base alignment guarantee given by DRB, which occurs
832 in STMT_INFO. */
834 static void
835 vect_record_base_alignment (stmt_vec_info stmt_info,
836 innermost_loop_behavior *drb)
838 vec_info *vinfo = stmt_info->vinfo;
839 bool existed;
840 innermost_loop_behavior *&entry
841 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
842 if (!existed || entry->base_alignment < drb->base_alignment)
844 entry = drb;
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_NOTE, vect_location,
847 "recording new base alignment for %T\n"
848 " alignment: %d\n"
849 " misalignment: %d\n"
850 " based on: %G",
851 drb->base_address,
852 drb->base_alignment,
853 drb->base_misalignment,
854 stmt_info->stmt);
858 /* If the region we're going to vectorize is reached, all unconditional
859 data references occur at least once. We can therefore pool the base
860 alignment guarantees from each unconditional reference. Do this by
861 going through all the data references in VINFO and checking whether
862 the containing statement makes the reference unconditionally. If so,
863 record the alignment of the base address in VINFO so that it can be
864 used for all other references with the same base. */
866 void
867 vect_record_base_alignments (vec_info *vinfo)
869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
870 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
871 data_reference *dr;
872 unsigned int i;
873 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
875 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
876 stmt_vec_info stmt_info = dr_info->stmt;
877 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
878 && STMT_VINFO_VECTORIZABLE (stmt_info)
879 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
881 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
883 /* If DR is nested in the loop that is being vectorized, we can also
884 record the alignment of the base wrt the outer loop. */
885 if (loop && nested_in_vect_loop_p (loop, stmt_info))
886 vect_record_base_alignment
887 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
892 /* Return the target alignment for the vectorized form of DR_INFO. */
894 static poly_uint64
895 vect_calculate_target_alignment (dr_vec_info *dr_info)
897 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
898 return targetm.vectorize.preferred_vector_alignment (vectype);
901 /* Function vect_compute_data_ref_alignment
903 Compute the misalignment of the data reference DR_INFO.
905 Output:
906 1. DR_MISALIGNMENT (DR_INFO) is defined.
908 FOR NOW: No analysis is actually performed. Misalignment is calculated
909 only for trivial cases. TODO. */
911 static void
912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
914 stmt_vec_info stmt_info = dr_info->stmt;
915 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
917 struct loop *loop = NULL;
918 tree ref = DR_REF (dr_info->dr);
919 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
921 if (dump_enabled_p ())
922 dump_printf_loc (MSG_NOTE, vect_location,
923 "vect_compute_data_ref_alignment:\n");
925 if (loop_vinfo)
926 loop = LOOP_VINFO_LOOP (loop_vinfo);
928 /* Initialize misalignment to unknown. */
929 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
931 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
932 return;
934 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
935 bool step_preserves_misalignment_p;
937 poly_uint64 vector_alignment
938 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
939 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
941 unsigned HOST_WIDE_INT vect_align_c;
942 if (!vector_alignment.is_constant (&vect_align_c))
943 return;
945 /* No step for BB vectorization. */
946 if (!loop)
948 gcc_assert (integer_zerop (drb->step));
949 step_preserves_misalignment_p = true;
952 /* In case the dataref is in an inner-loop of the loop that is being
953 vectorized (LOOP), we use the base and misalignment information
954 relative to the outer-loop (LOOP). This is ok only if the misalignment
955 stays the same throughout the execution of the inner-loop, which is why
956 we have to check that the stride of the dataref in the inner-loop evenly
957 divides by the vector alignment. */
958 else if (nested_in_vect_loop_p (loop, stmt_info))
960 step_preserves_misalignment_p
961 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
963 if (dump_enabled_p ())
965 if (step_preserves_misalignment_p)
966 dump_printf_loc (MSG_NOTE, vect_location,
967 "inner step divides the vector alignment.\n");
968 else
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "inner step doesn't divide the vector"
971 " alignment.\n");
975 /* Similarly we can only use base and misalignment information relative to
976 an innermost loop if the misalignment stays the same throughout the
977 execution of the loop. As above, this is the case if the stride of
978 the dataref evenly divides by the alignment. */
979 else
981 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
982 step_preserves_misalignment_p
983 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
985 if (!step_preserves_misalignment_p && dump_enabled_p ())
986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 "step doesn't divide the vector alignment.\n");
990 unsigned int base_alignment = drb->base_alignment;
991 unsigned int base_misalignment = drb->base_misalignment;
993 /* Calculate the maximum of the pooled base address alignment and the
994 alignment that we can compute for DR itself. */
995 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
996 if (entry && base_alignment < (*entry)->base_alignment)
998 base_alignment = (*entry)->base_alignment;
999 base_misalignment = (*entry)->base_misalignment;
1002 if (drb->offset_alignment < vect_align_c
1003 || !step_preserves_misalignment_p
1004 /* We need to know whether the step wrt the vectorized loop is
1005 negative when computing the starting misalignment below. */
1006 || TREE_CODE (drb->step) != INTEGER_CST)
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Unknown alignment for access: %T\n", ref);
1011 return;
1014 if (base_alignment < vect_align_c)
1016 unsigned int max_alignment;
1017 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018 if (max_alignment < vect_align_c
1019 || !vect_can_force_dr_alignment_p (base,
1020 vect_align_c * BITS_PER_UNIT))
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "can't force alignment of ref: %T\n", ref);
1025 return;
1028 /* Force the alignment of the decl.
1029 NOTE: This is the only change to the code we make during
1030 the analysis phase, before deciding to vectorize the loop. */
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "force alignment of %T\n", ref);
1035 dr_info->base_decl = base;
1036 dr_info->base_misaligned = true;
1037 base_misalignment = 0;
1039 poly_int64 misalignment
1040 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1042 /* If this is a backward running DR then first access in the larger
1043 vectype actually is N-1 elements before the address in the DR.
1044 Adjust misalign accordingly. */
1045 if (tree_int_cst_sgn (drb->step) < 0)
1046 /* PLUS because STEP is negative. */
1047 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048 * TREE_INT_CST_LOW (drb->step));
1050 unsigned int const_misalignment;
1051 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "Non-constant misalignment for access: %T\n", ref);
1056 return;
1059 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "misalign = %d bytes of ref %T\n",
1064 DR_MISALIGNMENT (dr_info), ref);
1066 return;
1069 /* Function vect_update_misalignment_for_peel.
1070 Sets DR_INFO's misalignment
1071 - to 0 if it has the same alignment as DR_PEEL_INFO,
1072 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1073 - to -1 (unknown) otherwise.
1075 DR_INFO - the data reference whose misalignment is to be adjusted.
1076 DR_PEEL_INFO - the data reference whose misalignment is being made
1077 zero in the vector loop by the peel.
1078 NPEEL - the number of iterations in the peel loop if the misalignment
1079 of DR_PEEL_INFO is known at compile time. */
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083 dr_vec_info *dr_peel_info, int npeel)
1085 unsigned int i;
1086 vec<dr_p> same_aligned_drs;
1087 struct data_reference *current_dr;
1088 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1090 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091 it is aligned in the vector loop. */
1092 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1095 if (current_dr != dr_info->dr)
1096 continue;
1097 gcc_assert (!known_alignment_for_access_p (dr_info)
1098 || !known_alignment_for_access_p (dr_peel_info)
1099 || (DR_MISALIGNMENT (dr_info)
1100 == DR_MISALIGNMENT (dr_peel_info)));
1101 SET_DR_MISALIGNMENT (dr_info, 0);
1102 return;
1105 unsigned HOST_WIDE_INT alignment;
1106 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107 && known_alignment_for_access_p (dr_info)
1108 && known_alignment_for_access_p (dr_peel_info))
1110 int misal = DR_MISALIGNMENT (dr_info);
1111 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112 misal &= alignment - 1;
1113 SET_DR_MISALIGNMENT (dr_info, misal);
1114 return;
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119 "to unknown (-1).\n");
1120 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1124 /* Function verify_data_ref_alignment
1126 Return TRUE if DR_INFO can be handled with respect to alignment. */
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1131 enum dr_alignment_support supportable_dr_alignment
1132 = vect_supportable_dr_alignment (dr_info, false);
1133 if (!supportable_dr_alignment)
1134 return opt_result::failure_at
1135 (dr_info->stmt->stmt,
1136 DR_IS_READ (dr_info->dr)
1137 ? "not vectorized: unsupported unaligned load: %T\n"
1138 : "not vectorized: unsupported unaligned store: %T\n",
1139 DR_REF (dr_info->dr));
1141 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142 dump_printf_loc (MSG_NOTE, vect_location,
1143 "Vectorizing an unaligned access.\n");
1145 return opt_result::success ();
1148 /* Function vect_verify_datarefs_alignment
1150 Return TRUE if all data references in the loop can be
1151 handled with respect to alignment. */
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1156 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157 struct data_reference *dr;
1158 unsigned int i;
1160 FOR_EACH_VEC_ELT (datarefs, i, dr)
1162 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163 stmt_vec_info stmt_info = dr_info->stmt;
1165 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166 continue;
1168 /* For interleaving, only the alignment of the first access matters. */
1169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171 continue;
1173 /* Strided accesses perform only component accesses, alignment is
1174 irrelevant for them. */
1175 if (STMT_VINFO_STRIDED_P (stmt_info)
1176 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177 continue;
1179 opt_result res = verify_data_ref_alignment (dr_info);
1180 if (!res)
1181 return res;
1184 return opt_result::success ();
1187 /* Given an memory reference EXP return whether its alignment is less
1188 than its size. */
1190 static bool
1191 not_size_aligned (tree exp)
1193 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194 return true;
1196 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197 > get_object_alignment (exp));
1200 /* Function vector_alignment_reachable_p
1202 Return true if vector alignment for DR_INFO is reachable by peeling
1203 a few loop iterations. Return false otherwise. */
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1208 stmt_vec_info stmt_info = dr_info->stmt;
1209 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1213 /* For interleaved access we peel only if number of iterations in
1214 the prolog loop ({VF - misalignment}), is a multiple of the
1215 number of the interleaved accesses. */
1216 int elem_size, mis_in_elements;
1218 /* FORNOW: handle only known alignment. */
1219 if (!known_alignment_for_access_p (dr_info))
1220 return false;
1222 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224 elem_size = vector_element_size (vector_size, nelements);
1225 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1227 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228 return false;
1231 /* If misalignment is known at the compile time then allow peeling
1232 only if natural alignment is reachable through peeling. */
1233 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1235 HOST_WIDE_INT elmsize =
1236 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237 if (dump_enabled_p ())
1239 dump_printf_loc (MSG_NOTE, vect_location,
1240 "data size = %wd. misalignment = %d.\n", elmsize,
1241 DR_MISALIGNMENT (dr_info));
1243 if (DR_MISALIGNMENT (dr_info) % elmsize)
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 "data size does not divide the misalignment.\n");
1248 return false;
1252 if (!known_alignment_for_access_p (dr_info))
1254 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "Unknown misalignment, %snaturally aligned\n",
1259 is_packed ? "not " : "");
1260 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1263 return true;
1267 /* Calculate the cost of the memory access represented by DR_INFO. */
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271 unsigned int *inside_cost,
1272 unsigned int *outside_cost,
1273 stmt_vector_for_cost *body_cost_vec,
1274 stmt_vector_for_cost *prologue_cost_vec)
1276 stmt_vec_info stmt_info = dr_info->stmt;
1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278 int ncopies;
1280 if (PURE_SLP_STMT (stmt_info))
1281 ncopies = 1;
1282 else
1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1285 if (DR_IS_READ (dr_info->dr))
1286 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287 prologue_cost_vec, body_cost_vec, false);
1288 else
1289 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "vect_get_data_access_cost: inside_cost = %d, "
1294 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1298 typedef struct _vect_peel_info
1300 dr_vec_info *dr_info;
1301 int npeel;
1302 unsigned int count;
1303 } *vect_peel_info;
1305 typedef struct _vect_peel_extended_info
1307 struct _vect_peel_info peel_info;
1308 unsigned int inside_cost;
1309 unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1313 /* Peeling hashtable helpers. */
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1317 static inline hashval_t hash (const _vect_peel_info *);
1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1324 return (hashval_t) peel_info->npeel;
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1330 return (a->npeel == b->npeel);
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339 int npeel)
1341 struct _vect_peel_info elem, *slot;
1342 _vect_peel_info **new_slot;
1343 bool supportable_dr_alignment
1344 = vect_supportable_dr_alignment (dr_info, true);
1346 elem.npeel = npeel;
1347 slot = peeling_htab->find (&elem);
1348 if (slot)
1349 slot->count++;
1350 else
1352 slot = XNEW (struct _vect_peel_info);
1353 slot->npeel = npeel;
1354 slot->dr_info = dr_info;
1355 slot->count = 1;
1356 new_slot = peeling_htab->find_slot (slot, INSERT);
1357 *new_slot = slot;
1360 if (!supportable_dr_alignment
1361 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362 slot->count += VECT_MAX_COST;
1366 /* Traverse peeling hash table to find peeling option that aligns maximum
1367 number of data accesses. */
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371 _vect_peel_extended_info *max)
1373 vect_peel_info elem = *slot;
1375 if (elem->count > max->peel_info.count
1376 || (elem->count == max->peel_info.count
1377 && max->peel_info.npeel > elem->npeel))
1379 max->peel_info.npeel = elem->npeel;
1380 max->peel_info.count = elem->count;
1381 max->peel_info.dr_info = elem->dr_info;
1384 return 1;
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1389 we assume DR0_INFO's misalignment will be zero after peeling. */
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393 dr_vec_info *dr0_info,
1394 unsigned int *inside_cost,
1395 unsigned int *outside_cost,
1396 stmt_vector_for_cost *body_cost_vec,
1397 stmt_vector_for_cost *prologue_cost_vec,
1398 unsigned int npeel,
1399 bool unknown_misalignment)
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 unsigned i;
1403 data_reference *dr;
1405 FOR_EACH_VEC_ELT (datarefs, i, dr)
1407 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408 stmt_vec_info stmt_info = dr_info->stmt;
1409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 continue;
1412 /* For interleaving, only the alignment of the first access
1413 matters. */
1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 continue;
1418 /* Strided accesses perform only component accesses, alignment is
1419 irrelevant for them. */
1420 if (STMT_VINFO_STRIDED_P (stmt_info)
1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 continue;
1424 int save_misalignment;
1425 save_misalignment = DR_MISALIGNMENT (dr_info);
1426 if (npeel == 0)
1428 else if (unknown_misalignment && dr_info == dr0_info)
1429 SET_DR_MISALIGNMENT (dr_info, 0);
1430 else
1431 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433 body_cost_vec, prologue_cost_vec);
1434 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439 Find the one with the lowest cost. */
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 _vect_peel_extended_info *min)
1445 vect_peel_info elem = *slot;
1446 int dummy;
1447 unsigned int inside_cost = 0, outside_cost = 0;
1448 stmt_vec_info stmt_info = elem->dr_info->stmt;
1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 epilogue_cost_vec;
1453 prologue_cost_vec.create (2);
1454 body_cost_vec.create (2);
1455 epilogue_cost_vec.create (2);
1457 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458 &outside_cost, &body_cost_vec,
1459 &prologue_cost_vec, elem->npeel, false);
1461 body_cost_vec.release ();
1463 outside_cost += vect_get_known_peeling_cost
1464 (loop_vinfo, elem->npeel, &dummy,
1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466 &prologue_cost_vec, &epilogue_cost_vec);
1468 /* Prologue and epilogue costs are added to the target model later.
1469 These costs depend only on the scalar iteration cost, the
1470 number of peeling iterations finally chosen, and the number of
1471 misaligned statements. So discard the information found here. */
1472 prologue_cost_vec.release ();
1473 epilogue_cost_vec.release ();
1475 if (inside_cost < min->inside_cost
1476 || (inside_cost == min->inside_cost
1477 && outside_cost < min->outside_cost))
1479 min->inside_cost = inside_cost;
1480 min->outside_cost = outside_cost;
1481 min->peel_info.dr_info = elem->dr_info;
1482 min->peel_info.npeel = elem->npeel;
1483 min->peel_info.count = elem->count;
1486 return 1;
1490 /* Choose best peeling option by traversing peeling hash table and either
1491 choosing an option with the lowest cost (if cost model is enabled) or the
1492 option that aligns as many accesses as possible. */
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 loop_vec_info loop_vinfo)
1498 struct _vect_peel_extended_info res;
1500 res.peel_info.dr_info = NULL;
1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1504 res.inside_cost = INT_MAX;
1505 res.outside_cost = INT_MAX;
1506 peeling_htab->traverse <_vect_peel_extended_info *,
1507 vect_peeling_hash_get_lowest_cost> (&res);
1509 else
1511 res.peel_info.count = 0;
1512 peeling_htab->traverse <_vect_peel_extended_info *,
1513 vect_peeling_hash_get_most_frequent> (&res);
1514 res.inside_cost = 0;
1515 res.outside_cost = 0;
1518 return res;
1521 /* Return true if the new peeling NPEEL is supported. */
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525 unsigned npeel)
1527 unsigned i;
1528 struct data_reference *dr = NULL;
1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530 enum dr_alignment_support supportable_dr_alignment;
1532 /* Ensure that all data refs can be vectorized after the peel. */
1533 FOR_EACH_VEC_ELT (datarefs, i, dr)
1535 int save_misalignment;
1537 if (dr == dr0_info->dr)
1538 continue;
1540 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541 stmt_vec_info stmt_info = dr_info->stmt;
1542 /* For interleaving, only the alignment of the first access
1543 matters. */
1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 continue;
1548 /* Strided accesses perform only component accesses, alignment is
1549 irrelevant for them. */
1550 if (STMT_VINFO_STRIDED_P (stmt_info)
1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 continue;
1554 save_misalignment = DR_MISALIGNMENT (dr_info);
1555 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556 supportable_dr_alignment
1557 = vect_supportable_dr_alignment (dr_info, false);
1558 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1560 if (!supportable_dr_alignment)
1561 return false;
1564 return true;
1567 /* Function vect_enhance_data_refs_alignment
1569 This pass will use loop versioning and loop peeling in order to enhance
1570 the alignment of data references in the loop.
1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573 original loop is to be vectorized. Any other loops that are created by
1574 the transformations performed in this pass - are not supposed to be
1575 vectorized. This restriction will be relaxed.
1577 This pass will require a cost model to guide it whether to apply peeling
1578 or versioning or a combination of the two. For example, the scheme that
1579 intel uses when given a loop with several memory accesses, is as follows:
1580 choose one memory access ('p') which alignment you want to force by doing
1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1582 other accesses are not necessarily aligned, or (2) use loop versioning to
1583 generate one loop in which all accesses are aligned, and another loop in
1584 which only 'p' is necessarily aligned.
1586 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1590 Devising a cost model is the most critical aspect of this work. It will
1591 guide us on which access to peel for, whether to use loop versioning, how
1592 many versions to create, etc. The cost model will probably consist of
1593 generic considerations as well as target specific considerations (on
1594 powerpc for example, misaligned stores are more painful than misaligned
1595 loads).
1597 Here are the general steps involved in alignment enhancements:
1599 -- original loop, before alignment analysis:
1600 for (i=0; i<N; i++){
1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1605 -- After vect_compute_data_refs_alignment:
1606 for (i=0; i<N; i++){
1607 x = q[i]; # DR_MISALIGNMENT(q) = 3
1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1611 -- Possibility 1: we do loop versioning:
1612 if (p is aligned) {
1613 for (i=0; i<N; i++){ # loop 1A
1614 x = q[i]; # DR_MISALIGNMENT(q) = 3
1615 p[i] = y; # DR_MISALIGNMENT(p) = 0
1618 else {
1619 for (i=0; i<N; i++){ # loop 1B
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1625 -- Possibility 2: we do loop peeling:
1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1627 x = q[i];
1628 p[i] = y;
1630 for (i = 3; i < N; i++){ # loop 2A
1631 x = q[i]; # DR_MISALIGNMENT(q) = 0
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1635 -- Possibility 3: combination of loop peeling and versioning:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1640 if (p is aligned) {
1641 for (i = 3; i<N; i++){ # loop 3A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = 0
1646 else {
1647 for (i = 3; i<N; i++){ # loop 3B
1648 x = q[i]; # DR_MISALIGNMENT(q) = 0
1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1653 These loops are later passed to loop_transform to be vectorized. The
1654 vectorizer will use the alignment information to guide the transformation
1655 (whether to generate regular loads/stores, or with special handling for
1656 misalignment). */
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 enum dr_alignment_support supportable_dr_alignment;
1664 dr_vec_info *first_store = NULL;
1665 dr_vec_info *dr0_info = NULL;
1666 struct data_reference *dr;
1667 unsigned int i, j;
1668 bool do_peeling = false;
1669 bool do_versioning = false;
1670 unsigned int npeel = 0;
1671 bool one_misalignment_known = false;
1672 bool one_misalignment_unknown = false;
1673 bool one_dr_unsupportable = false;
1674 dr_vec_info *unsupportable_dr_info = NULL;
1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676 unsigned possible_npeel_number = 1;
1677 tree vectype;
1678 unsigned int mis, same_align_drs_max = 0;
1679 hash_table<peel_info_hasher> peeling_htab (1);
1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1709 /* (1) Peeling to force alignment. */
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1721 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722 stmt_vec_info stmt_info = dr_info->stmt;
1724 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 continue;
1727 /* For interleaving, only the alignment of the first access
1728 matters. */
1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731 continue;
1733 /* For scatter-gather or invariant accesses there is nothing
1734 to enhance. */
1735 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736 || integer_zerop (DR_STEP (dr)))
1737 continue;
1739 /* Strided accesses perform only component accesses, alignment is
1740 irrelevant for them. */
1741 if (STMT_VINFO_STRIDED_P (stmt_info)
1742 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743 continue;
1745 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746 do_peeling = vector_alignment_reachable_p (dr_info);
1747 if (do_peeling)
1749 if (known_alignment_for_access_p (dr_info))
1751 unsigned int npeel_tmp = 0;
1752 bool negative = tree_int_cst_compare (DR_STEP (dr),
1753 size_zero_node) < 0;
1755 vectype = STMT_VINFO_VECTYPE (stmt_info);
1756 /* If known_alignment_for_access_p then we have set
1757 DR_MISALIGNMENT which is only done if we know it at compiler
1758 time, so it is safe to assume target alignment is constant.
1760 unsigned int target_align =
1761 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763 mis = (negative
1764 ? DR_MISALIGNMENT (dr_info)
1765 : -DR_MISALIGNMENT (dr_info));
1766 if (DR_MISALIGNMENT (dr_info) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). Vectorization factor will 8. If both
1773 accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will be aligned also by peeling 5 scalar
1776 iterations, and in that case both accesses will be aligned.
1777 Hence, except for the immediate peeling amount, we also want
1778 to try to add full vector size, while we don't exceed
1779 vectorization factor.
1780 We do this automatically for cost model, since we calculate
1781 cost for every peeling option. */
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr_info) == 0)
1792 possible_npeel_number++;
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr_info, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1804 one_misalignment_known = true;
1806 else
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over load. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0_info
1815 || same_align_drs_max < same_align_drs)
1817 same_align_drs_max = same_align_drs;
1818 dr0_info = dr_info;
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1825 struct loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0_info = dr_info;
1836 one_misalignment_unknown = true;
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1842 one_dr_unsupportable = true;
1843 unsupportable_dr_info = dr_info;
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr_info;
1850 else
1852 if (!aligned_access_p (dr_info))
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1876 if (do_peeling
1877 && one_misalignment_unknown)
1879 /* Check if the target requires to prefer stores over loads, i.e., if
1880 misaligned stores are more expensive than misaligned loads (taking
1881 drs with same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1896 if (first_store)
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1906 else
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1916 dr0_info = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1920 else
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1943 peel_for_unknown_alignment.peel_info.npeel = 0;
1944 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1946 best_peel = peel_for_unknown_alignment;
1948 peel_for_known_alignment.inside_cost = INT_MAX;
1949 peel_for_known_alignment.outside_cost = INT_MAX;
1950 peel_for_known_alignment.peel_info.count = 0;
1951 peel_for_known_alignment.peel_info.dr_info = NULL;
1953 if (do_peeling && one_misalignment_known)
1955 /* Peeling is possible, but there is no data access that is not supported
1956 unless aligned. So we try to choose the best possible peeling from
1957 the hash table. */
1958 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959 (&peeling_htab, loop_vinfo);
1962 /* Compare costs of peeling for known and unknown alignment. */
1963 if (peel_for_known_alignment.peel_info.dr_info != NULL
1964 && peel_for_unknown_alignment.inside_cost
1965 >= peel_for_known_alignment.inside_cost)
1967 best_peel = peel_for_known_alignment;
1969 /* If the best peeling for known alignment has NPEEL == 0, perform no
1970 peeling at all except if there is an unsupportable dr that we can
1971 align. */
1972 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973 do_peeling = false;
1976 /* If there is an unsupportable data ref, prefer this over all choices so far
1977 since we'd have to discard a chosen peeling except when it accidentally
1978 aligned the unsupportable data ref. */
1979 if (one_dr_unsupportable)
1980 dr0_info = unsupportable_dr_info;
1981 else if (do_peeling)
1983 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984 TODO: Use nopeel_outside_cost or get rid of it? */
1985 unsigned nopeel_inside_cost = 0;
1986 unsigned nopeel_outside_cost = 0;
1988 stmt_vector_for_cost dummy;
1989 dummy.create (2);
1990 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991 &nopeel_outside_cost, &dummy, &dummy,
1992 0, false);
1993 dummy.release ();
1995 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1996 costs will be recorded. */
1997 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998 prologue_cost_vec.create (2);
1999 epilogue_cost_vec.create (2);
2001 int dummy2;
2002 nopeel_outside_cost += vect_get_known_peeling_cost
2003 (loop_vinfo, 0, &dummy2,
2004 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005 &prologue_cost_vec, &epilogue_cost_vec);
2007 prologue_cost_vec.release ();
2008 epilogue_cost_vec.release ();
2010 npeel = best_peel.peel_info.npeel;
2011 dr0_info = best_peel.peel_info.dr_info;
2013 /* If no peeling is not more expensive than the best peeling we
2014 have so far, don't perform any peeling. */
2015 if (nopeel_inside_cost <= best_peel.inside_cost)
2016 do_peeling = false;
2019 if (do_peeling)
2021 stmt_vec_info stmt_info = dr0_info->stmt;
2022 vectype = STMT_VINFO_VECTYPE (stmt_info);
2024 if (known_alignment_for_access_p (dr0_info))
2026 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027 size_zero_node) < 0;
2028 if (!npeel)
2030 /* Since it's known at compile time, compute the number of
2031 iterations in the peeled loop (the peeling factor) for use in
2032 updating DR_MISALIGNMENT values. The peeling factor is the
2033 vectorization factor minus the misalignment as an element
2034 count. */
2035 mis = (negative
2036 ? DR_MISALIGNMENT (dr0_info)
2037 : -DR_MISALIGNMENT (dr0_info));
2038 /* If known_alignment_for_access_p then we have set
2039 DR_MISALIGNMENT which is only done if we know it at compiler
2040 time, so it is safe to assume target alignment is constant.
2042 unsigned int target_align =
2043 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044 npeel = ((mis & (target_align - 1))
2045 / vect_get_scalar_dr_size (dr0_info));
2048 /* For interleaved data access every iteration accesses all the
2049 members of the group, therefore we divide the number of iterations
2050 by the group size. */
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 npeel /= DR_GROUP_SIZE (stmt_info);
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "Try peeling by %d\n", npeel);
2059 /* Ensure that all datarefs can be vectorized after the peel. */
2060 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061 do_peeling = false;
2063 /* Check if all datarefs are supportable and log. */
2064 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2066 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067 if (!stat)
2068 do_peeling = false;
2069 else
2070 return stat;
2073 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2074 if (do_peeling)
2076 unsigned max_allowed_peel
2077 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078 if (max_allowed_peel != (unsigned)-1)
2080 unsigned max_peel = npeel;
2081 if (max_peel == 0)
2083 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2084 unsigned HOST_WIDE_INT target_align_c;
2085 if (target_align.is_constant (&target_align_c))
2086 max_peel =
2087 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2088 else
2090 do_peeling = false;
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "Disable peeling, max peels set and vector"
2094 " alignment unknown\n");
2097 if (max_peel > max_allowed_peel)
2099 do_peeling = false;
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "Disable peeling, max peels reached: %d\n", max_peel);
2107 /* Cost model #2 - if peeling may result in a remaining loop not
2108 iterating enough to be vectorized then do not peel. Since this
2109 is a cost heuristic rather than a correctness decision, use the
2110 most likely runtime value for variable vectorization factors. */
2111 if (do_peeling
2112 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2114 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2115 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2116 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2117 < assumed_vf + max_peel)
2118 do_peeling = false;
2121 if (do_peeling)
2123 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2124 If the misalignment of DR_i is identical to that of dr0 then set
2125 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2126 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2127 by the peeling factor times the element size of DR_i (MOD the
2128 vectorization factor times the size). Otherwise, the
2129 misalignment of DR_i must be set to unknown. */
2130 FOR_EACH_VEC_ELT (datarefs, i, dr)
2131 if (dr != dr0_info->dr)
2133 /* Strided accesses perform only component accesses, alignment
2134 is irrelevant for them. */
2135 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2136 stmt_info = dr_info->stmt;
2137 if (STMT_VINFO_STRIDED_P (stmt_info)
2138 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2139 continue;
2141 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2144 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2145 if (npeel)
2146 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2147 else
2148 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149 = DR_MISALIGNMENT (dr0_info);
2150 SET_DR_MISALIGNMENT (dr0_info, 0);
2151 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_NOTE, vect_location,
2154 "Alignment of access forced using peeling.\n");
2155 dump_printf_loc (MSG_NOTE, vect_location,
2156 "Peeling for alignment will be applied.\n");
2159 /* The inside-loop cost will be accounted for in vectorizable_load
2160 and vectorizable_store correctly with adjusted alignments.
2161 Drop the body_cst_vec on the floor here. */
2162 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2163 gcc_assert (stat);
2164 return stat;
2168 /* (2) Versioning to force alignment. */
2170 /* Try versioning if:
2171 1) optimize loop for speed
2172 2) there is at least one unsupported misaligned data ref with an unknown
2173 misalignment, and
2174 3) all misaligned data refs with a known misalignment are supported, and
2175 4) the number of runtime alignment checks is within reason. */
2177 do_versioning =
2178 optimize_loop_nest_for_speed_p (loop)
2179 && (!loop->inner); /* FORNOW */
2181 if (do_versioning)
2183 FOR_EACH_VEC_ELT (datarefs, i, dr)
2185 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2186 stmt_vec_info stmt_info = dr_info->stmt;
2188 /* For interleaving, only the alignment of the first access
2189 matters. */
2190 if (aligned_access_p (dr_info)
2191 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2192 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2193 continue;
2195 if (STMT_VINFO_STRIDED_P (stmt_info))
2197 /* Strided loads perform only component accesses, alignment is
2198 irrelevant for them. */
2199 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2200 continue;
2201 do_versioning = false;
2202 break;
2205 supportable_dr_alignment
2206 = vect_supportable_dr_alignment (dr_info, false);
2208 if (!supportable_dr_alignment)
2210 int mask;
2211 tree vectype;
2213 if (known_alignment_for_access_p (dr_info)
2214 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2215 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2217 do_versioning = false;
2218 break;
2221 vectype = STMT_VINFO_VECTYPE (stmt_info);
2222 gcc_assert (vectype);
2224 /* At present we don't support versioning for alignment
2225 with variable VF, since there's no guarantee that the
2226 VF is a power of two. We could relax this if we added
2227 a way of enforcing a power-of-two size. */
2228 unsigned HOST_WIDE_INT size;
2229 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2231 do_versioning = false;
2232 break;
2235 /* Forcing alignment in the first iteration is no good if
2236 we don't keep it across iterations. For now, just disable
2237 versioning in this case.
2238 ?? We could actually unroll the loop to achieve the required
2239 overall step alignment, and forcing the alignment could be
2240 done by doing some iterations of the non-vectorized loop. */
2241 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2242 * DR_STEP_ALIGNMENT (dr),
2243 DR_TARGET_ALIGNMENT (dr_info)))
2245 do_versioning = false;
2246 break;
2249 /* The rightmost bits of an aligned address must be zeros.
2250 Construct the mask needed for this test. For example,
2251 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2252 mask must be 15 = 0xf. */
2253 mask = size - 1;
2255 /* FORNOW: use the same mask to test all potentially unaligned
2256 references in the loop. The vectorizer currently supports
2257 a single vector size, see the reference to
2258 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2259 vectorization factor is computed. */
2260 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2261 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2262 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2263 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2267 /* Versioning requires at least one misaligned data reference. */
2268 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2269 do_versioning = false;
2270 else if (!do_versioning)
2271 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2274 if (do_versioning)
2276 vec<stmt_vec_info> may_misalign_stmts
2277 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2278 stmt_vec_info stmt_info;
2280 /* It can now be assumed that the data references in the statements
2281 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2282 of the loop being vectorized. */
2283 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2285 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2286 SET_DR_MISALIGNMENT (dr_info, 0);
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_NOTE, vect_location,
2289 "Alignment of access forced using versioning.\n");
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "Versioning for alignment will be applied.\n");
2296 /* Peeling and versioning can't be done together at this time. */
2297 gcc_assert (! (do_peeling && do_versioning));
2299 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2300 gcc_assert (stat);
2301 return stat;
2304 /* This point is reached if neither peeling nor versioning is being done. */
2305 gcc_assert (! (do_peeling || do_versioning));
2307 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2308 return stat;
2312 /* Function vect_find_same_alignment_drs.
2314 Update group and alignment relations in VINFO according to the chosen
2315 vectorization factor. */
2317 static void
2318 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2320 struct data_reference *dra = DDR_A (ddr);
2321 struct data_reference *drb = DDR_B (ddr);
2322 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2323 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2324 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2325 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2327 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2328 return;
2330 if (dra == drb)
2331 return;
2333 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2334 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2335 return;
2337 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2338 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2339 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2340 return;
2342 /* Two references with distance zero have the same alignment. */
2343 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2344 - wi::to_poly_offset (DR_INIT (drb)));
2345 if (maybe_ne (diff, 0))
2347 /* Get the wider of the two alignments. */
2348 poly_uint64 align_a =
2349 exact_div (vect_calculate_target_alignment (dr_info_a),
2350 BITS_PER_UNIT);
2351 poly_uint64 align_b =
2352 exact_div (vect_calculate_target_alignment (dr_info_b),
2353 BITS_PER_UNIT);
2354 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2355 if (!align_a.is_constant (&align_a_c)
2356 || !align_b.is_constant (&align_b_c))
2357 return;
2359 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2361 /* Require the gap to be a multiple of the larger vector alignment. */
2362 if (!multiple_p (diff, max_align))
2363 return;
2366 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2367 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE, vect_location,
2370 "accesses have the same alignment: %T and %T\n",
2371 DR_REF (dra), DR_REF (drb));
2375 /* Function vect_analyze_data_refs_alignment
2377 Analyze the alignment of the data-references in the loop.
2378 Return FALSE if a data reference is found that cannot be vectorized. */
2380 opt_result
2381 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2383 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2385 /* Mark groups of data references with same alignment using
2386 data dependence information. */
2387 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2388 struct data_dependence_relation *ddr;
2389 unsigned int i;
2391 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2392 vect_find_same_alignment_drs (vinfo, ddr);
2394 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2395 struct data_reference *dr;
2397 vect_record_base_alignments (vinfo);
2398 FOR_EACH_VEC_ELT (datarefs, i, dr)
2400 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2401 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2402 vect_compute_data_ref_alignment (dr_info);
2405 return opt_result::success ();
2409 /* Analyze alignment of DRs of stmts in NODE. */
2411 static bool
2412 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2414 /* We vectorize from the first scalar stmt in the node unless
2415 the node is permuted in which case we start from the first
2416 element in the group. */
2417 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2418 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2419 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2420 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2422 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2423 vect_compute_data_ref_alignment (dr_info);
2424 /* For creating the data-ref pointer we need alignment of the
2425 first element anyway. */
2426 if (dr_info != first_dr_info)
2427 vect_compute_data_ref_alignment (first_dr_info);
2428 if (! verify_data_ref_alignment (dr_info))
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "not vectorized: bad data alignment in basic "
2433 "block.\n");
2434 return false;
2437 return true;
2440 /* Function vect_slp_analyze_instance_alignment
2442 Analyze the alignment of the data-references in the SLP instance.
2443 Return FALSE if a data reference is found that cannot be vectorized. */
2445 bool
2446 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2448 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2450 slp_tree node;
2451 unsigned i;
2452 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2453 if (! vect_slp_analyze_and_verify_node_alignment (node))
2454 return false;
2456 node = SLP_INSTANCE_TREE (instance);
2457 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2458 && ! vect_slp_analyze_and_verify_node_alignment
2459 (SLP_INSTANCE_TREE (instance)))
2460 return false;
2462 return true;
2466 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2467 accesses of legal size, step, etc. Detect gaps, single element
2468 interleaving, and other special cases. Set grouped access info.
2469 Collect groups of strided stores for further use in SLP analysis.
2470 Worker for vect_analyze_group_access. */
2472 static bool
2473 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2475 data_reference *dr = dr_info->dr;
2476 tree step = DR_STEP (dr);
2477 tree scalar_type = TREE_TYPE (DR_REF (dr));
2478 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2479 stmt_vec_info stmt_info = dr_info->stmt;
2480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2481 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2482 HOST_WIDE_INT dr_step = -1;
2483 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2484 bool slp_impossible = false;
2486 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2487 size of the interleaving group (including gaps). */
2488 if (tree_fits_shwi_p (step))
2490 dr_step = tree_to_shwi (step);
2491 /* Check that STEP is a multiple of type size. Otherwise there is
2492 a non-element-sized gap at the end of the group which we
2493 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2494 ??? As we can handle non-constant step fine here we should
2495 simply remove uses of DR_GROUP_GAP between the last and first
2496 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2497 simply not include that gap. */
2498 if ((dr_step % type_size) != 0)
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_NOTE, vect_location,
2502 "Step %T is not a multiple of the element size"
2503 " for %T\n",
2504 step, DR_REF (dr));
2505 return false;
2507 groupsize = absu_hwi (dr_step) / type_size;
2509 else
2510 groupsize = 0;
2512 /* Not consecutive access is possible only if it is a part of interleaving. */
2513 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2515 /* Check if it this DR is a part of interleaving, and is a single
2516 element of the group that is accessed in the loop. */
2518 /* Gaps are supported only for loads. STEP must be a multiple of the type
2519 size. */
2520 if (DR_IS_READ (dr)
2521 && (dr_step % type_size) == 0
2522 && groupsize > 0)
2524 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2525 DR_GROUP_SIZE (stmt_info) = groupsize;
2526 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2527 if (dump_enabled_p ())
2528 dump_printf_loc (MSG_NOTE, vect_location,
2529 "Detected single element interleaving %T"
2530 " step %T\n",
2531 DR_REF (dr), step);
2533 return true;
2536 if (dump_enabled_p ())
2537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2538 "not consecutive access %G", stmt_info->stmt);
2540 if (bb_vinfo)
2542 /* Mark the statement as unvectorizable. */
2543 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2544 return true;
2547 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2549 STMT_VINFO_STRIDED_P (stmt_info) = true;
2550 return true;
2553 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2555 /* First stmt in the interleaving chain. Check the chain. */
2556 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2557 struct data_reference *data_ref = dr;
2558 unsigned int count = 1;
2559 tree prev_init = DR_INIT (data_ref);
2560 HOST_WIDE_INT diff, gaps = 0;
2562 /* By construction, all group members have INTEGER_CST DR_INITs. */
2563 while (next)
2565 /* We never have the same DR multiple times. */
2566 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2567 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2569 data_ref = STMT_VINFO_DATA_REF (next);
2571 /* All group members have the same STEP by construction. */
2572 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2574 /* Check that the distance between two accesses is equal to the type
2575 size. Otherwise, we have gaps. */
2576 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577 - TREE_INT_CST_LOW (prev_init)) / type_size;
2578 if (diff != 1)
2580 /* FORNOW: SLP of accesses with gaps is not supported. */
2581 slp_impossible = true;
2582 if (DR_IS_WRITE (data_ref))
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586 "interleaved store with gaps\n");
2587 return false;
2590 gaps += diff - 1;
2593 last_accessed_element += diff;
2595 /* Store the gap from the previous member of the group. If there is no
2596 gap in the access, DR_GROUP_GAP is always 1. */
2597 DR_GROUP_GAP (next) = diff;
2599 prev_init = DR_INIT (data_ref);
2600 next = DR_GROUP_NEXT_ELEMENT (next);
2601 /* Count the number of data-refs in the chain. */
2602 count++;
2605 if (groupsize == 0)
2606 groupsize = count + gaps;
2608 /* This could be UINT_MAX but as we are generating code in a very
2609 inefficient way we have to cap earlier. See PR78699 for example. */
2610 if (groupsize > 4096)
2612 if (dump_enabled_p ())
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "group is too large\n");
2615 return false;
2618 /* Check that the size of the interleaving is equal to count for stores,
2619 i.e., that there are no gaps. */
2620 if (groupsize != count
2621 && !DR_IS_READ (dr))
2623 groupsize = count;
2624 STMT_VINFO_STRIDED_P (stmt_info) = true;
2627 /* If there is a gap after the last load in the group it is the
2628 difference between the groupsize and the last accessed
2629 element.
2630 When there is no gap, this difference should be 0. */
2631 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2633 DR_GROUP_SIZE (stmt_info) = groupsize;
2634 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_NOTE, vect_location,
2637 "Detected interleaving ");
2638 if (DR_IS_READ (dr))
2639 dump_printf (MSG_NOTE, "load ");
2640 else if (STMT_VINFO_STRIDED_P (stmt_info))
2641 dump_printf (MSG_NOTE, "strided store ");
2642 else
2643 dump_printf (MSG_NOTE, "store ");
2644 dump_printf (MSG_NOTE, "of size %u\n",
2645 (unsigned)groupsize);
2646 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2647 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2648 while (next)
2650 if (DR_GROUP_GAP (next) != 1)
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "\t<gap of %d elements>\n",
2653 DR_GROUP_GAP (next) - 1);
2654 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2655 next = DR_GROUP_NEXT_ELEMENT (next);
2657 if (DR_GROUP_GAP (stmt_info) != 0)
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "\t<gap of %d elements>\n",
2660 DR_GROUP_GAP (stmt_info));
2663 /* SLP: create an SLP data structure for every interleaving group of
2664 stores for further analysis in vect_analyse_slp. */
2665 if (DR_IS_WRITE (dr) && !slp_impossible)
2667 if (loop_vinfo)
2668 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2669 if (bb_vinfo)
2670 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2674 return true;
2677 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2678 accesses of legal size, step, etc. Detect gaps, single element
2679 interleaving, and other special cases. Set grouped access info.
2680 Collect groups of strided stores for further use in SLP analysis. */
2682 static bool
2683 vect_analyze_group_access (dr_vec_info *dr_info)
2685 if (!vect_analyze_group_access_1 (dr_info))
2687 /* Dissolve the group if present. */
2688 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2689 while (stmt_info)
2691 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2692 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2693 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2694 stmt_info = next;
2696 return false;
2698 return true;
2701 /* Analyze the access pattern of the data-reference DR_INFO.
2702 In case of non-consecutive accesses call vect_analyze_group_access() to
2703 analyze groups of accesses. */
2705 static bool
2706 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2708 data_reference *dr = dr_info->dr;
2709 tree step = DR_STEP (dr);
2710 tree scalar_type = TREE_TYPE (DR_REF (dr));
2711 stmt_vec_info stmt_info = dr_info->stmt;
2712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2713 struct loop *loop = NULL;
2715 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2716 return true;
2718 if (loop_vinfo)
2719 loop = LOOP_VINFO_LOOP (loop_vinfo);
2721 if (loop_vinfo && !step)
2723 if (dump_enabled_p ())
2724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2725 "bad data-ref access in loop\n");
2726 return false;
2729 /* Allow loads with zero step in inner-loop vectorization. */
2730 if (loop_vinfo && integer_zerop (step))
2732 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733 if (!nested_in_vect_loop_p (loop, stmt_info))
2734 return DR_IS_READ (dr);
2735 /* Allow references with zero step for outer loops marked
2736 with pragma omp simd only - it guarantees absence of
2737 loop-carried dependencies between inner loop iterations. */
2738 if (loop->safelen < 2)
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_NOTE, vect_location,
2742 "zero step in inner loop of nest\n");
2743 return false;
2747 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2749 /* Interleaved accesses are not yet supported within outer-loop
2750 vectorization for references in the inner-loop. */
2751 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2753 /* For the rest of the analysis we use the outer-loop step. */
2754 step = STMT_VINFO_DR_STEP (stmt_info);
2755 if (integer_zerop (step))
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "zero step in outer loop.\n");
2760 return DR_IS_READ (dr);
2764 /* Consecutive? */
2765 if (TREE_CODE (step) == INTEGER_CST)
2767 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2768 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2769 || (dr_step < 0
2770 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2772 /* Mark that it is not interleaving. */
2773 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2774 return true;
2778 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_NOTE, vect_location,
2782 "grouped access in outer loop.\n");
2783 return false;
2787 /* Assume this is a DR handled by non-constant strided load case. */
2788 if (TREE_CODE (step) != INTEGER_CST)
2789 return (STMT_VINFO_STRIDED_P (stmt_info)
2790 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2791 || vect_analyze_group_access (dr_info)));
2793 /* Not consecutive access - check if it's a part of interleaving group. */
2794 return vect_analyze_group_access (dr_info);
2797 /* Compare two data-references DRA and DRB to group them into chunks
2798 suitable for grouping. */
2800 static int
2801 dr_group_sort_cmp (const void *dra_, const void *drb_)
2803 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2804 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2805 int cmp;
2807 /* Stabilize sort. */
2808 if (dra == drb)
2809 return 0;
2811 /* DRs in different loops never belong to the same group. */
2812 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2813 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2814 if (loopa != loopb)
2815 return loopa->num < loopb->num ? -1 : 1;
2817 /* Ordering of DRs according to base. */
2818 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2819 DR_BASE_ADDRESS (drb));
2820 if (cmp != 0)
2821 return cmp;
2823 /* And according to DR_OFFSET. */
2824 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2825 if (cmp != 0)
2826 return cmp;
2828 /* Put reads before writes. */
2829 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2830 return DR_IS_READ (dra) ? -1 : 1;
2832 /* Then sort after access size. */
2833 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2834 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2835 if (cmp != 0)
2836 return cmp;
2838 /* And after step. */
2839 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2840 if (cmp != 0)
2841 return cmp;
2843 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2844 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2845 if (cmp == 0)
2846 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2847 return cmp;
2850 /* If OP is the result of a conversion, return the unconverted value,
2851 otherwise return null. */
2853 static tree
2854 strip_conversion (tree op)
2856 if (TREE_CODE (op) != SSA_NAME)
2857 return NULL_TREE;
2858 gimple *stmt = SSA_NAME_DEF_STMT (op);
2859 if (!is_gimple_assign (stmt)
2860 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2861 return NULL_TREE;
2862 return gimple_assign_rhs1 (stmt);
2865 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2866 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2867 be grouped in SLP mode. */
2869 static bool
2870 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2871 bool allow_slp_p)
2873 if (gimple_assign_single_p (stmt1_info->stmt))
2874 return gimple_assign_single_p (stmt2_info->stmt);
2876 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2877 if (call1 && gimple_call_internal_p (call1))
2879 /* Check for two masked loads or two masked stores. */
2880 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2881 if (!call2 || !gimple_call_internal_p (call2))
2882 return false;
2883 internal_fn ifn = gimple_call_internal_fn (call1);
2884 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885 return false;
2886 if (ifn != gimple_call_internal_fn (call2))
2887 return false;
2889 /* Check that the masks are the same. Cope with casts of masks,
2890 like those created by build_mask_conversion. */
2891 tree mask1 = gimple_call_arg (call1, 2);
2892 tree mask2 = gimple_call_arg (call2, 2);
2893 if (!operand_equal_p (mask1, mask2, 0)
2894 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2896 mask1 = strip_conversion (mask1);
2897 if (!mask1)
2898 return false;
2899 mask2 = strip_conversion (mask2);
2900 if (!mask2)
2901 return false;
2902 if (!operand_equal_p (mask1, mask2, 0))
2903 return false;
2905 return true;
2908 return false;
2911 /* Function vect_analyze_data_ref_accesses.
2913 Analyze the access pattern of all the data references in the loop.
2915 FORNOW: the only access pattern that is considered vectorizable is a
2916 simple step 1 (consecutive) access.
2918 FORNOW: handle only arrays and pointer accesses. */
2920 opt_result
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2923 unsigned int i;
2924 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925 struct data_reference *dr;
2927 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2929 if (datarefs.is_empty ())
2930 return opt_result::success ();
2932 /* Sort the array of datarefs to make building the interleaving chains
2933 linear. Don't modify the original vector's order, it is needed for
2934 determining what dependencies are reversed. */
2935 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936 datarefs_copy.qsort (dr_group_sort_cmp);
2937 hash_set<stmt_vec_info> to_fixup;
2939 /* Build the interleaving chains. */
2940 for (i = 0; i < datarefs_copy.length () - 1;)
2942 data_reference_p dra = datarefs_copy[i];
2943 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2944 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2945 stmt_vec_info lastinfo = NULL;
2946 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2947 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2949 ++i;
2950 continue;
2952 for (i = i + 1; i < datarefs_copy.length (); ++i)
2954 data_reference_p drb = datarefs_copy[i];
2955 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2956 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2957 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2958 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2959 break;
2961 /* ??? Imperfect sorting (non-compatible types, non-modulo
2962 accesses, same accesses) can lead to a group to be artificially
2963 split here as we don't just skip over those. If it really
2964 matters we can push those to a worklist and re-iterate
2965 over them. The we can just skip ahead to the next DR here. */
2967 /* DRs in a different loop should not be put into the same
2968 interleaving group. */
2969 if (gimple_bb (DR_STMT (dra))->loop_father
2970 != gimple_bb (DR_STMT (drb))->loop_father)
2971 break;
2973 /* Check that the data-refs have same first location (except init)
2974 and they are both either store or load (not load and store,
2975 not masked loads or stores). */
2976 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2977 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2978 DR_BASE_ADDRESS (drb)) != 0
2979 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2980 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2981 break;
2983 /* Check that the data-refs have the same constant size. */
2984 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2985 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2986 if (!tree_fits_uhwi_p (sza)
2987 || !tree_fits_uhwi_p (szb)
2988 || !tree_int_cst_equal (sza, szb))
2989 break;
2991 /* Check that the data-refs have the same step. */
2992 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2993 break;
2995 /* Check the types are compatible.
2996 ??? We don't distinguish this during sorting. */
2997 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2998 TREE_TYPE (DR_REF (drb))))
2999 break;
3001 /* Check that the DR_INITs are compile-time constants. */
3002 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3003 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3004 break;
3006 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3007 just hold extra information. */
3008 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3009 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3010 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3011 break;
3013 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3014 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3015 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3016 HOST_WIDE_INT init_prev
3017 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3018 gcc_assert (init_a <= init_b
3019 && init_a <= init_prev
3020 && init_prev <= init_b);
3022 /* Do not place the same access in the interleaving chain twice. */
3023 if (init_b == init_prev)
3025 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3026 < gimple_uid (DR_STMT (drb)));
3027 /* Simply link in duplicates and fix up the chain below. */
3029 else
3031 /* If init_b == init_a + the size of the type * k, we have an
3032 interleaving, and DRA is accessed before DRB. */
3033 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3034 if (type_size_a == 0
3035 || (init_b - init_a) % type_size_a != 0)
3036 break;
3038 /* If we have a store, the accesses are adjacent. This splits
3039 groups into chunks we support (we don't support vectorization
3040 of stores with gaps). */
3041 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3042 break;
3044 /* If the step (if not zero or non-constant) is greater than the
3045 difference between data-refs' inits this splits groups into
3046 suitable sizes. */
3047 if (tree_fits_shwi_p (DR_STEP (dra)))
3049 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3050 if (step != 0 && step <= (init_b - init_a))
3051 break;
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_NOTE, vect_location,
3057 DR_IS_READ (dra)
3058 ? "Detected interleaving load %T and %T\n"
3059 : "Detected interleaving store %T and %T\n",
3060 DR_REF (dra), DR_REF (drb));
3062 /* Link the found element into the group list. */
3063 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3065 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3066 lastinfo = stmtinfo_a;
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3069 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3070 lastinfo = stmtinfo_b;
3072 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3073 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3075 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3076 dump_printf_loc (MSG_NOTE, vect_location,
3077 "Load suitable for SLP vectorization only.\n");
3079 if (init_b == init_prev
3080 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3081 && dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 "Queuing group with duplicate access for fixup\n");
3087 /* Fixup groups with duplicate entries by splitting it. */
3088 while (1)
3090 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3091 if (!(it != to_fixup.end ()))
3092 break;
3093 stmt_vec_info grp = *it;
3094 to_fixup.remove (grp);
3096 /* Find the earliest duplicate group member. */
3097 unsigned first_duplicate = -1u;
3098 stmt_vec_info next, g = grp;
3099 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3101 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3102 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3103 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3104 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3105 g = next;
3107 if (first_duplicate == -1U)
3108 continue;
3110 /* Then move all stmts after the first duplicate to a new group.
3111 Note this is a heuristic but one with the property that *it
3112 is fixed up completely. */
3113 g = grp;
3114 stmt_vec_info newgroup = NULL, ng = grp;
3115 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3117 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3119 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3120 if (!newgroup)
3121 newgroup = next;
3122 else
3123 DR_GROUP_NEXT_ELEMENT (ng) = next;
3124 ng = next;
3125 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3127 else
3128 g = DR_GROUP_NEXT_ELEMENT (g);
3130 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3132 /* Fixup the new group which still may contain duplicates. */
3133 to_fixup.add (newgroup);
3136 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3138 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3139 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3140 && !vect_analyze_data_ref_access (dr_info))
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144 "not vectorized: complicated access pattern.\n");
3146 if (is_a <bb_vec_info> (vinfo))
3148 /* Mark the statement as not vectorizable. */
3149 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3150 continue;
3152 else
3154 datarefs_copy.release ();
3155 return opt_result::failure_at (dr_info->stmt->stmt,
3156 "not vectorized:"
3157 " complicated access pattern.\n");
3162 datarefs_copy.release ();
3163 return opt_result::success ();
3166 /* Function vect_vfa_segment_size.
3168 Input:
3169 DR_INFO: The data reference.
3170 LENGTH_FACTOR: segment length to consider.
3172 Return a value suitable for the dr_with_seg_len::seg_len field.
3173 This is the "distance travelled" by the pointer from the first
3174 iteration in the segment to the last. Note that it does not include
3175 the size of the access; in effect it only describes the first byte. */
3177 static tree
3178 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3180 length_factor = size_binop (MINUS_EXPR,
3181 fold_convert (sizetype, length_factor),
3182 size_one_node);
3183 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3184 length_factor);
3187 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3188 gives the worst-case number of bytes covered by the segment. */
3190 static unsigned HOST_WIDE_INT
3191 vect_vfa_access_size (dr_vec_info *dr_info)
3193 stmt_vec_info stmt_vinfo = dr_info->stmt;
3194 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3195 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3196 unsigned HOST_WIDE_INT access_size = ref_size;
3197 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3199 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3200 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3202 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3203 && (vect_supportable_dr_alignment (dr_info, false)
3204 == dr_explicit_realign_optimized))
3206 /* We might access a full vector's worth. */
3207 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3208 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3210 return access_size;
3213 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3214 describes. */
3216 static unsigned int
3217 vect_vfa_align (dr_vec_info *dr_info)
3219 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3222 /* Function vect_no_alias_p.
3224 Given data references A and B with equal base and offset, see whether
3225 the alias relation can be decided at compilation time. Return 1 if
3226 it can and the references alias, 0 if it can and the references do
3227 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3228 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3229 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3231 static int
3232 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3233 tree segment_length_a, tree segment_length_b,
3234 unsigned HOST_WIDE_INT access_size_a,
3235 unsigned HOST_WIDE_INT access_size_b)
3237 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3238 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3239 poly_uint64 const_length_a;
3240 poly_uint64 const_length_b;
3242 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3243 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3244 [a, a+12) */
3245 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3247 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3248 offset_a = (offset_a + access_size_a) - const_length_a;
3250 else
3251 const_length_a = tree_to_poly_uint64 (segment_length_a);
3252 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3254 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3255 offset_b = (offset_b + access_size_b) - const_length_b;
3257 else
3258 const_length_b = tree_to_poly_uint64 (segment_length_b);
3260 const_length_a += access_size_a;
3261 const_length_b += access_size_b;
3263 if (ranges_known_overlap_p (offset_a, const_length_a,
3264 offset_b, const_length_b))
3265 return 1;
3267 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3268 offset_b, const_length_b))
3269 return 0;
3271 return -1;
3274 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3275 in DDR is >= VF. */
3277 static bool
3278 dependence_distance_ge_vf (data_dependence_relation *ddr,
3279 unsigned int loop_depth, poly_uint64 vf)
3281 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3282 || DDR_NUM_DIST_VECTS (ddr) == 0)
3283 return false;
3285 /* If the dependence is exact, we should have limited the VF instead. */
3286 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3288 unsigned int i;
3289 lambda_vector dist_v;
3290 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3292 HOST_WIDE_INT dist = dist_v[loop_depth];
3293 if (dist != 0
3294 && !(dist > 0 && DDR_REVERSED_P (ddr))
3295 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3296 return false;
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "dependence distance between %T and %T is >= VF\n",
3302 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3304 return true;
3307 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3309 static void
3310 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3312 dump_printf (dump_kind, "%s (%T) >= ",
3313 lower_bound.unsigned_p ? "unsigned" : "abs",
3314 lower_bound.expr);
3315 dump_dec (dump_kind, lower_bound.min_value);
3318 /* Record that the vectorized loop requires the vec_lower_bound described
3319 by EXPR, UNSIGNED_P and MIN_VALUE. */
3321 static void
3322 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3323 poly_uint64 min_value)
3325 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3326 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3327 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3329 unsigned_p &= lower_bounds[i].unsigned_p;
3330 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3331 if (lower_bounds[i].unsigned_p != unsigned_p
3332 || maybe_lt (lower_bounds[i].min_value, min_value))
3334 lower_bounds[i].unsigned_p = unsigned_p;
3335 lower_bounds[i].min_value = min_value;
3336 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "updating run-time check to ");
3340 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3341 dump_printf (MSG_NOTE, "\n");
3344 return;
3347 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3348 if (dump_enabled_p ())
3350 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3351 dump_lower_bound (MSG_NOTE, lower_bound);
3352 dump_printf (MSG_NOTE, "\n");
3354 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3357 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3358 will span fewer than GAP bytes. */
3360 static bool
3361 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3362 poly_int64 gap)
3364 stmt_vec_info stmt_info = dr_info->stmt;
3365 HOST_WIDE_INT count
3366 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3367 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3368 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3369 return (estimated_poly_value (gap)
3370 <= count * vect_get_scalar_dr_size (dr_info));
3373 /* Return true if we know that there is no alias between DR_INFO_A and
3374 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3375 When returning true, set *LOWER_BOUND_OUT to this N. */
3377 static bool
3378 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3379 poly_uint64 *lower_bound_out)
3381 /* Check that there is a constant gap of known sign between DR_A
3382 and DR_B. */
3383 data_reference *dr_a = dr_info_a->dr;
3384 data_reference *dr_b = dr_info_b->dr;
3385 poly_int64 init_a, init_b;
3386 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3387 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3388 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3389 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3390 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3391 || !ordered_p (init_a, init_b))
3392 return false;
3394 /* Sort DR_A and DR_B by the address they access. */
3395 if (maybe_lt (init_b, init_a))
3397 std::swap (init_a, init_b);
3398 std::swap (dr_info_a, dr_info_b);
3399 std::swap (dr_a, dr_b);
3402 /* If the two accesses could be dependent within a scalar iteration,
3403 make sure that we'd retain their order. */
3404 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3405 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3406 return false;
3408 /* There is no alias if abs (DR_STEP) is greater than or equal to
3409 the bytes spanned by the combination of the two accesses. */
3410 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3411 return true;
3414 /* Function vect_prune_runtime_alias_test_list.
3416 Prune a list of ddrs to be tested at run-time by versioning for alias.
3417 Merge several alias checks into one if possible.
3418 Return FALSE if resulting list of ddrs is longer then allowed by
3419 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3421 opt_result
3422 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3424 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3425 hash_set <tree_pair_hash> compared_objects;
3427 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3428 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3429 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3430 vec<vec_object_pair> &check_unequal_addrs
3431 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3432 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3433 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3435 ddr_p ddr;
3436 unsigned int i;
3437 tree length_factor;
3439 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3441 /* Step values are irrelevant for aliasing if the number of vector
3442 iterations is equal to the number of scalar iterations (which can
3443 happen for fully-SLP loops). */
3444 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3446 if (!ignore_step_p)
3448 /* Convert the checks for nonzero steps into bound tests. */
3449 tree value;
3450 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3451 vect_check_lower_bound (loop_vinfo, value, true, 1);
3454 if (may_alias_ddrs.is_empty ())
3455 return opt_result::success ();
3457 comp_alias_ddrs.create (may_alias_ddrs.length ());
3459 unsigned int loop_depth
3460 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3461 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3463 /* First, we collect all data ref pairs for aliasing checks. */
3464 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3466 int comp_res;
3467 poly_uint64 lower_bound;
3468 tree segment_length_a, segment_length_b;
3469 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3470 unsigned int align_a, align_b;
3472 /* Ignore the alias if the VF we chose ended up being no greater
3473 than the dependence distance. */
3474 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3475 continue;
3477 if (DDR_OBJECT_A (ddr))
3479 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3480 if (!compared_objects.add (new_pair))
3482 if (dump_enabled_p ())
3483 dump_printf_loc (MSG_NOTE, vect_location,
3484 "checking that %T and %T"
3485 " have different addresses\n",
3486 new_pair.first, new_pair.second);
3487 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3489 continue;
3492 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3493 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3495 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3496 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3498 /* Skip the pair if inter-iteration dependencies are irrelevant
3499 and intra-iteration dependencies are guaranteed to be honored. */
3500 if (ignore_step_p
3501 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3502 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3503 &lower_bound)))
3505 if (dump_enabled_p ())
3506 dump_printf_loc (MSG_NOTE, vect_location,
3507 "no need for alias check between "
3508 "%T and %T when VF is 1\n",
3509 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3510 continue;
3513 /* See whether we can handle the alias using a bounds check on
3514 the step, and whether that's likely to be the best approach.
3515 (It might not be, for example, if the minimum step is much larger
3516 than the number of bytes handled by one vector iteration.) */
3517 if (!ignore_step_p
3518 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3519 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3520 &lower_bound)
3521 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3522 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3524 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3525 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3528 "%T and %T when the step %T is outside ",
3529 DR_REF (dr_info_a->dr),
3530 DR_REF (dr_info_b->dr),
3531 DR_STEP (dr_info_a->dr));
3532 if (unsigned_p)
3533 dump_printf (MSG_NOTE, "[0");
3534 else
3536 dump_printf (MSG_NOTE, "(");
3537 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3539 dump_printf (MSG_NOTE, ", ");
3540 dump_dec (MSG_NOTE, lower_bound);
3541 dump_printf (MSG_NOTE, ")\n");
3543 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3544 unsigned_p, lower_bound);
3545 continue;
3548 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3549 if (dr_group_first_a)
3551 stmt_info_a = dr_group_first_a;
3552 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3555 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3556 if (dr_group_first_b)
3558 stmt_info_b = dr_group_first_b;
3559 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3562 if (ignore_step_p)
3564 segment_length_a = size_zero_node;
3565 segment_length_b = size_zero_node;
3567 else
3569 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3570 DR_STEP (dr_info_b->dr), 0))
3571 length_factor = scalar_loop_iters;
3572 else
3573 length_factor = size_int (vect_factor);
3574 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3575 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3577 access_size_a = vect_vfa_access_size (dr_info_a);
3578 access_size_b = vect_vfa_access_size (dr_info_b);
3579 align_a = vect_vfa_align (dr_info_a);
3580 align_b = vect_vfa_align (dr_info_b);
3582 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3583 DR_BASE_ADDRESS (dr_info_b->dr));
3584 if (comp_res == 0)
3585 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3586 DR_OFFSET (dr_info_b->dr));
3588 /* See whether the alias is known at compilation time. */
3589 if (comp_res == 0
3590 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3591 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3592 && poly_int_tree_p (segment_length_a)
3593 && poly_int_tree_p (segment_length_b))
3595 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3596 segment_length_a,
3597 segment_length_b,
3598 access_size_a,
3599 access_size_b);
3600 if (res >= 0 && dump_enabled_p ())
3602 dump_printf_loc (MSG_NOTE, vect_location,
3603 "can tell at compile time that %T and %T",
3604 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3605 if (res == 0)
3606 dump_printf (MSG_NOTE, " do not alias\n");
3607 else
3608 dump_printf (MSG_NOTE, " alias\n");
3611 if (res == 0)
3612 continue;
3614 if (res == 1)
3615 return opt_result::failure_at (stmt_info_b->stmt,
3616 "not vectorized:"
3617 " compilation time alias: %G%G",
3618 stmt_info_a->stmt,
3619 stmt_info_b->stmt);
3622 dr_with_seg_len_pair_t dr_with_seg_len_pair
3623 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3624 access_size_a, align_a),
3625 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3626 access_size_b, align_b));
3628 /* Canonicalize pairs by sorting the two DR members. */
3629 if (comp_res > 0)
3630 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3632 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3635 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3637 unsigned int count = (comp_alias_ddrs.length ()
3638 + check_unequal_addrs.length ());
3640 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "improved number of alias checks from %d to %d\n",
3643 may_alias_ddrs.length (), count);
3644 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3645 return opt_result::failure_at
3646 (vect_location,
3647 "number of versioning for alias "
3648 "run-time tests exceeds %d "
3649 "(--param vect-max-version-for-alias-checks)\n",
3650 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3652 return opt_result::success ();
3655 /* Check whether we can use an internal function for a gather load
3656 or scatter store. READ_P is true for loads and false for stores.
3657 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3658 the type of the memory elements being loaded or stored. OFFSET_BITS
3659 is the number of bits in each scalar offset and OFFSET_SIGN is the
3660 sign of the offset. SCALE is the amount by which the offset should
3661 be multiplied *after* it has been converted to address width.
3663 Return true if the function is supported, storing the function
3664 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3666 bool
3667 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3668 tree memory_type, unsigned int offset_bits,
3669 signop offset_sign, int scale,
3670 internal_fn *ifn_out, tree *element_type_out)
3672 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3673 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3674 if (offset_bits > element_bits)
3675 /* Internal functions require the offset to be the same width as
3676 the vector elements. We can extend narrower offsets, but it isn't
3677 safe to truncate wider offsets. */
3678 return false;
3680 if (element_bits != memory_bits)
3681 /* For now the vector elements must be the same width as the
3682 memory elements. */
3683 return false;
3685 /* Work out which function we need. */
3686 internal_fn ifn;
3687 if (read_p)
3688 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3689 else
3690 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3692 /* Test whether the target supports this combination. */
3693 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3694 offset_sign, scale))
3695 return false;
3697 *ifn_out = ifn;
3698 *element_type_out = TREE_TYPE (vectype);
3699 return true;
3702 /* STMT_INFO is a call to an internal gather load or scatter store function.
3703 Describe the operation in INFO. */
3705 static void
3706 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3707 gather_scatter_info *info)
3709 gcall *call = as_a <gcall *> (stmt_info->stmt);
3710 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3711 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3713 info->ifn = gimple_call_internal_fn (call);
3714 info->decl = NULL_TREE;
3715 info->base = gimple_call_arg (call, 0);
3716 info->offset = gimple_call_arg (call, 1);
3717 info->offset_dt = vect_unknown_def_type;
3718 info->offset_vectype = NULL_TREE;
3719 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3720 info->element_type = TREE_TYPE (vectype);
3721 info->memory_type = TREE_TYPE (DR_REF (dr));
3724 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3725 gather load or scatter store. Describe the operation in *INFO if so. */
3727 bool
3728 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3729 gather_scatter_info *info)
3731 HOST_WIDE_INT scale = 1;
3732 poly_int64 pbitpos, pbitsize;
3733 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3734 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3735 tree offtype = NULL_TREE;
3736 tree decl = NULL_TREE, base, off;
3737 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3738 tree memory_type = TREE_TYPE (DR_REF (dr));
3739 machine_mode pmode;
3740 int punsignedp, reversep, pvolatilep = 0;
3741 internal_fn ifn;
3742 tree element_type;
3743 bool masked_p = false;
3745 /* See whether this is already a call to a gather/scatter internal function.
3746 If not, see whether it's a masked load or store. */
3747 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3748 if (call && gimple_call_internal_p (call))
3750 ifn = gimple_call_internal_fn (call);
3751 if (internal_gather_scatter_fn_p (ifn))
3753 vect_describe_gather_scatter_call (stmt_info, info);
3754 return true;
3756 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3759 /* True if we should aim to use internal functions rather than
3760 built-in functions. */
3761 bool use_ifn_p = (DR_IS_READ (dr)
3762 ? supports_vec_gather_load_p ()
3763 : supports_vec_scatter_store_p ());
3765 base = DR_REF (dr);
3766 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3767 see if we can use the def stmt of the address. */
3768 if (masked_p
3769 && TREE_CODE (base) == MEM_REF
3770 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3771 && integer_zerop (TREE_OPERAND (base, 1))
3772 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3774 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3775 if (is_gimple_assign (def_stmt)
3776 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3777 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3780 /* The gather and scatter builtins need address of the form
3781 loop_invariant + vector * {1, 2, 4, 8}
3783 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3784 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3785 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3786 multiplications and additions in it. To get a vector, we need
3787 a single SSA_NAME that will be defined in the loop and will
3788 contain everything that is not loop invariant and that can be
3789 vectorized. The following code attempts to find such a preexistng
3790 SSA_NAME OFF and put the loop invariants into a tree BASE
3791 that can be gimplified before the loop. */
3792 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3793 &punsignedp, &reversep, &pvolatilep);
3794 if (reversep)
3795 return false;
3797 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3799 if (TREE_CODE (base) == MEM_REF)
3801 if (!integer_zerop (TREE_OPERAND (base, 1)))
3803 if (off == NULL_TREE)
3804 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3805 else
3806 off = size_binop (PLUS_EXPR, off,
3807 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3809 base = TREE_OPERAND (base, 0);
3811 else
3812 base = build_fold_addr_expr (base);
3814 if (off == NULL_TREE)
3815 off = size_zero_node;
3817 /* If base is not loop invariant, either off is 0, then we start with just
3818 the constant offset in the loop invariant BASE and continue with base
3819 as OFF, otherwise give up.
3820 We could handle that case by gimplifying the addition of base + off
3821 into some SSA_NAME and use that as off, but for now punt. */
3822 if (!expr_invariant_in_loop_p (loop, base))
3824 if (!integer_zerop (off))
3825 return false;
3826 off = base;
3827 base = size_int (pbytepos);
3829 /* Otherwise put base + constant offset into the loop invariant BASE
3830 and continue with OFF. */
3831 else
3833 base = fold_convert (sizetype, base);
3834 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3837 /* OFF at this point may be either a SSA_NAME or some tree expression
3838 from get_inner_reference. Try to peel off loop invariants from it
3839 into BASE as long as possible. */
3840 STRIP_NOPS (off);
3841 while (offtype == NULL_TREE)
3843 enum tree_code code;
3844 tree op0, op1, add = NULL_TREE;
3846 if (TREE_CODE (off) == SSA_NAME)
3848 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3850 if (expr_invariant_in_loop_p (loop, off))
3851 return false;
3853 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3854 break;
3856 op0 = gimple_assign_rhs1 (def_stmt);
3857 code = gimple_assign_rhs_code (def_stmt);
3858 op1 = gimple_assign_rhs2 (def_stmt);
3860 else
3862 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3863 return false;
3864 code = TREE_CODE (off);
3865 extract_ops_from_tree (off, &code, &op0, &op1);
3867 switch (code)
3869 case POINTER_PLUS_EXPR:
3870 case PLUS_EXPR:
3871 if (expr_invariant_in_loop_p (loop, op0))
3873 add = op0;
3874 off = op1;
3875 do_add:
3876 add = fold_convert (sizetype, add);
3877 if (scale != 1)
3878 add = size_binop (MULT_EXPR, add, size_int (scale));
3879 base = size_binop (PLUS_EXPR, base, add);
3880 continue;
3882 if (expr_invariant_in_loop_p (loop, op1))
3884 add = op1;
3885 off = op0;
3886 goto do_add;
3888 break;
3889 case MINUS_EXPR:
3890 if (expr_invariant_in_loop_p (loop, op1))
3892 add = fold_convert (sizetype, op1);
3893 add = size_binop (MINUS_EXPR, size_zero_node, add);
3894 off = op0;
3895 goto do_add;
3897 break;
3898 case MULT_EXPR:
3899 if (scale == 1 && tree_fits_shwi_p (op1))
3901 int new_scale = tree_to_shwi (op1);
3902 /* Only treat this as a scaling operation if the target
3903 supports it. */
3904 if (use_ifn_p
3905 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3906 vectype, memory_type, 1,
3907 TYPE_SIGN (TREE_TYPE (op0)),
3908 new_scale, &ifn,
3909 &element_type))
3910 break;
3911 scale = new_scale;
3912 off = op0;
3913 continue;
3915 break;
3916 case SSA_NAME:
3917 off = op0;
3918 continue;
3919 CASE_CONVERT:
3920 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3921 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3922 break;
3923 if (TYPE_PRECISION (TREE_TYPE (op0))
3924 == TYPE_PRECISION (TREE_TYPE (off)))
3926 off = op0;
3927 continue;
3930 /* The internal functions need the offset to be the same width
3931 as the elements of VECTYPE. Don't include operations that
3932 cast the offset from that width to a different width. */
3933 if (use_ifn_p
3934 && (int_size_in_bytes (TREE_TYPE (vectype))
3935 == int_size_in_bytes (TREE_TYPE (off))))
3936 break;
3938 if (TYPE_PRECISION (TREE_TYPE (op0))
3939 < TYPE_PRECISION (TREE_TYPE (off)))
3941 off = op0;
3942 offtype = TREE_TYPE (off);
3943 STRIP_NOPS (off);
3944 continue;
3946 break;
3947 default:
3948 break;
3950 break;
3953 /* If at the end OFF still isn't a SSA_NAME or isn't
3954 defined in the loop, punt. */
3955 if (TREE_CODE (off) != SSA_NAME
3956 || expr_invariant_in_loop_p (loop, off))
3957 return false;
3959 if (offtype == NULL_TREE)
3960 offtype = TREE_TYPE (off);
3962 if (use_ifn_p)
3964 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3965 memory_type, TYPE_PRECISION (offtype),
3966 TYPE_SIGN (offtype), scale, &ifn,
3967 &element_type))
3968 return false;
3970 else
3972 if (DR_IS_READ (dr))
3974 if (targetm.vectorize.builtin_gather)
3975 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3977 else
3979 if (targetm.vectorize.builtin_scatter)
3980 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3983 if (!decl)
3984 return false;
3986 ifn = IFN_LAST;
3987 element_type = TREE_TYPE (vectype);
3990 info->ifn = ifn;
3991 info->decl = decl;
3992 info->base = base;
3993 info->offset = off;
3994 info->offset_dt = vect_unknown_def_type;
3995 info->offset_vectype = NULL_TREE;
3996 info->scale = scale;
3997 info->element_type = element_type;
3998 info->memory_type = memory_type;
3999 return true;
4002 /* Find the data references in STMT, analyze them with respect to LOOP and
4003 append them to DATAREFS. Return false if datarefs in this stmt cannot
4004 be handled. */
4006 opt_result
4007 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4008 vec<data_reference_p> *datarefs)
4010 /* We can ignore clobbers for dataref analysis - they are removed during
4011 loop vectorization and BB vectorization checks dependences with a
4012 stmt walk. */
4013 if (gimple_clobber_p (stmt))
4014 return opt_result::success ();
4016 if (gimple_has_volatile_ops (stmt))
4017 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4018 stmt);
4020 if (stmt_can_throw_internal (cfun, stmt))
4021 return opt_result::failure_at (stmt,
4022 "not vectorized:"
4023 " statement can throw an exception: %G",
4024 stmt);
4026 auto_vec<data_reference_p, 2> refs;
4027 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4028 if (!res)
4029 return res;
4031 if (refs.is_empty ())
4032 return opt_result::success ();
4034 if (refs.length () > 1)
4035 return opt_result::failure_at (stmt,
4036 "not vectorized:"
4037 " more than one data ref in stmt: %G", stmt);
4039 if (gcall *call = dyn_cast <gcall *> (stmt))
4040 if (!gimple_call_internal_p (call)
4041 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4042 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4043 return opt_result::failure_at (stmt,
4044 "not vectorized: dr in a call %G", stmt);
4046 data_reference_p dr = refs.pop ();
4047 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4048 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4049 return opt_result::failure_at (stmt,
4050 "not vectorized:"
4051 " statement is bitfield access %G", stmt);
4053 if (DR_BASE_ADDRESS (dr)
4054 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4055 return opt_result::failure_at (stmt,
4056 "not vectorized:"
4057 " base addr of dr is a constant\n");
4059 /* Check whether this may be a SIMD lane access and adjust the
4060 DR to make it easier for us to handle it. */
4061 if (loop
4062 && loop->simduid
4063 && (!DR_BASE_ADDRESS (dr)
4064 || !DR_OFFSET (dr)
4065 || !DR_INIT (dr)
4066 || !DR_STEP (dr)))
4068 struct data_reference *newdr
4069 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4070 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4071 if (DR_BASE_ADDRESS (newdr)
4072 && DR_OFFSET (newdr)
4073 && DR_INIT (newdr)
4074 && DR_STEP (newdr)
4075 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4076 && integer_zerop (DR_STEP (newdr)))
4078 tree base_address = DR_BASE_ADDRESS (newdr);
4079 tree off = DR_OFFSET (newdr);
4080 tree step = ssize_int (1);
4081 if (integer_zerop (off)
4082 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4084 off = TREE_OPERAND (base_address, 1);
4085 base_address = TREE_OPERAND (base_address, 0);
4087 STRIP_NOPS (off);
4088 if (TREE_CODE (off) == MULT_EXPR
4089 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4091 step = TREE_OPERAND (off, 1);
4092 off = TREE_OPERAND (off, 0);
4093 STRIP_NOPS (off);
4095 if (CONVERT_EXPR_P (off)
4096 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4097 < TYPE_PRECISION (TREE_TYPE (off))))
4098 off = TREE_OPERAND (off, 0);
4099 if (TREE_CODE (off) == SSA_NAME)
4101 gimple *def = SSA_NAME_DEF_STMT (off);
4102 /* Look through widening conversion. */
4103 if (is_gimple_assign (def)
4104 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4106 tree rhs1 = gimple_assign_rhs1 (def);
4107 if (TREE_CODE (rhs1) == SSA_NAME
4108 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4109 && (TYPE_PRECISION (TREE_TYPE (off))
4110 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4111 def = SSA_NAME_DEF_STMT (rhs1);
4113 if (is_gimple_call (def)
4114 && gimple_call_internal_p (def)
4115 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4117 tree arg = gimple_call_arg (def, 0);
4118 tree reft = TREE_TYPE (DR_REF (newdr));
4119 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4120 arg = SSA_NAME_VAR (arg);
4121 if (arg == loop->simduid
4122 /* For now. */
4123 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4125 DR_BASE_ADDRESS (newdr) = base_address;
4126 DR_OFFSET (newdr) = ssize_int (0);
4127 DR_STEP (newdr) = step;
4128 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4129 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4130 /* Mark as simd-lane access. */
4131 tree arg2 = gimple_call_arg (def, 1);
4132 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4133 free_data_ref (dr);
4134 datarefs->safe_push (newdr);
4135 return opt_result::success ();
4140 free_data_ref (newdr);
4143 datarefs->safe_push (dr);
4144 return opt_result::success ();
4147 /* Function vect_analyze_data_refs.
4149 Find all the data references in the loop or basic block.
4151 The general structure of the analysis of data refs in the vectorizer is as
4152 follows:
4153 1- vect_analyze_data_refs(loop/bb): call
4154 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4155 in the loop/bb and their dependences.
4156 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4157 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4158 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4162 opt_result
4163 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4165 struct loop *loop = NULL;
4166 unsigned int i;
4167 struct data_reference *dr;
4168 tree scalar_type;
4170 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4172 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4173 loop = LOOP_VINFO_LOOP (loop_vinfo);
4175 /* Go through the data-refs, check that the analysis succeeded. Update
4176 pointer from stmt_vec_info struct to DR and vectype. */
4178 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4179 FOR_EACH_VEC_ELT (datarefs, i, dr)
4181 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4182 poly_uint64 vf;
4184 gcc_assert (DR_REF (dr));
4185 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4186 gcc_assert (!stmt_info->dr_aux.dr);
4187 stmt_info->dr_aux.dr = dr;
4188 stmt_info->dr_aux.stmt = stmt_info;
4190 /* Check that analysis of the data-ref succeeded. */
4191 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4192 || !DR_STEP (dr))
4194 bool maybe_gather
4195 = DR_IS_READ (dr)
4196 && !TREE_THIS_VOLATILE (DR_REF (dr))
4197 && (targetm.vectorize.builtin_gather != NULL
4198 || supports_vec_gather_load_p ());
4199 bool maybe_scatter
4200 = DR_IS_WRITE (dr)
4201 && !TREE_THIS_VOLATILE (DR_REF (dr))
4202 && (targetm.vectorize.builtin_scatter != NULL
4203 || supports_vec_scatter_store_p ());
4205 /* If target supports vector gather loads or scatter stores,
4206 see if they can't be used. */
4207 if (is_a <loop_vec_info> (vinfo)
4208 && !nested_in_vect_loop_p (loop, stmt_info))
4210 if (maybe_gather || maybe_scatter)
4212 if (maybe_gather)
4213 gatherscatter = GATHER;
4214 else
4215 gatherscatter = SCATTER;
4219 if (gatherscatter == SG_NONE)
4221 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: data ref analysis "
4224 "failed %G", stmt_info->stmt);
4225 if (is_a <bb_vec_info> (vinfo))
4227 /* In BB vectorization the ref can still participate
4228 in dependence analysis, we just can't vectorize it. */
4229 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4230 continue;
4232 return opt_result::failure_at (stmt_info->stmt,
4233 "not vectorized:"
4234 " data ref analysis failed: %G",
4235 stmt_info->stmt);
4239 /* See if this was detected as SIMD lane access. */
4240 if (dr->aux == (void *)-1
4241 || dr->aux == (void *)-2
4242 || dr->aux == (void *)-3
4243 || dr->aux == (void *)-4)
4245 if (nested_in_vect_loop_p (loop, stmt_info))
4246 return opt_result::failure_at (stmt_info->stmt,
4247 "not vectorized:"
4248 " data ref analysis failed: %G",
4249 stmt_info->stmt);
4250 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4251 = -(uintptr_t) dr->aux;
4254 tree base = get_base_address (DR_REF (dr));
4255 if (base && VAR_P (base) && DECL_NONALIASED (base))
4257 if (dump_enabled_p ())
4258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4259 "not vectorized: base object not addressable "
4260 "for stmt: %G", stmt_info->stmt);
4261 if (is_a <bb_vec_info> (vinfo))
4263 /* In BB vectorization the ref can still participate
4264 in dependence analysis, we just can't vectorize it. */
4265 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4266 continue;
4268 return opt_result::failure_at (stmt_info->stmt,
4269 "not vectorized: base object not"
4270 " addressable for stmt: %G",
4271 stmt_info->stmt);
4274 if (is_a <loop_vec_info> (vinfo)
4275 && DR_STEP (dr)
4276 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4278 if (nested_in_vect_loop_p (loop, stmt_info))
4279 return opt_result::failure_at (stmt_info->stmt,
4280 "not vectorized:"
4281 "not suitable for strided load %G",
4282 stmt_info->stmt);
4283 STMT_VINFO_STRIDED_P (stmt_info) = true;
4286 /* Update DR field in stmt_vec_info struct. */
4288 /* If the dataref is in an inner-loop of the loop that is considered for
4289 for vectorization, we also want to analyze the access relative to
4290 the outer-loop (DR contains information only relative to the
4291 inner-most enclosing loop). We do that by building a reference to the
4292 first location accessed by the inner-loop, and analyze it relative to
4293 the outer-loop. */
4294 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4296 /* Build a reference to the first location accessed by the
4297 inner loop: *(BASE + INIT + OFFSET). By construction,
4298 this address must be invariant in the inner loop, so we
4299 can consider it as being used in the outer loop. */
4300 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4301 tree offset = unshare_expr (DR_OFFSET (dr));
4302 tree init = unshare_expr (DR_INIT (dr));
4303 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4304 init, offset);
4305 tree init_addr = fold_build_pointer_plus (base, init_offset);
4306 tree init_ref = build_fold_indirect_ref (init_addr);
4308 if (dump_enabled_p ())
4309 dump_printf_loc (MSG_NOTE, vect_location,
4310 "analyze in outer loop: %T\n", init_ref);
4312 opt_result res
4313 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4314 init_ref, loop, stmt_info->stmt);
4315 if (!res)
4316 /* dr_analyze_innermost already explained the failure. */
4317 return res;
4319 if (dump_enabled_p ())
4320 dump_printf_loc (MSG_NOTE, vect_location,
4321 "\touter base_address: %T\n"
4322 "\touter offset from base address: %T\n"
4323 "\touter constant offset from base address: %T\n"
4324 "\touter step: %T\n"
4325 "\touter base alignment: %d\n\n"
4326 "\touter base misalignment: %d\n"
4327 "\touter offset alignment: %d\n"
4328 "\touter step alignment: %d\n",
4329 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4330 STMT_VINFO_DR_OFFSET (stmt_info),
4331 STMT_VINFO_DR_INIT (stmt_info),
4332 STMT_VINFO_DR_STEP (stmt_info),
4333 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4334 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4335 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4336 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4339 /* Set vectype for STMT. */
4340 scalar_type = TREE_TYPE (DR_REF (dr));
4341 STMT_VINFO_VECTYPE (stmt_info)
4342 = get_vectype_for_scalar_type (scalar_type);
4343 if (!STMT_VINFO_VECTYPE (stmt_info))
4345 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4348 "not vectorized: no vectype for stmt: %G",
4349 stmt_info->stmt);
4350 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4351 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4352 scalar_type);
4353 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4356 if (is_a <bb_vec_info> (vinfo))
4358 /* No vector type is fine, the ref can still participate
4359 in dependence analysis, we just can't vectorize it. */
4360 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4361 continue;
4363 return opt_result::failure_at (stmt_info->stmt,
4364 "not vectorized:"
4365 " no vectype for stmt: %G"
4366 " scalar_type: %T\n",
4367 stmt_info->stmt, scalar_type);
4369 else
4371 if (dump_enabled_p ())
4372 dump_printf_loc (MSG_NOTE, vect_location,
4373 "got vectype for stmt: %G%T\n",
4374 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4377 /* Adjust the minimal vectorization factor according to the
4378 vector type. */
4379 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4380 *min_vf = upper_bound (*min_vf, vf);
4382 if (gatherscatter != SG_NONE)
4384 gather_scatter_info gs_info;
4385 if (!vect_check_gather_scatter (stmt_info,
4386 as_a <loop_vec_info> (vinfo),
4387 &gs_info)
4388 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4390 if (fatal)
4391 *fatal = false;
4392 return opt_result::failure_at
4393 (stmt_info->stmt,
4394 (gatherscatter == GATHER)
4395 ? "not vectorized: not suitable for gather load %G"
4396 : "not vectorized: not suitable for scatter store %G",
4397 stmt_info->stmt);
4399 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4403 /* We used to stop processing and prune the list here. Verify we no
4404 longer need to. */
4405 gcc_assert (i == datarefs.length ());
4407 return opt_result::success ();
4411 /* Function vect_get_new_vect_var.
4413 Returns a name for a new variable. The current naming scheme appends the
4414 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4415 the name of vectorizer generated variables, and appends that to NAME if
4416 provided. */
4418 tree
4419 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4421 const char *prefix;
4422 tree new_vect_var;
4424 switch (var_kind)
4426 case vect_simple_var:
4427 prefix = "vect";
4428 break;
4429 case vect_scalar_var:
4430 prefix = "stmp";
4431 break;
4432 case vect_mask_var:
4433 prefix = "mask";
4434 break;
4435 case vect_pointer_var:
4436 prefix = "vectp";
4437 break;
4438 default:
4439 gcc_unreachable ();
4442 if (name)
4444 char* tmp = concat (prefix, "_", name, NULL);
4445 new_vect_var = create_tmp_reg (type, tmp);
4446 free (tmp);
4448 else
4449 new_vect_var = create_tmp_reg (type, prefix);
4451 return new_vect_var;
4454 /* Like vect_get_new_vect_var but return an SSA name. */
4456 tree
4457 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4459 const char *prefix;
4460 tree new_vect_var;
4462 switch (var_kind)
4464 case vect_simple_var:
4465 prefix = "vect";
4466 break;
4467 case vect_scalar_var:
4468 prefix = "stmp";
4469 break;
4470 case vect_pointer_var:
4471 prefix = "vectp";
4472 break;
4473 default:
4474 gcc_unreachable ();
4477 if (name)
4479 char* tmp = concat (prefix, "_", name, NULL);
4480 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4481 free (tmp);
4483 else
4484 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4486 return new_vect_var;
4489 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. */
4491 static void
4492 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4494 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4495 int misalign = DR_MISALIGNMENT (dr_info);
4496 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4497 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4498 else
4499 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4500 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4501 misalign);
4504 /* Function vect_create_addr_base_for_vector_ref.
4506 Create an expression that computes the address of the first memory location
4507 that will be accessed for a data reference.
4509 Input:
4510 STMT_INFO: The statement containing the data reference.
4511 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4512 OFFSET: Optional. If supplied, it is be added to the initial address.
4513 LOOP: Specify relative to which loop-nest should the address be computed.
4514 For example, when the dataref is in an inner-loop nested in an
4515 outer-loop that is now being vectorized, LOOP can be either the
4516 outer-loop, or the inner-loop. The first memory location accessed
4517 by the following dataref ('in' points to short):
4519 for (i=0; i<N; i++)
4520 for (j=0; j<M; j++)
4521 s += in[i+j]
4523 is as follows:
4524 if LOOP=i_loop: &in (relative to i_loop)
4525 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4526 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4527 initial address. Unlike OFFSET, which is number of elements to
4528 be added, BYTE_OFFSET is measured in bytes.
4530 Output:
4531 1. Return an SSA_NAME whose value is the address of the memory location of
4532 the first vector of the data reference.
4533 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4534 these statement(s) which define the returned SSA_NAME.
4536 FORNOW: We are only handling array accesses with step 1. */
4538 tree
4539 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4540 gimple_seq *new_stmt_list,
4541 tree offset,
4542 tree byte_offset)
4544 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4545 struct data_reference *dr = dr_info->dr;
4546 const char *base_name;
4547 tree addr_base;
4548 tree dest;
4549 gimple_seq seq = NULL;
4550 tree vect_ptr_type;
4551 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4552 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4553 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4555 tree data_ref_base = unshare_expr (drb->base_address);
4556 tree base_offset = unshare_expr (drb->offset);
4557 tree init = unshare_expr (drb->init);
4559 if (loop_vinfo)
4560 base_name = get_name (data_ref_base);
4561 else
4563 base_offset = ssize_int (0);
4564 init = ssize_int (0);
4565 base_name = get_name (DR_REF (dr));
4568 /* Create base_offset */
4569 base_offset = size_binop (PLUS_EXPR,
4570 fold_convert (sizetype, base_offset),
4571 fold_convert (sizetype, init));
4573 if (offset)
4575 offset = fold_build2 (MULT_EXPR, sizetype,
4576 fold_convert (sizetype, offset), step);
4577 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4578 base_offset, offset);
4580 if (byte_offset)
4582 byte_offset = fold_convert (sizetype, byte_offset);
4583 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4584 base_offset, byte_offset);
4587 /* base + base_offset */
4588 if (loop_vinfo)
4589 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4590 else
4592 addr_base = build1 (ADDR_EXPR,
4593 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4594 unshare_expr (DR_REF (dr)));
4597 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4598 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4599 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4600 gimple_seq_add_seq (new_stmt_list, seq);
4602 if (DR_PTR_INFO (dr)
4603 && TREE_CODE (addr_base) == SSA_NAME
4604 && !SSA_NAME_PTR_INFO (addr_base))
4606 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4607 if (offset || byte_offset)
4608 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4611 if (dump_enabled_p ())
4612 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4614 return addr_base;
4618 /* Function vect_create_data_ref_ptr.
4620 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4621 location accessed in the loop by STMT_INFO, along with the def-use update
4622 chain to appropriately advance the pointer through the loop iterations.
4623 Also set aliasing information for the pointer. This pointer is used by
4624 the callers to this function to create a memory reference expression for
4625 vector load/store access.
4627 Input:
4628 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4629 GIMPLE_ASSIGN <name, data-ref> or
4630 GIMPLE_ASSIGN <data-ref, name>.
4631 2. AGGR_TYPE: the type of the reference, which should be either a vector
4632 or an array.
4633 3. AT_LOOP: the loop where the vector memref is to be created.
4634 4. OFFSET (optional): an offset to be added to the initial address accessed
4635 by the data-ref in STMT_INFO.
4636 5. BSI: location where the new stmts are to be placed if there is no loop
4637 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4638 pointing to the initial address.
4639 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4640 to the initial address accessed by the data-ref in STMT_INFO. This is
4641 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4642 in bytes.
4643 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4644 to the IV during each iteration of the loop. NULL says to move
4645 by one copy of AGGR_TYPE up or down, depending on the step of the
4646 data reference.
4648 Output:
4649 1. Declare a new ptr to vector_type, and have it point to the base of the
4650 data reference (initial addressed accessed by the data reference).
4651 For example, for vector of type V8HI, the following code is generated:
4653 v8hi *ap;
4654 ap = (v8hi *)initial_address;
4656 if OFFSET is not supplied:
4657 initial_address = &a[init];
4658 if OFFSET is supplied:
4659 initial_address = &a[init + OFFSET];
4660 if BYTE_OFFSET is supplied:
4661 initial_address = &a[init] + BYTE_OFFSET;
4663 Return the initial_address in INITIAL_ADDRESS.
4665 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4666 update the pointer in each iteration of the loop.
4668 Return the increment stmt that updates the pointer in PTR_INCR.
4670 3. Return the pointer. */
4672 tree
4673 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4674 struct loop *at_loop, tree offset,
4675 tree *initial_address, gimple_stmt_iterator *gsi,
4676 gimple **ptr_incr, bool only_init,
4677 tree byte_offset, tree iv_step)
4679 const char *base_name;
4680 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4681 struct loop *loop = NULL;
4682 bool nested_in_vect_loop = false;
4683 struct loop *containing_loop = NULL;
4684 tree aggr_ptr_type;
4685 tree aggr_ptr;
4686 tree new_temp;
4687 gimple_seq new_stmt_list = NULL;
4688 edge pe = NULL;
4689 basic_block new_bb;
4690 tree aggr_ptr_init;
4691 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4692 struct data_reference *dr = dr_info->dr;
4693 tree aptr;
4694 gimple_stmt_iterator incr_gsi;
4695 bool insert_after;
4696 tree indx_before_incr, indx_after_incr;
4697 gimple *incr;
4698 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4700 gcc_assert (iv_step != NULL_TREE
4701 || TREE_CODE (aggr_type) == ARRAY_TYPE
4702 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4704 if (loop_vinfo)
4706 loop = LOOP_VINFO_LOOP (loop_vinfo);
4707 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4708 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4709 pe = loop_preheader_edge (loop);
4711 else
4713 gcc_assert (bb_vinfo);
4714 only_init = true;
4715 *ptr_incr = NULL;
4718 /* Create an expression for the first address accessed by this load
4719 in LOOP. */
4720 base_name = get_name (DR_BASE_ADDRESS (dr));
4722 if (dump_enabled_p ())
4724 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4725 dump_printf_loc (MSG_NOTE, vect_location,
4726 "create %s-pointer variable to type: %T",
4727 get_tree_code_name (TREE_CODE (aggr_type)),
4728 aggr_type);
4729 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4730 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4731 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4732 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4733 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4734 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4735 else
4736 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4737 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4740 /* (1) Create the new aggregate-pointer variable.
4741 Vector and array types inherit the alias set of their component
4742 type by default so we need to use a ref-all pointer if the data
4743 reference does not conflict with the created aggregated data
4744 reference because it is not addressable. */
4745 bool need_ref_all = false;
4746 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4747 get_alias_set (DR_REF (dr))))
4748 need_ref_all = true;
4749 /* Likewise for any of the data references in the stmt group. */
4750 else if (DR_GROUP_SIZE (stmt_info) > 1)
4752 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4755 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4756 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4757 get_alias_set (DR_REF (sdr))))
4759 need_ref_all = true;
4760 break;
4762 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4764 while (sinfo);
4766 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4767 need_ref_all);
4768 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4771 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4772 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4773 def-use update cycles for the pointer: one relative to the outer-loop
4774 (LOOP), which is what steps (3) and (4) below do. The other is relative
4775 to the inner-loop (which is the inner-most loop containing the dataref),
4776 and this is done be step (5) below.
4778 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4779 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4780 redundant. Steps (3),(4) create the following:
4782 vp0 = &base_addr;
4783 LOOP: vp1 = phi(vp0,vp2)
4786 vp2 = vp1 + step
4787 goto LOOP
4789 If there is an inner-loop nested in loop, then step (5) will also be
4790 applied, and an additional update in the inner-loop will be created:
4792 vp0 = &base_addr;
4793 LOOP: vp1 = phi(vp0,vp2)
4795 inner: vp3 = phi(vp1,vp4)
4796 vp4 = vp3 + inner_step
4797 if () goto inner
4799 vp2 = vp1 + step
4800 if () goto LOOP */
4802 /* (2) Calculate the initial address of the aggregate-pointer, and set
4803 the aggregate-pointer to point to it before the loop. */
4805 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4807 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4808 offset, byte_offset);
4809 if (new_stmt_list)
4811 if (pe)
4813 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4814 gcc_assert (!new_bb);
4816 else
4817 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4820 *initial_address = new_temp;
4821 aggr_ptr_init = new_temp;
4823 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4824 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4825 inner-loop nested in LOOP (during outer-loop vectorization). */
4827 /* No update in loop is required. */
4828 if (only_init && (!loop_vinfo || at_loop == loop))
4829 aptr = aggr_ptr_init;
4830 else
4832 /* Accesses to invariant addresses should be handled specially
4833 by the caller. */
4834 tree step = vect_dr_behavior (dr_info)->step;
4835 gcc_assert (!integer_zerop (step));
4837 if (iv_step == NULL_TREE)
4839 /* The step of the aggregate pointer is the type size,
4840 negated for downward accesses. */
4841 iv_step = TYPE_SIZE_UNIT (aggr_type);
4842 if (tree_int_cst_sgn (step) == -1)
4843 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4846 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4848 create_iv (aggr_ptr_init,
4849 fold_convert (aggr_ptr_type, iv_step),
4850 aggr_ptr, loop, &incr_gsi, insert_after,
4851 &indx_before_incr, &indx_after_incr);
4852 incr = gsi_stmt (incr_gsi);
4853 loop_vinfo->add_stmt (incr);
4855 /* Copy the points-to information if it exists. */
4856 if (DR_PTR_INFO (dr))
4858 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4859 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4861 if (ptr_incr)
4862 *ptr_incr = incr;
4864 aptr = indx_before_incr;
4867 if (!nested_in_vect_loop || only_init)
4868 return aptr;
4871 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4872 nested in LOOP, if exists. */
4874 gcc_assert (nested_in_vect_loop);
4875 if (!only_init)
4877 standard_iv_increment_position (containing_loop, &incr_gsi,
4878 &insert_after);
4879 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4880 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4881 &indx_after_incr);
4882 incr = gsi_stmt (incr_gsi);
4883 loop_vinfo->add_stmt (incr);
4885 /* Copy the points-to information if it exists. */
4886 if (DR_PTR_INFO (dr))
4888 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4889 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4891 if (ptr_incr)
4892 *ptr_incr = incr;
4894 return indx_before_incr;
4896 else
4897 gcc_unreachable ();
4901 /* Function bump_vector_ptr
4903 Increment a pointer (to a vector type) by vector-size. If requested,
4904 i.e. if PTR-INCR is given, then also connect the new increment stmt
4905 to the existing def-use update-chain of the pointer, by modifying
4906 the PTR_INCR as illustrated below:
4908 The pointer def-use update-chain before this function:
4909 DATAREF_PTR = phi (p_0, p_2)
4910 ....
4911 PTR_INCR: p_2 = DATAREF_PTR + step
4913 The pointer def-use update-chain after this function:
4914 DATAREF_PTR = phi (p_0, p_2)
4915 ....
4916 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4917 ....
4918 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4920 Input:
4921 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4922 in the loop.
4923 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4924 the loop. The increment amount across iterations is expected
4925 to be vector_size.
4926 BSI - location where the new update stmt is to be placed.
4927 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4928 BUMP - optional. The offset by which to bump the pointer. If not given,
4929 the offset is assumed to be vector_size.
4931 Output: Return NEW_DATAREF_PTR as illustrated above.
4935 tree
4936 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4937 stmt_vec_info stmt_info, tree bump)
4939 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4940 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4941 tree update = TYPE_SIZE_UNIT (vectype);
4942 gassign *incr_stmt;
4943 ssa_op_iter iter;
4944 use_operand_p use_p;
4945 tree new_dataref_ptr;
4947 if (bump)
4948 update = bump;
4950 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4951 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4952 else
4953 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4954 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4955 dataref_ptr, update);
4956 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4958 /* Copy the points-to information if it exists. */
4959 if (DR_PTR_INFO (dr))
4961 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4962 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4965 if (!ptr_incr)
4966 return new_dataref_ptr;
4968 /* Update the vector-pointer's cross-iteration increment. */
4969 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4971 tree use = USE_FROM_PTR (use_p);
4973 if (use == dataref_ptr)
4974 SET_USE (use_p, new_dataref_ptr);
4975 else
4976 gcc_assert (operand_equal_p (use, update, 0));
4979 return new_dataref_ptr;
4983 /* Copy memory reference info such as base/clique from the SRC reference
4984 to the DEST MEM_REF. */
4986 void
4987 vect_copy_ref_info (tree dest, tree src)
4989 if (TREE_CODE (dest) != MEM_REF)
4990 return;
4992 tree src_base = src;
4993 while (handled_component_p (src_base))
4994 src_base = TREE_OPERAND (src_base, 0);
4995 if (TREE_CODE (src_base) != MEM_REF
4996 && TREE_CODE (src_base) != TARGET_MEM_REF)
4997 return;
4999 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5000 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5004 /* Function vect_create_destination_var.
5006 Create a new temporary of type VECTYPE. */
5008 tree
5009 vect_create_destination_var (tree scalar_dest, tree vectype)
5011 tree vec_dest;
5012 const char *name;
5013 char *new_name;
5014 tree type;
5015 enum vect_var_kind kind;
5017 kind = vectype
5018 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5019 ? vect_mask_var
5020 : vect_simple_var
5021 : vect_scalar_var;
5022 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5024 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5026 name = get_name (scalar_dest);
5027 if (name)
5028 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5029 else
5030 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5031 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5032 free (new_name);
5034 return vec_dest;
5037 /* Function vect_grouped_store_supported.
5039 Returns TRUE if interleave high and interleave low permutations
5040 are supported, and FALSE otherwise. */
5042 bool
5043 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5045 machine_mode mode = TYPE_MODE (vectype);
5047 /* vect_permute_store_chain requires the group size to be equal to 3 or
5048 be a power of two. */
5049 if (count != 3 && exact_log2 (count) == -1)
5051 if (dump_enabled_p ())
5052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5053 "the size of the group of accesses"
5054 " is not a power of 2 or not eqaul to 3\n");
5055 return false;
5058 /* Check that the permutation is supported. */
5059 if (VECTOR_MODE_P (mode))
5061 unsigned int i;
5062 if (count == 3)
5064 unsigned int j0 = 0, j1 = 0, j2 = 0;
5065 unsigned int i, j;
5067 unsigned int nelt;
5068 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5070 if (dump_enabled_p ())
5071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5072 "cannot handle groups of 3 stores for"
5073 " variable-length vectors\n");
5074 return false;
5077 vec_perm_builder sel (nelt, nelt, 1);
5078 sel.quick_grow (nelt);
5079 vec_perm_indices indices;
5080 for (j = 0; j < 3; j++)
5082 int nelt0 = ((3 - j) * nelt) % 3;
5083 int nelt1 = ((3 - j) * nelt + 1) % 3;
5084 int nelt2 = ((3 - j) * nelt + 2) % 3;
5085 for (i = 0; i < nelt; i++)
5087 if (3 * i + nelt0 < nelt)
5088 sel[3 * i + nelt0] = j0++;
5089 if (3 * i + nelt1 < nelt)
5090 sel[3 * i + nelt1] = nelt + j1++;
5091 if (3 * i + nelt2 < nelt)
5092 sel[3 * i + nelt2] = 0;
5094 indices.new_vector (sel, 2, nelt);
5095 if (!can_vec_perm_const_p (mode, indices))
5097 if (dump_enabled_p ())
5098 dump_printf (MSG_MISSED_OPTIMIZATION,
5099 "permutation op not supported by target.\n");
5100 return false;
5103 for (i = 0; i < nelt; i++)
5105 if (3 * i + nelt0 < nelt)
5106 sel[3 * i + nelt0] = 3 * i + nelt0;
5107 if (3 * i + nelt1 < nelt)
5108 sel[3 * i + nelt1] = 3 * i + nelt1;
5109 if (3 * i + nelt2 < nelt)
5110 sel[3 * i + nelt2] = nelt + j2++;
5112 indices.new_vector (sel, 2, nelt);
5113 if (!can_vec_perm_const_p (mode, indices))
5115 if (dump_enabled_p ())
5116 dump_printf (MSG_MISSED_OPTIMIZATION,
5117 "permutation op not supported by target.\n");
5118 return false;
5121 return true;
5123 else
5125 /* If length is not equal to 3 then only power of 2 is supported. */
5126 gcc_assert (pow2p_hwi (count));
5127 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5129 /* The encoding has 2 interleaved stepped patterns. */
5130 vec_perm_builder sel (nelt, 2, 3);
5131 sel.quick_grow (6);
5132 for (i = 0; i < 3; i++)
5134 sel[i * 2] = i;
5135 sel[i * 2 + 1] = i + nelt;
5137 vec_perm_indices indices (sel, 2, nelt);
5138 if (can_vec_perm_const_p (mode, indices))
5140 for (i = 0; i < 6; i++)
5141 sel[i] += exact_div (nelt, 2);
5142 indices.new_vector (sel, 2, nelt);
5143 if (can_vec_perm_const_p (mode, indices))
5144 return true;
5149 if (dump_enabled_p ())
5150 dump_printf (MSG_MISSED_OPTIMIZATION,
5151 "permutation op not supported by target.\n");
5152 return false;
5156 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5157 type VECTYPE. MASKED_P says whether the masked form is needed. */
5159 bool
5160 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5161 bool masked_p)
5163 if (masked_p)
5164 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5165 vec_mask_store_lanes_optab,
5166 vectype, count);
5167 else
5168 return vect_lanes_optab_supported_p ("vec_store_lanes",
5169 vec_store_lanes_optab,
5170 vectype, count);
5174 /* Function vect_permute_store_chain.
5176 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5177 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5178 the data correctly for the stores. Return the final references for stores
5179 in RESULT_CHAIN.
5181 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5182 The input is 4 vectors each containing 8 elements. We assign a number to
5183 each element, the input sequence is:
5185 1st vec: 0 1 2 3 4 5 6 7
5186 2nd vec: 8 9 10 11 12 13 14 15
5187 3rd vec: 16 17 18 19 20 21 22 23
5188 4th vec: 24 25 26 27 28 29 30 31
5190 The output sequence should be:
5192 1st vec: 0 8 16 24 1 9 17 25
5193 2nd vec: 2 10 18 26 3 11 19 27
5194 3rd vec: 4 12 20 28 5 13 21 30
5195 4th vec: 6 14 22 30 7 15 23 31
5197 i.e., we interleave the contents of the four vectors in their order.
5199 We use interleave_high/low instructions to create such output. The input of
5200 each interleave_high/low operation is two vectors:
5201 1st vec 2nd vec
5202 0 1 2 3 4 5 6 7
5203 the even elements of the result vector are obtained left-to-right from the
5204 high/low elements of the first vector. The odd elements of the result are
5205 obtained left-to-right from the high/low elements of the second vector.
5206 The output of interleave_high will be: 0 4 1 5
5207 and of interleave_low: 2 6 3 7
5210 The permutation is done in log LENGTH stages. In each stage interleave_high
5211 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5212 where the first argument is taken from the first half of DR_CHAIN and the
5213 second argument from it's second half.
5214 In our example,
5216 I1: interleave_high (1st vec, 3rd vec)
5217 I2: interleave_low (1st vec, 3rd vec)
5218 I3: interleave_high (2nd vec, 4th vec)
5219 I4: interleave_low (2nd vec, 4th vec)
5221 The output for the first stage is:
5223 I1: 0 16 1 17 2 18 3 19
5224 I2: 4 20 5 21 6 22 7 23
5225 I3: 8 24 9 25 10 26 11 27
5226 I4: 12 28 13 29 14 30 15 31
5228 The output of the second stage, i.e. the final result is:
5230 I1: 0 8 16 24 1 9 17 25
5231 I2: 2 10 18 26 3 11 19 27
5232 I3: 4 12 20 28 5 13 21 30
5233 I4: 6 14 22 30 7 15 23 31. */
5235 void
5236 vect_permute_store_chain (vec<tree> dr_chain,
5237 unsigned int length,
5238 stmt_vec_info stmt_info,
5239 gimple_stmt_iterator *gsi,
5240 vec<tree> *result_chain)
5242 tree vect1, vect2, high, low;
5243 gimple *perm_stmt;
5244 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5245 tree perm_mask_low, perm_mask_high;
5246 tree data_ref;
5247 tree perm3_mask_low, perm3_mask_high;
5248 unsigned int i, j, n, log_length = exact_log2 (length);
5250 result_chain->quick_grow (length);
5251 memcpy (result_chain->address (), dr_chain.address (),
5252 length * sizeof (tree));
5254 if (length == 3)
5256 /* vect_grouped_store_supported ensures that this is constant. */
5257 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5258 unsigned int j0 = 0, j1 = 0, j2 = 0;
5260 vec_perm_builder sel (nelt, nelt, 1);
5261 sel.quick_grow (nelt);
5262 vec_perm_indices indices;
5263 for (j = 0; j < 3; j++)
5265 int nelt0 = ((3 - j) * nelt) % 3;
5266 int nelt1 = ((3 - j) * nelt + 1) % 3;
5267 int nelt2 = ((3 - j) * nelt + 2) % 3;
5269 for (i = 0; i < nelt; i++)
5271 if (3 * i + nelt0 < nelt)
5272 sel[3 * i + nelt0] = j0++;
5273 if (3 * i + nelt1 < nelt)
5274 sel[3 * i + nelt1] = nelt + j1++;
5275 if (3 * i + nelt2 < nelt)
5276 sel[3 * i + nelt2] = 0;
5278 indices.new_vector (sel, 2, nelt);
5279 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5281 for (i = 0; i < nelt; i++)
5283 if (3 * i + nelt0 < nelt)
5284 sel[3 * i + nelt0] = 3 * i + nelt0;
5285 if (3 * i + nelt1 < nelt)
5286 sel[3 * i + nelt1] = 3 * i + nelt1;
5287 if (3 * i + nelt2 < nelt)
5288 sel[3 * i + nelt2] = nelt + j2++;
5290 indices.new_vector (sel, 2, nelt);
5291 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5293 vect1 = dr_chain[0];
5294 vect2 = dr_chain[1];
5296 /* Create interleaving stmt:
5297 low = VEC_PERM_EXPR <vect1, vect2,
5298 {j, nelt, *, j + 1, nelt + j + 1, *,
5299 j + 2, nelt + j + 2, *, ...}> */
5300 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5301 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5302 vect2, perm3_mask_low);
5303 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5305 vect1 = data_ref;
5306 vect2 = dr_chain[2];
5307 /* Create interleaving stmt:
5308 low = VEC_PERM_EXPR <vect1, vect2,
5309 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5310 6, 7, nelt + j + 2, ...}> */
5311 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5312 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5313 vect2, perm3_mask_high);
5314 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5315 (*result_chain)[j] = data_ref;
5318 else
5320 /* If length is not equal to 3 then only power of 2 is supported. */
5321 gcc_assert (pow2p_hwi (length));
5323 /* The encoding has 2 interleaved stepped patterns. */
5324 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5325 vec_perm_builder sel (nelt, 2, 3);
5326 sel.quick_grow (6);
5327 for (i = 0; i < 3; i++)
5329 sel[i * 2] = i;
5330 sel[i * 2 + 1] = i + nelt;
5332 vec_perm_indices indices (sel, 2, nelt);
5333 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5335 for (i = 0; i < 6; i++)
5336 sel[i] += exact_div (nelt, 2);
5337 indices.new_vector (sel, 2, nelt);
5338 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5340 for (i = 0, n = log_length; i < n; i++)
5342 for (j = 0; j < length/2; j++)
5344 vect1 = dr_chain[j];
5345 vect2 = dr_chain[j+length/2];
5347 /* Create interleaving stmt:
5348 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5349 ...}> */
5350 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5351 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5352 vect2, perm_mask_high);
5353 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5354 (*result_chain)[2*j] = high;
5356 /* Create interleaving stmt:
5357 low = VEC_PERM_EXPR <vect1, vect2,
5358 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5359 ...}> */
5360 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5361 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5362 vect2, perm_mask_low);
5363 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5364 (*result_chain)[2*j+1] = low;
5366 memcpy (dr_chain.address (), result_chain->address (),
5367 length * sizeof (tree));
5372 /* Function vect_setup_realignment
5374 This function is called when vectorizing an unaligned load using
5375 the dr_explicit_realign[_optimized] scheme.
5376 This function generates the following code at the loop prolog:
5378 p = initial_addr;
5379 x msq_init = *(floor(p)); # prolog load
5380 realignment_token = call target_builtin;
5381 loop:
5382 x msq = phi (msq_init, ---)
5384 The stmts marked with x are generated only for the case of
5385 dr_explicit_realign_optimized.
5387 The code above sets up a new (vector) pointer, pointing to the first
5388 location accessed by STMT_INFO, and a "floor-aligned" load using that
5389 pointer. It also generates code to compute the "realignment-token"
5390 (if the relevant target hook was defined), and creates a phi-node at the
5391 loop-header bb whose arguments are the result of the prolog-load (created
5392 by this function) and the result of a load that takes place in the loop
5393 (to be created by the caller to this function).
5395 For the case of dr_explicit_realign_optimized:
5396 The caller to this function uses the phi-result (msq) to create the
5397 realignment code inside the loop, and sets up the missing phi argument,
5398 as follows:
5399 loop:
5400 msq = phi (msq_init, lsq)
5401 lsq = *(floor(p')); # load in loop
5402 result = realign_load (msq, lsq, realignment_token);
5404 For the case of dr_explicit_realign:
5405 loop:
5406 msq = *(floor(p)); # load in loop
5407 p' = p + (VS-1);
5408 lsq = *(floor(p')); # load in loop
5409 result = realign_load (msq, lsq, realignment_token);
5411 Input:
5412 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5413 a memory location that may be unaligned.
5414 BSI - place where new code is to be inserted.
5415 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5416 is used.
5418 Output:
5419 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5420 target hook, if defined.
5421 Return value - the result of the loop-header phi node. */
5423 tree
5424 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5425 tree *realignment_token,
5426 enum dr_alignment_support alignment_support_scheme,
5427 tree init_addr,
5428 struct loop **at_loop)
5430 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5431 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5432 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5433 struct data_reference *dr = dr_info->dr;
5434 struct loop *loop = NULL;
5435 edge pe = NULL;
5436 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5437 tree vec_dest;
5438 gimple *inc;
5439 tree ptr;
5440 tree data_ref;
5441 basic_block new_bb;
5442 tree msq_init = NULL_TREE;
5443 tree new_temp;
5444 gphi *phi_stmt;
5445 tree msq = NULL_TREE;
5446 gimple_seq stmts = NULL;
5447 bool compute_in_loop = false;
5448 bool nested_in_vect_loop = false;
5449 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5450 struct loop *loop_for_initial_load = NULL;
5452 if (loop_vinfo)
5454 loop = LOOP_VINFO_LOOP (loop_vinfo);
5455 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5458 gcc_assert (alignment_support_scheme == dr_explicit_realign
5459 || alignment_support_scheme == dr_explicit_realign_optimized);
5461 /* We need to generate three things:
5462 1. the misalignment computation
5463 2. the extra vector load (for the optimized realignment scheme).
5464 3. the phi node for the two vectors from which the realignment is
5465 done (for the optimized realignment scheme). */
5467 /* 1. Determine where to generate the misalignment computation.
5469 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5470 calculation will be generated by this function, outside the loop (in the
5471 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5472 caller, inside the loop.
5474 Background: If the misalignment remains fixed throughout the iterations of
5475 the loop, then both realignment schemes are applicable, and also the
5476 misalignment computation can be done outside LOOP. This is because we are
5477 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5478 are a multiple of VS (the Vector Size), and therefore the misalignment in
5479 different vectorized LOOP iterations is always the same.
5480 The problem arises only if the memory access is in an inner-loop nested
5481 inside LOOP, which is now being vectorized using outer-loop vectorization.
5482 This is the only case when the misalignment of the memory access may not
5483 remain fixed throughout the iterations of the inner-loop (as explained in
5484 detail in vect_supportable_dr_alignment). In this case, not only is the
5485 optimized realignment scheme not applicable, but also the misalignment
5486 computation (and generation of the realignment token that is passed to
5487 REALIGN_LOAD) have to be done inside the loop.
5489 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5490 or not, which in turn determines if the misalignment is computed inside
5491 the inner-loop, or outside LOOP. */
5493 if (init_addr != NULL_TREE || !loop_vinfo)
5495 compute_in_loop = true;
5496 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5500 /* 2. Determine where to generate the extra vector load.
5502 For the optimized realignment scheme, instead of generating two vector
5503 loads in each iteration, we generate a single extra vector load in the
5504 preheader of the loop, and in each iteration reuse the result of the
5505 vector load from the previous iteration. In case the memory access is in
5506 an inner-loop nested inside LOOP, which is now being vectorized using
5507 outer-loop vectorization, we need to determine whether this initial vector
5508 load should be generated at the preheader of the inner-loop, or can be
5509 generated at the preheader of LOOP. If the memory access has no evolution
5510 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5511 to be generated inside LOOP (in the preheader of the inner-loop). */
5513 if (nested_in_vect_loop)
5515 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5516 bool invariant_in_outerloop =
5517 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5518 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5520 else
5521 loop_for_initial_load = loop;
5522 if (at_loop)
5523 *at_loop = loop_for_initial_load;
5525 if (loop_for_initial_load)
5526 pe = loop_preheader_edge (loop_for_initial_load);
5528 /* 3. For the case of the optimized realignment, create the first vector
5529 load at the loop preheader. */
5531 if (alignment_support_scheme == dr_explicit_realign_optimized)
5533 /* Create msq_init = *(floor(p1)) in the loop preheader */
5534 gassign *new_stmt;
5536 gcc_assert (!compute_in_loop);
5537 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5538 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5539 loop_for_initial_load, NULL_TREE,
5540 &init_addr, NULL, &inc, true);
5541 if (TREE_CODE (ptr) == SSA_NAME)
5542 new_temp = copy_ssa_name (ptr);
5543 else
5544 new_temp = make_ssa_name (TREE_TYPE (ptr));
5545 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5546 tree type = TREE_TYPE (ptr);
5547 new_stmt = gimple_build_assign
5548 (new_temp, BIT_AND_EXPR, ptr,
5549 fold_build2 (MINUS_EXPR, type,
5550 build_int_cst (type, 0),
5551 build_int_cst (type, align)));
5552 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5553 gcc_assert (!new_bb);
5554 data_ref
5555 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5556 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5557 vect_copy_ref_info (data_ref, DR_REF (dr));
5558 new_stmt = gimple_build_assign (vec_dest, data_ref);
5559 new_temp = make_ssa_name (vec_dest, new_stmt);
5560 gimple_assign_set_lhs (new_stmt, new_temp);
5561 if (pe)
5563 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5564 gcc_assert (!new_bb);
5566 else
5567 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5569 msq_init = gimple_assign_lhs (new_stmt);
5572 /* 4. Create realignment token using a target builtin, if available.
5573 It is done either inside the containing loop, or before LOOP (as
5574 determined above). */
5576 if (targetm.vectorize.builtin_mask_for_load)
5578 gcall *new_stmt;
5579 tree builtin_decl;
5581 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5582 if (!init_addr)
5584 /* Generate the INIT_ADDR computation outside LOOP. */
5585 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5586 NULL_TREE);
5587 if (loop)
5589 pe = loop_preheader_edge (loop);
5590 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5591 gcc_assert (!new_bb);
5593 else
5594 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5597 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5598 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5599 vec_dest =
5600 vect_create_destination_var (scalar_dest,
5601 gimple_call_return_type (new_stmt));
5602 new_temp = make_ssa_name (vec_dest, new_stmt);
5603 gimple_call_set_lhs (new_stmt, new_temp);
5605 if (compute_in_loop)
5606 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5607 else
5609 /* Generate the misalignment computation outside LOOP. */
5610 pe = loop_preheader_edge (loop);
5611 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5612 gcc_assert (!new_bb);
5615 *realignment_token = gimple_call_lhs (new_stmt);
5617 /* The result of the CALL_EXPR to this builtin is determined from
5618 the value of the parameter and no global variables are touched
5619 which makes the builtin a "const" function. Requiring the
5620 builtin to have the "const" attribute makes it unnecessary
5621 to call mark_call_clobbered. */
5622 gcc_assert (TREE_READONLY (builtin_decl));
5625 if (alignment_support_scheme == dr_explicit_realign)
5626 return msq;
5628 gcc_assert (!compute_in_loop);
5629 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5632 /* 5. Create msq = phi <msq_init, lsq> in loop */
5634 pe = loop_preheader_edge (containing_loop);
5635 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5636 msq = make_ssa_name (vec_dest);
5637 phi_stmt = create_phi_node (msq, containing_loop->header);
5638 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5640 return msq;
5644 /* Function vect_grouped_load_supported.
5646 COUNT is the size of the load group (the number of statements plus the
5647 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5648 only one statement, with a gap of COUNT - 1.
5650 Returns true if a suitable permute exists. */
5652 bool
5653 vect_grouped_load_supported (tree vectype, bool single_element_p,
5654 unsigned HOST_WIDE_INT count)
5656 machine_mode mode = TYPE_MODE (vectype);
5658 /* If this is single-element interleaving with an element distance
5659 that leaves unused vector loads around punt - we at least create
5660 very sub-optimal code in that case (and blow up memory,
5661 see PR65518). */
5662 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5664 if (dump_enabled_p ())
5665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5666 "single-element interleaving not supported "
5667 "for not adjacent vector loads\n");
5668 return false;
5671 /* vect_permute_load_chain requires the group size to be equal to 3 or
5672 be a power of two. */
5673 if (count != 3 && exact_log2 (count) == -1)
5675 if (dump_enabled_p ())
5676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5677 "the size of the group of accesses"
5678 " is not a power of 2 or not equal to 3\n");
5679 return false;
5682 /* Check that the permutation is supported. */
5683 if (VECTOR_MODE_P (mode))
5685 unsigned int i, j;
5686 if (count == 3)
5688 unsigned int nelt;
5689 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5691 if (dump_enabled_p ())
5692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693 "cannot handle groups of 3 loads for"
5694 " variable-length vectors\n");
5695 return false;
5698 vec_perm_builder sel (nelt, nelt, 1);
5699 sel.quick_grow (nelt);
5700 vec_perm_indices indices;
5701 unsigned int k;
5702 for (k = 0; k < 3; k++)
5704 for (i = 0; i < nelt; i++)
5705 if (3 * i + k < 2 * nelt)
5706 sel[i] = 3 * i + k;
5707 else
5708 sel[i] = 0;
5709 indices.new_vector (sel, 2, nelt);
5710 if (!can_vec_perm_const_p (mode, indices))
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5714 "shuffle of 3 loads is not supported by"
5715 " target\n");
5716 return false;
5718 for (i = 0, j = 0; i < nelt; i++)
5719 if (3 * i + k < 2 * nelt)
5720 sel[i] = i;
5721 else
5722 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5723 indices.new_vector (sel, 2, nelt);
5724 if (!can_vec_perm_const_p (mode, indices))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728 "shuffle of 3 loads is not supported by"
5729 " target\n");
5730 return false;
5733 return true;
5735 else
5737 /* If length is not equal to 3 then only power of 2 is supported. */
5738 gcc_assert (pow2p_hwi (count));
5739 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5741 /* The encoding has a single stepped pattern. */
5742 vec_perm_builder sel (nelt, 1, 3);
5743 sel.quick_grow (3);
5744 for (i = 0; i < 3; i++)
5745 sel[i] = i * 2;
5746 vec_perm_indices indices (sel, 2, nelt);
5747 if (can_vec_perm_const_p (mode, indices))
5749 for (i = 0; i < 3; i++)
5750 sel[i] = i * 2 + 1;
5751 indices.new_vector (sel, 2, nelt);
5752 if (can_vec_perm_const_p (mode, indices))
5753 return true;
5758 if (dump_enabled_p ())
5759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5760 "extract even/odd not supported by target\n");
5761 return false;
5764 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5765 type VECTYPE. MASKED_P says whether the masked form is needed. */
5767 bool
5768 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5769 bool masked_p)
5771 if (masked_p)
5772 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5773 vec_mask_load_lanes_optab,
5774 vectype, count);
5775 else
5776 return vect_lanes_optab_supported_p ("vec_load_lanes",
5777 vec_load_lanes_optab,
5778 vectype, count);
5781 /* Function vect_permute_load_chain.
5783 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5784 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5785 the input data correctly. Return the final references for loads in
5786 RESULT_CHAIN.
5788 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5789 The input is 4 vectors each containing 8 elements. We assign a number to each
5790 element, the input sequence is:
5792 1st vec: 0 1 2 3 4 5 6 7
5793 2nd vec: 8 9 10 11 12 13 14 15
5794 3rd vec: 16 17 18 19 20 21 22 23
5795 4th vec: 24 25 26 27 28 29 30 31
5797 The output sequence should be:
5799 1st vec: 0 4 8 12 16 20 24 28
5800 2nd vec: 1 5 9 13 17 21 25 29
5801 3rd vec: 2 6 10 14 18 22 26 30
5802 4th vec: 3 7 11 15 19 23 27 31
5804 i.e., the first output vector should contain the first elements of each
5805 interleaving group, etc.
5807 We use extract_even/odd instructions to create such output. The input of
5808 each extract_even/odd operation is two vectors
5809 1st vec 2nd vec
5810 0 1 2 3 4 5 6 7
5812 and the output is the vector of extracted even/odd elements. The output of
5813 extract_even will be: 0 2 4 6
5814 and of extract_odd: 1 3 5 7
5817 The permutation is done in log LENGTH stages. In each stage extract_even
5818 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5819 their order. In our example,
5821 E1: extract_even (1st vec, 2nd vec)
5822 E2: extract_odd (1st vec, 2nd vec)
5823 E3: extract_even (3rd vec, 4th vec)
5824 E4: extract_odd (3rd vec, 4th vec)
5826 The output for the first stage will be:
5828 E1: 0 2 4 6 8 10 12 14
5829 E2: 1 3 5 7 9 11 13 15
5830 E3: 16 18 20 22 24 26 28 30
5831 E4: 17 19 21 23 25 27 29 31
5833 In order to proceed and create the correct sequence for the next stage (or
5834 for the correct output, if the second stage is the last one, as in our
5835 example), we first put the output of extract_even operation and then the
5836 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5837 The input for the second stage is:
5839 1st vec (E1): 0 2 4 6 8 10 12 14
5840 2nd vec (E3): 16 18 20 22 24 26 28 30
5841 3rd vec (E2): 1 3 5 7 9 11 13 15
5842 4th vec (E4): 17 19 21 23 25 27 29 31
5844 The output of the second stage:
5846 E1: 0 4 8 12 16 20 24 28
5847 E2: 2 6 10 14 18 22 26 30
5848 E3: 1 5 9 13 17 21 25 29
5849 E4: 3 7 11 15 19 23 27 31
5851 And RESULT_CHAIN after reordering:
5853 1st vec (E1): 0 4 8 12 16 20 24 28
5854 2nd vec (E3): 1 5 9 13 17 21 25 29
5855 3rd vec (E2): 2 6 10 14 18 22 26 30
5856 4th vec (E4): 3 7 11 15 19 23 27 31. */
5858 static void
5859 vect_permute_load_chain (vec<tree> dr_chain,
5860 unsigned int length,
5861 stmt_vec_info stmt_info,
5862 gimple_stmt_iterator *gsi,
5863 vec<tree> *result_chain)
5865 tree data_ref, first_vect, second_vect;
5866 tree perm_mask_even, perm_mask_odd;
5867 tree perm3_mask_low, perm3_mask_high;
5868 gimple *perm_stmt;
5869 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5870 unsigned int i, j, log_length = exact_log2 (length);
5872 result_chain->quick_grow (length);
5873 memcpy (result_chain->address (), dr_chain.address (),
5874 length * sizeof (tree));
5876 if (length == 3)
5878 /* vect_grouped_load_supported ensures that this is constant. */
5879 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5880 unsigned int k;
5882 vec_perm_builder sel (nelt, nelt, 1);
5883 sel.quick_grow (nelt);
5884 vec_perm_indices indices;
5885 for (k = 0; k < 3; k++)
5887 for (i = 0; i < nelt; i++)
5888 if (3 * i + k < 2 * nelt)
5889 sel[i] = 3 * i + k;
5890 else
5891 sel[i] = 0;
5892 indices.new_vector (sel, 2, nelt);
5893 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5895 for (i = 0, j = 0; i < nelt; i++)
5896 if (3 * i + k < 2 * nelt)
5897 sel[i] = i;
5898 else
5899 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5900 indices.new_vector (sel, 2, nelt);
5901 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5903 first_vect = dr_chain[0];
5904 second_vect = dr_chain[1];
5906 /* Create interleaving stmt (low part of):
5907 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5908 ...}> */
5909 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5910 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5911 second_vect, perm3_mask_low);
5912 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5914 /* Create interleaving stmt (high part of):
5915 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5916 ...}> */
5917 first_vect = data_ref;
5918 second_vect = dr_chain[2];
5919 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5920 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5921 second_vect, perm3_mask_high);
5922 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5923 (*result_chain)[k] = data_ref;
5926 else
5928 /* If length is not equal to 3 then only power of 2 is supported. */
5929 gcc_assert (pow2p_hwi (length));
5931 /* The encoding has a single stepped pattern. */
5932 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5933 vec_perm_builder sel (nelt, 1, 3);
5934 sel.quick_grow (3);
5935 for (i = 0; i < 3; ++i)
5936 sel[i] = i * 2;
5937 vec_perm_indices indices (sel, 2, nelt);
5938 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5940 for (i = 0; i < 3; ++i)
5941 sel[i] = i * 2 + 1;
5942 indices.new_vector (sel, 2, nelt);
5943 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5945 for (i = 0; i < log_length; i++)
5947 for (j = 0; j < length; j += 2)
5949 first_vect = dr_chain[j];
5950 second_vect = dr_chain[j+1];
5952 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5953 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5954 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5955 first_vect, second_vect,
5956 perm_mask_even);
5957 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5958 (*result_chain)[j/2] = data_ref;
5960 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5961 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5962 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5963 first_vect, second_vect,
5964 perm_mask_odd);
5965 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5966 (*result_chain)[j/2+length/2] = data_ref;
5968 memcpy (dr_chain.address (), result_chain->address (),
5969 length * sizeof (tree));
5974 /* Function vect_shift_permute_load_chain.
5976 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5977 sequence of stmts to reorder the input data accordingly.
5978 Return the final references for loads in RESULT_CHAIN.
5979 Return true if successed, false otherwise.
5981 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5982 The input is 3 vectors each containing 8 elements. We assign a
5983 number to each element, the input sequence is:
5985 1st vec: 0 1 2 3 4 5 6 7
5986 2nd vec: 8 9 10 11 12 13 14 15
5987 3rd vec: 16 17 18 19 20 21 22 23
5989 The output sequence should be:
5991 1st vec: 0 3 6 9 12 15 18 21
5992 2nd vec: 1 4 7 10 13 16 19 22
5993 3rd vec: 2 5 8 11 14 17 20 23
5995 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5997 First we shuffle all 3 vectors to get correct elements order:
5999 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6000 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6001 3rd vec: (16 19 22) (17 20 23) (18 21)
6003 Next we unite and shift vector 3 times:
6005 1st step:
6006 shift right by 6 the concatenation of:
6007 "1st vec" and "2nd vec"
6008 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6009 "2nd vec" and "3rd vec"
6010 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6011 "3rd vec" and "1st vec"
6012 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6013 | New vectors |
6015 So that now new vectors are:
6017 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6018 2nd vec: (10 13) (16 19 22) (17 20 23)
6019 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6021 2nd step:
6022 shift right by 5 the concatenation of:
6023 "1st vec" and "3rd vec"
6024 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6025 "2nd vec" and "1st vec"
6026 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6027 "3rd vec" and "2nd vec"
6028 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6029 | New vectors |
6031 So that now new vectors are:
6033 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6034 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6035 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6037 3rd step:
6038 shift right by 5 the concatenation of:
6039 "1st vec" and "1st vec"
6040 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6041 shift right by 3 the concatenation of:
6042 "2nd vec" and "2nd vec"
6043 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6044 | New vectors |
6046 So that now all vectors are READY:
6047 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6048 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6049 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6051 This algorithm is faster than one in vect_permute_load_chain if:
6052 1. "shift of a concatination" is faster than general permutation.
6053 This is usually so.
6054 2. The TARGET machine can't execute vector instructions in parallel.
6055 This is because each step of the algorithm depends on previous.
6056 The algorithm in vect_permute_load_chain is much more parallel.
6058 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6061 static bool
6062 vect_shift_permute_load_chain (vec<tree> dr_chain,
6063 unsigned int length,
6064 stmt_vec_info stmt_info,
6065 gimple_stmt_iterator *gsi,
6066 vec<tree> *result_chain)
6068 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6069 tree perm2_mask1, perm2_mask2, perm3_mask;
6070 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6071 gimple *perm_stmt;
6073 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6074 unsigned int i;
6075 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6077 unsigned HOST_WIDE_INT nelt, vf;
6078 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6079 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6080 /* Not supported for variable-length vectors. */
6081 return false;
6083 vec_perm_builder sel (nelt, nelt, 1);
6084 sel.quick_grow (nelt);
6086 result_chain->quick_grow (length);
6087 memcpy (result_chain->address (), dr_chain.address (),
6088 length * sizeof (tree));
6090 if (pow2p_hwi (length) && vf > 4)
6092 unsigned int j, log_length = exact_log2 (length);
6093 for (i = 0; i < nelt / 2; ++i)
6094 sel[i] = i * 2;
6095 for (i = 0; i < nelt / 2; ++i)
6096 sel[nelt / 2 + i] = i * 2 + 1;
6097 vec_perm_indices indices (sel, 2, nelt);
6098 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6100 if (dump_enabled_p ())
6101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6102 "shuffle of 2 fields structure is not \
6103 supported by target\n");
6104 return false;
6106 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6108 for (i = 0; i < nelt / 2; ++i)
6109 sel[i] = i * 2 + 1;
6110 for (i = 0; i < nelt / 2; ++i)
6111 sel[nelt / 2 + i] = i * 2;
6112 indices.new_vector (sel, 2, nelt);
6113 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6115 if (dump_enabled_p ())
6116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117 "shuffle of 2 fields structure is not \
6118 supported by target\n");
6119 return false;
6121 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6123 /* Generating permutation constant to shift all elements.
6124 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6125 for (i = 0; i < nelt; i++)
6126 sel[i] = nelt / 2 + i;
6127 indices.new_vector (sel, 2, nelt);
6128 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132 "shift permutation is not supported by target\n");
6133 return false;
6135 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6137 /* Generating permutation constant to select vector from 2.
6138 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6139 for (i = 0; i < nelt / 2; i++)
6140 sel[i] = i;
6141 for (i = nelt / 2; i < nelt; i++)
6142 sel[i] = nelt + i;
6143 indices.new_vector (sel, 2, nelt);
6144 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6146 if (dump_enabled_p ())
6147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6148 "select is not supported by target\n");
6149 return false;
6151 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6153 for (i = 0; i < log_length; i++)
6155 for (j = 0; j < length; j += 2)
6157 first_vect = dr_chain[j];
6158 second_vect = dr_chain[j + 1];
6160 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6161 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6162 first_vect, first_vect,
6163 perm2_mask1);
6164 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6165 vect[0] = data_ref;
6167 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6168 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6169 second_vect, second_vect,
6170 perm2_mask2);
6171 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6172 vect[1] = data_ref;
6174 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6175 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6176 vect[0], vect[1], shift1_mask);
6177 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6178 (*result_chain)[j/2 + length/2] = data_ref;
6180 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6181 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182 vect[0], vect[1], select_mask);
6183 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6184 (*result_chain)[j/2] = data_ref;
6186 memcpy (dr_chain.address (), result_chain->address (),
6187 length * sizeof (tree));
6189 return true;
6191 if (length == 3 && vf > 2)
6193 unsigned int k = 0, l = 0;
6195 /* Generating permutation constant to get all elements in rigth order.
6196 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6197 for (i = 0; i < nelt; i++)
6199 if (3 * k + (l % 3) >= nelt)
6201 k = 0;
6202 l += (3 - (nelt % 3));
6204 sel[i] = 3 * k + (l % 3);
6205 k++;
6207 vec_perm_indices indices (sel, 2, nelt);
6208 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6210 if (dump_enabled_p ())
6211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212 "shuffle of 3 fields structure is not \
6213 supported by target\n");
6214 return false;
6216 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6218 /* Generating permutation constant to shift all elements.
6219 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6220 for (i = 0; i < nelt; i++)
6221 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6222 indices.new_vector (sel, 2, nelt);
6223 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6225 if (dump_enabled_p ())
6226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227 "shift permutation is not supported by target\n");
6228 return false;
6230 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6232 /* Generating permutation constant to shift all elements.
6233 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6234 for (i = 0; i < nelt; i++)
6235 sel[i] = 2 * (nelt / 3) + 1 + i;
6236 indices.new_vector (sel, 2, nelt);
6237 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6239 if (dump_enabled_p ())
6240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6241 "shift permutation is not supported by target\n");
6242 return false;
6244 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6246 /* Generating permutation constant to shift all elements.
6247 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6248 for (i = 0; i < nelt; i++)
6249 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6250 indices.new_vector (sel, 2, nelt);
6251 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6253 if (dump_enabled_p ())
6254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6255 "shift permutation is not supported by target\n");
6256 return false;
6258 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6260 /* Generating permutation constant to shift all elements.
6261 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6262 for (i = 0; i < nelt; i++)
6263 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6264 indices.new_vector (sel, 2, nelt);
6265 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6267 if (dump_enabled_p ())
6268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269 "shift permutation is not supported by target\n");
6270 return false;
6272 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6274 for (k = 0; k < 3; k++)
6276 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6277 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6278 dr_chain[k], dr_chain[k],
6279 perm3_mask);
6280 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6281 vect[k] = data_ref;
6284 for (k = 0; k < 3; k++)
6286 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6287 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6288 vect[k % 3], vect[(k + 1) % 3],
6289 shift1_mask);
6290 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6291 vect_shift[k] = data_ref;
6294 for (k = 0; k < 3; k++)
6296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6298 vect_shift[(4 - k) % 3],
6299 vect_shift[(3 - k) % 3],
6300 shift2_mask);
6301 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6302 vect[k] = data_ref;
6305 (*result_chain)[3 - (nelt % 3)] = vect[2];
6307 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6308 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6309 vect[0], shift3_mask);
6310 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6311 (*result_chain)[nelt % 3] = data_ref;
6313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6315 vect[1], shift4_mask);
6316 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6317 (*result_chain)[0] = data_ref;
6318 return true;
6320 return false;
6323 /* Function vect_transform_grouped_load.
6325 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6326 to perform their permutation and ascribe the result vectorized statements to
6327 the scalar statements.
6330 void
6331 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6332 int size, gimple_stmt_iterator *gsi)
6334 machine_mode mode;
6335 vec<tree> result_chain = vNULL;
6337 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6338 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6339 vectors, that are ready for vector computation. */
6340 result_chain.create (size);
6342 /* If reassociation width for vector type is 2 or greater target machine can
6343 execute 2 or more vector instructions in parallel. Otherwise try to
6344 get chain for loads group using vect_shift_permute_load_chain. */
6345 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6346 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6347 || pow2p_hwi (size)
6348 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6349 gsi, &result_chain))
6350 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6351 vect_record_grouped_load_vectors (stmt_info, result_chain);
6352 result_chain.release ();
6355 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6356 generated as part of the vectorization of STMT_INFO. Assign the statement
6357 for each vector to the associated scalar statement. */
6359 void
6360 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6361 vec<tree> result_chain)
6363 vec_info *vinfo = stmt_info->vinfo;
6364 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6365 unsigned int i, gap_count;
6366 tree tmp_data_ref;
6368 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6369 Since we scan the chain starting from it's first node, their order
6370 corresponds the order of data-refs in RESULT_CHAIN. */
6371 stmt_vec_info next_stmt_info = first_stmt_info;
6372 gap_count = 1;
6373 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6375 if (!next_stmt_info)
6376 break;
6378 /* Skip the gaps. Loads created for the gaps will be removed by dead
6379 code elimination pass later. No need to check for the first stmt in
6380 the group, since it always exists.
6381 DR_GROUP_GAP is the number of steps in elements from the previous
6382 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6383 correspond to the gaps. */
6384 if (next_stmt_info != first_stmt_info
6385 && gap_count < DR_GROUP_GAP (next_stmt_info))
6387 gap_count++;
6388 continue;
6391 /* ??? The following needs cleanup after the removal of
6392 DR_GROUP_SAME_DR_STMT. */
6393 if (next_stmt_info)
6395 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6396 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6397 copies, and we put the new vector statement in the first available
6398 RELATED_STMT. */
6399 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6400 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6401 else
6403 stmt_vec_info prev_stmt_info
6404 = STMT_VINFO_VEC_STMT (next_stmt_info);
6405 stmt_vec_info rel_stmt_info
6406 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6407 while (rel_stmt_info)
6409 prev_stmt_info = rel_stmt_info;
6410 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6413 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6416 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6417 gap_count = 1;
6422 /* Function vect_force_dr_alignment_p.
6424 Returns whether the alignment of a DECL can be forced to be aligned
6425 on ALIGNMENT bit boundary. */
6427 bool
6428 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6430 if (!VAR_P (decl))
6431 return false;
6433 if (decl_in_symtab_p (decl)
6434 && !symtab_node::get (decl)->can_increase_alignment_p ())
6435 return false;
6437 if (TREE_STATIC (decl))
6438 return (known_le (alignment,
6439 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6440 else
6441 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6445 /* Return whether the data reference DR_INFO is supported with respect to its
6446 alignment.
6447 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6448 it is aligned, i.e., check if it is possible to vectorize it with different
6449 alignment. */
6451 enum dr_alignment_support
6452 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6453 bool check_aligned_accesses)
6455 data_reference *dr = dr_info->dr;
6456 stmt_vec_info stmt_info = dr_info->stmt;
6457 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6458 machine_mode mode = TYPE_MODE (vectype);
6459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6460 struct loop *vect_loop = NULL;
6461 bool nested_in_vect_loop = false;
6463 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6464 return dr_aligned;
6466 /* For now assume all conditional loads/stores support unaligned
6467 access without any special code. */
6468 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6469 if (gimple_call_internal_p (stmt)
6470 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6471 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6472 return dr_unaligned_supported;
6474 if (loop_vinfo)
6476 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6477 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6480 /* Possibly unaligned access. */
6482 /* We can choose between using the implicit realignment scheme (generating
6483 a misaligned_move stmt) and the explicit realignment scheme (generating
6484 aligned loads with a REALIGN_LOAD). There are two variants to the
6485 explicit realignment scheme: optimized, and unoptimized.
6486 We can optimize the realignment only if the step between consecutive
6487 vector loads is equal to the vector size. Since the vector memory
6488 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6489 is guaranteed that the misalignment amount remains the same throughout the
6490 execution of the vectorized loop. Therefore, we can create the
6491 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6492 at the loop preheader.
6494 However, in the case of outer-loop vectorization, when vectorizing a
6495 memory access in the inner-loop nested within the LOOP that is now being
6496 vectorized, while it is guaranteed that the misalignment of the
6497 vectorized memory access will remain the same in different outer-loop
6498 iterations, it is *not* guaranteed that is will remain the same throughout
6499 the execution of the inner-loop. This is because the inner-loop advances
6500 with the original scalar step (and not in steps of VS). If the inner-loop
6501 step happens to be a multiple of VS, then the misalignment remains fixed
6502 and we can use the optimized realignment scheme. For example:
6504 for (i=0; i<N; i++)
6505 for (j=0; j<M; j++)
6506 s += a[i+j];
6508 When vectorizing the i-loop in the above example, the step between
6509 consecutive vector loads is 1, and so the misalignment does not remain
6510 fixed across the execution of the inner-loop, and the realignment cannot
6511 be optimized (as illustrated in the following pseudo vectorized loop):
6513 for (i=0; i<N; i+=4)
6514 for (j=0; j<M; j++){
6515 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6516 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6517 // (assuming that we start from an aligned address).
6520 We therefore have to use the unoptimized realignment scheme:
6522 for (i=0; i<N; i+=4)
6523 for (j=k; j<M; j+=4)
6524 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6525 // that the misalignment of the initial address is
6526 // 0).
6528 The loop can then be vectorized as follows:
6530 for (k=0; k<4; k++){
6531 rt = get_realignment_token (&vp[k]);
6532 for (i=0; i<N; i+=4){
6533 v1 = vp[i+k];
6534 for (j=k; j<M; j+=4){
6535 v2 = vp[i+j+VS-1];
6536 va = REALIGN_LOAD <v1,v2,rt>;
6537 vs += va;
6538 v1 = v2;
6541 } */
6543 if (DR_IS_READ (dr))
6545 bool is_packed = false;
6546 tree type = (TREE_TYPE (DR_REF (dr)));
6548 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6549 && (!targetm.vectorize.builtin_mask_for_load
6550 || targetm.vectorize.builtin_mask_for_load ()))
6552 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6554 /* If we are doing SLP then the accesses need not have the
6555 same alignment, instead it depends on the SLP group size. */
6556 if (loop_vinfo
6557 && STMT_SLP_TYPE (stmt_info)
6558 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6559 * (DR_GROUP_SIZE
6560 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6561 TYPE_VECTOR_SUBPARTS (vectype)))
6563 else if (!loop_vinfo
6564 || (nested_in_vect_loop
6565 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6566 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6567 return dr_explicit_realign;
6568 else
6569 return dr_explicit_realign_optimized;
6571 if (!known_alignment_for_access_p (dr_info))
6572 is_packed = not_size_aligned (DR_REF (dr));
6574 if (targetm.vectorize.support_vector_misalignment
6575 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6576 /* Can't software pipeline the loads, but can at least do them. */
6577 return dr_unaligned_supported;
6579 else
6581 bool is_packed = false;
6582 tree type = (TREE_TYPE (DR_REF (dr)));
6584 if (!known_alignment_for_access_p (dr_info))
6585 is_packed = not_size_aligned (DR_REF (dr));
6587 if (targetm.vectorize.support_vector_misalignment
6588 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6589 return dr_unaligned_supported;
6592 /* Unsupported. */
6593 return dr_unaligned_unsupported;