1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
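/* As an illustration of the promotion case described above (a hypothetical
   loop; the names s, d and n are made up):

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   The result type of the statement is "int", but the smallest scalar type
   is "short" (the rhs of the promotion), so the short accesses are what
   determine the VF.  */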
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the last scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Compute that position and check whether the resulting order matches
216 the current one. */
217 stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
218 if (last_a)
219 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
220 s = DR_GROUP_NEXT_ELEMENT (s))
221 last_a = get_later_stmt (last_a, s);
222 else
223 last_a = stmtinfo_a;
224 stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
225 if (last_b)
226 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
227 s = DR_GROUP_NEXT_ELEMENT (s))
228 last_b = get_later_stmt (last_b, s);
229 else
230 last_b = stmtinfo_b;
231 return ((get_later_stmt (last_a, last_b) == last_a)
232 == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
235 /* A subroutine of vect_analyze_data_ref_dependence. Handle
236 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
237 distances. These distances are conservatively correct but they don't
238 reflect a guaranteed dependence.
240 Return true if this function does all the work necessary to avoid
241 an alias or false if the caller should use the dependence distances
242 to limit the vectorization factor in the usual way. LOOP_DEPTH is
243 the depth of the loop described by LOOP_VINFO and the other arguments
244 are as for vect_analyze_data_ref_dependence. */
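/* As a hedged illustration of the "conservatively correct distance" case
   (hypothetical code; the names f, p, q and n are made up):

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i + 2] + 1;
     }

   Dependence analysis may record a distance of 2, computed as if P and Q
   referred to the same object, yet if they point to unrelated objects there
   is no dependence at all.  That is exactly the situation the runtime alias
   check requested below is allowed to exploit.  */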
246 static bool
247 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
248 loop_vec_info loop_vinfo,
249 int loop_depth, unsigned int *max_vf)
251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
252 lambda_vector dist_v;
253 unsigned int i;
254 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
256 int dist = dist_v[loop_depth];
257 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
259 /* If the user asserted safelen >= DIST consecutive iterations
260 can be executed concurrently, assume independence.
262 ??? An alternative would be to add the alias check even
263 in this case, and vectorize the fallback loop with the
264 maximum VF set to safelen. However, if the user has
265 explicitly given a length, it's less likely that that
266 would be a win. */
267 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
269 if ((unsigned int) loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 continue;
275 /* For dependence distances of 2 or more, we have the option
276 of limiting VF or checking for an alias at runtime.
277 Prefer to check at runtime if we can, to avoid limiting
278 the VF unnecessarily when the bases are in fact independent.
280 Note that the alias checks will be removed if the VF ends up
281 being small enough. */
282 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
283 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
284 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
285 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
286 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
289 return true;
293 /* Function vect_analyze_data_ref_dependence.
295 FIXME: I needed to change the sense of the returned flag.
297 Return FALSE if there (might) exist a dependence between a memory-reference
298 DRA and a memory-reference DRB. When versioning for alias may check a
299 dependence at run-time, return TRUE. Adjust *MAX_VF according to
300 the data dependence. */
302 static opt_result
303 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
304 loop_vec_info loop_vinfo,
305 unsigned int *max_vf)
307 unsigned int i;
308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 struct data_reference *dra = DDR_A (ddr);
310 struct data_reference *drb = DDR_B (ddr);
311 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
312 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
313 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
314 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
315 lambda_vector dist_v;
316 unsigned int loop_depth;
318 /* In loop analysis all data references should be vectorizable. */
319 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
320 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
321 gcc_unreachable ();
323 /* Independent data accesses. */
324 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
325 return opt_result::success ();
327 if (dra == drb
328 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
329 return opt_result::success ();
331 /* We do not have to consider dependences between accesses that belong
332 to the same group, unless the stride could be smaller than the
333 group size. */
334 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
335 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
336 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
337 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
338 return opt_result::success ();
340 /* Even if we have an anti-dependence then, as the vectorized loop covers at
341 least two scalar iterations, there is always also a true dependence.
342 As the vectorizer does not re-order loads and stores we can ignore
343 the anti-dependence if TBAA can disambiguate both DRs similar to the
344 case with known negative distance anti-dependences (positive
345 distance anti-dependences would violate TBAA constraints). */
346 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
347 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
348 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
349 get_alias_set (DR_REF (drb))))
350 return opt_result::success ();
352 /* Unknown data dependence. */
353 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 355 /* If the user asserted safelen consecutive iterations can be
356 executed concurrently, assume independence. */
357 if (loop->safelen >= 2)
359 if ((unsigned int) loop->safelen < *max_vf)
360 *max_vf = loop->safelen;
361 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
362 return opt_result::success ();
365 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
366 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
367 return opt_result::failure_at
368 (stmtinfo_a->stmt,
369 "versioning for alias not supported for: "
370 "can't determine dependence between %T and %T\n",
371 DR_REF (dra), DR_REF (drb));
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
375 "versioning for alias required: "
376 "can't determine dependence between %T and %T\n",
377 DR_REF (dra), DR_REF (drb));
379 /* Add to list of ddrs that need to be tested at run-time. */
380 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 /* Known data dependence. */
384 if (DDR_NUM_DIST_VECTS (ddr) == 0)
 386 /* If the user asserted safelen consecutive iterations can be
387 executed concurrently, assume independence. */
388 if (loop->safelen >= 2)
390 if ((unsigned int) loop->safelen < *max_vf)
391 *max_vf = loop->safelen;
392 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
393 return opt_result::success ();
396 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
397 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
398 return opt_result::failure_at
399 (stmtinfo_a->stmt,
400 "versioning for alias not supported for: "
401 "bad dist vector for %T and %T\n",
402 DR_REF (dra), DR_REF (drb));
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
406 "versioning for alias required: "
407 "bad dist vector for %T and %T\n",
408 DR_REF (dra), DR_REF (drb));
409 /* Add to list of ddrs that need to be tested at run-time. */
410 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
413 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
415 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
416 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
417 loop_depth, max_vf))
418 return opt_result::success ();
420 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
422 int dist = dist_v[loop_depth];
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "dependence distance = %d.\n", dist);
428 if (dist == 0)
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_NOTE, vect_location,
432 "dependence distance == 0 between %T and %T\n",
433 DR_REF (dra), DR_REF (drb));
435 /* When we perform grouped accesses and perform implicit CSE
436 by detecting equal accesses and doing disambiguation with
437 runtime alias tests like for
438 .. = a[i];
439 .. = a[i+1];
440 a[i] = ..;
441 a[i+1] = ..;
442 *p = ..;
443 .. = a[i];
444 .. = a[i+1];
445 where we will end up loading { a[i], a[i+1] } once, make
446 sure that inserting group loads before the first load and
447 stores after the last store will do the right thing.
448 Similar for groups like
449 a[i] = ...;
450 ... = a[i];
451 a[i+1] = ...;
452 where loads from the group interleave with the store. */
453 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
454 return opt_result::failure_at (stmtinfo_a->stmt,
455 "READ_WRITE dependence"
456 " in interleaving.\n");
458 if (loop->safelen < 2)
460 tree indicator = dr_zero_step_indicator (dra);
461 if (!indicator || integer_zerop (indicator))
462 return opt_result::failure_at (stmtinfo_a->stmt,
463 "access also has a zero step\n");
464 else if (TREE_CODE (indicator) != INTEGER_CST)
465 vect_check_nonzero_value (loop_vinfo, indicator);
467 continue;
470 if (dist > 0 && DDR_REVERSED_P (ddr))
472 /* If DDR_REVERSED_P the order of the data-refs in DDR was
473 reversed (to make distance vector positive), and the actual
474 distance is negative. */
475 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "dependence distance negative.\n");
478 /* Record a negative dependence distance to later limit the
479 amount of stmt copying / unrolling we can perform.
480 Only need to handle read-after-write dependence. */
481 if (DR_IS_READ (drb)
482 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
483 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
484 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
485 continue;
488 unsigned int abs_dist = abs (dist);
489 if (abs_dist >= 2 && abs_dist < *max_vf)
491 /* The dependence distance requires reduction of the maximal
492 vectorization factor. */
493 *max_vf = abs (dist);
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "adjusting maximal vectorization factor to %i\n",
497 *max_vf);
500 if (abs_dist >= *max_vf)
502 /* Dependence distance does not create dependence, as far as
503 vectorization is concerned, in this case. */
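          /* For example (illustrative numbers): with a dependence distance of
             8 and a VF of 8, scalar iteration i and scalar iteration i + 8
             fall into different vector iterations, so the vectorized loop
             still executes the dependent accesses in their original order.  */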
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "dependence distance >= VF.\n");
507 continue;
510 return opt_result::failure_at (stmtinfo_a->stmt,
511 "not vectorized, possible dependence "
512 "between data-refs %T and %T\n",
513 DR_REF (dra), DR_REF (drb));
516 return opt_result::success ();
519 /* Function vect_analyze_data_ref_dependences.
521 Examine all the data references in the loop, and make sure there do not
522 exist any data dependences between them. Set *MAX_VF according to
523 the maximum vectorization factor the data dependences allow. */
525 opt_result
526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
527 unsigned int *max_vf)
529 unsigned int i;
530 struct data_dependence_relation *ddr;
532 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
534 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
536 LOOP_VINFO_DDRS (loop_vinfo)
537 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
538 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
539 /* We need read-read dependences to compute
540 STMT_VINFO_SAME_ALIGN_REFS. */
541 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
542 &LOOP_VINFO_DDRS (loop_vinfo),
543 LOOP_VINFO_LOOP_NEST (loop_vinfo),
544 true);
545 gcc_assert (res);
548 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
550 /* For epilogues we either have no aliases or alias versioning
 551 was applied to the original loop. Therefore we may just get max_vf
 552 using the VF of the original loop. */
553 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
554 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
555 else
556 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
558 opt_result res
559 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
560 if (!res)
561 return res;
564 return opt_result::success ();
568 /* Function vect_slp_analyze_data_ref_dependence.
 570 Return TRUE if there (might) exist a dependence between a memory-reference
 571 DRA and a memory-reference DRB for VINFO. Return FALSE if the accesses
 572 are known to be independent or the dependence is harmless (read-read,
 573 or both references belong to the same interleaving chain). */
575 static bool
576 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
577 struct data_dependence_relation *ddr)
579 struct data_reference *dra = DDR_A (ddr);
580 struct data_reference *drb = DDR_B (ddr);
581 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
582 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
584 /* We need to check dependences of statements marked as unvectorizable
 585 as well; they can still prohibit vectorization. */
587 /* Independent data accesses. */
588 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
589 return false;
591 if (dra == drb)
592 return false;
594 /* Read-read is OK. */
595 if (DR_IS_READ (dra) && DR_IS_READ (drb))
596 return false;
598 /* If dra and drb are part of the same interleaving chain consider
599 them independent. */
600 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
601 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
602 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
603 return false;
605 /* Unknown data dependence. */
606 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 "can't determine dependence between %T and %T\n",
611 DR_REF (dra), DR_REF (drb));
613 else if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE, vect_location,
615 "determined dependence between %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
618 return true;
622 /* Analyze dependences involved in the transform of SLP NODE. STORES
623 contain the vector of scalar stores of this instance if we are
624 disambiguating the loads. */
626 static bool
627 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
628 vec<stmt_vec_info> stores,
629 stmt_vec_info last_store_info)
631 /* This walks over all stmts involved in the SLP load/store done
632 in NODE verifying we can sink them up to the last stmt in the
633 group. */
634 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
635 vec_info *vinfo = last_access_info->vinfo;
636 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
638 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
639 if (access_info == last_access_info)
640 continue;
641 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
642 ao_ref ref;
643 bool ref_initialized_p = false;
644 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
645 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
647 gimple *stmt = gsi_stmt (gsi);
648 if (! gimple_vuse (stmt)
649 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
650 continue;
652 /* If we couldn't record a (single) data reference for this
653 stmt we have to resort to the alias oracle. */
654 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
655 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
656 if (!dr_b)
658 /* We are moving a store or sinking a load - this means
659 we cannot use TBAA for disambiguation. */
660 if (!ref_initialized_p)
661 ao_ref_init (&ref, DR_REF (dr_a));
662 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
663 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
664 return false;
665 continue;
668 bool dependent = false;
669 /* If we run into a store of this same instance (we've just
670 marked those) then delay dependence checking until we run
671 into the last store because this is where it will have
672 been sunk to (and we verify if we can do that as well). */
673 if (gimple_visited_p (stmt))
675 if (stmt_info != last_store_info)
676 continue;
677 unsigned i;
678 stmt_vec_info store_info;
679 FOR_EACH_VEC_ELT (stores, i, store_info)
681 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
682 ddr_p ddr = initialize_data_dependence_relation
683 (dr_a, store_dr, vNULL);
684 dependent
685 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
687 if (dependent)
688 break;
691 else
693 ddr_p ddr = initialize_data_dependence_relation (dr_a,
694 dr_b, vNULL);
695 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
696 free_dependence_relation (ddr);
698 if (dependent)
699 return false;
702 return true;
 706 /* Function vect_slp_analyze_instance_dependence.
 708 Examine all the data references in the SLP instance, and make sure there
 709 do not exist any data dependences between them that would prevent the
 710 instance from being vectorized. */
712 bool
713 vect_slp_analyze_instance_dependence (slp_instance instance)
715 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
717 /* The stores of this instance are at the root of the SLP tree. */
718 slp_tree store = SLP_INSTANCE_TREE (instance);
719 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
720 store = NULL;
722 /* Verify we can sink stores to the vectorized stmt insert location. */
723 stmt_vec_info last_store_info = NULL;
724 if (store)
726 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
727 return false;
729 /* Mark stores in this instance and remember the last one. */
730 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
731 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
732 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
735 bool res = true;
737 /* Verify we can sink loads to the vectorized stmt insert location,
738 special-casing stores of this instance. */
739 slp_tree load;
740 unsigned int i;
741 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
742 if (! vect_slp_analyze_node_dependences (instance, load,
743 store
744 ? SLP_TREE_SCALAR_STMTS (store)
745 : vNULL, last_store_info))
747 res = false;
748 break;
751 /* Unset the visited flag. */
752 if (store)
753 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
754 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
756 return res;
759 /* Record the base alignment guarantee given by DRB, which occurs
760 in STMT_INFO. */
762 static void
763 vect_record_base_alignment (stmt_vec_info stmt_info,
764 innermost_loop_behavior *drb)
766 vec_info *vinfo = stmt_info->vinfo;
767 bool existed;
768 innermost_loop_behavior *&entry
769 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
770 if (!existed || entry->base_alignment < drb->base_alignment)
772 entry = drb;
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "recording new base alignment for %T\n"
776 " alignment: %d\n"
777 " misalignment: %d\n"
778 " based on: %G",
779 drb->base_address,
780 drb->base_alignment,
781 drb->base_misalignment,
782 stmt_info->stmt);
786 /* If the region we're going to vectorize is reached, all unconditional
787 data references occur at least once. We can therefore pool the base
788 alignment guarantees from each unconditional reference. Do this by
789 going through all the data references in VINFO and checking whether
790 the containing statement makes the reference unconditionally. If so,
791 record the alignment of the base address in VINFO so that it can be
792 used for all other references with the same base. */
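/* For example (a hypothetical situation, not taken from the surrounding
   code): if the region contains an unconditional load from a[i] whose
   recorded base alignment is 32 bytes, that 32-byte guarantee can be reused
   when analysing a conditional store to a[i + 1] that shares the same base
   address, even though the conditional store itself contributes no
   guarantee.  */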
794 void
795 vect_record_base_alignments (vec_info *vinfo)
797 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
798 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
799 data_reference *dr;
800 unsigned int i;
801 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
803 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
804 stmt_vec_info stmt_info = dr_info->stmt;
805 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
806 && STMT_VINFO_VECTORIZABLE (stmt_info)
807 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
809 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
811 /* If DR is nested in the loop that is being vectorized, we can also
812 record the alignment of the base wrt the outer loop. */
813 if (loop && nested_in_vect_loop_p (loop, stmt_info))
814 vect_record_base_alignment
815 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
820 /* Return the target alignment for the vectorized form of DR_INFO. */
822 static unsigned int
823 vect_calculate_target_alignment (dr_vec_info *dr_info)
825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
826 return targetm.vectorize.preferred_vector_alignment (vectype);
829 /* Function vect_compute_data_ref_alignment
831 Compute the misalignment of the data reference DR_INFO.
833 Output:
834 1. DR_MISALIGNMENT (DR_INFO) is defined.
836 FOR NOW: No analysis is actually performed. Misalignment is calculated
837 only for trivial cases. TODO. */
839 static void
840 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
842 stmt_vec_info stmt_info = dr_info->stmt;
843 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
844 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
845 struct loop *loop = NULL;
846 tree ref = DR_REF (dr_info->dr);
847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "vect_compute_data_ref_alignment:\n");
853 if (loop_vinfo)
854 loop = LOOP_VINFO_LOOP (loop_vinfo);
856 /* Initialize misalignment to unknown. */
857 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
859 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
860 return;
862 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
863 bool step_preserves_misalignment_p;
865 unsigned HOST_WIDE_INT vector_alignment
866 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
867 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
869 /* No step for BB vectorization. */
870 if (!loop)
872 gcc_assert (integer_zerop (drb->step));
873 step_preserves_misalignment_p = true;
876 /* In case the dataref is in an inner-loop of the loop that is being
877 vectorized (LOOP), we use the base and misalignment information
878 relative to the outer-loop (LOOP). This is ok only if the misalignment
879 stays the same throughout the execution of the inner-loop, which is why
880 we have to check that the stride of the dataref in the inner-loop evenly
881 divides by the vector alignment. */
882 else if (nested_in_vect_loop_p (loop, stmt_info))
884 step_preserves_misalignment_p
885 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
887 if (dump_enabled_p ())
889 if (step_preserves_misalignment_p)
890 dump_printf_loc (MSG_NOTE, vect_location,
891 "inner step divides the vector alignment.\n");
892 else
893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
894 "inner step doesn't divide the vector"
895 " alignment.\n");
899 /* Similarly we can only use base and misalignment information relative to
900 an innermost loop if the misalignment stays the same throughout the
901 execution of the loop. As above, this is the case if the stride of
902 the dataref evenly divides by the alignment. */
903 else
905 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
906 step_preserves_misalignment_p
907 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
909 if (!step_preserves_misalignment_p && dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "step doesn't divide the vector alignment.\n");
914 unsigned int base_alignment = drb->base_alignment;
915 unsigned int base_misalignment = drb->base_misalignment;
917 /* Calculate the maximum of the pooled base address alignment and the
918 alignment that we can compute for DR itself. */
919 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
920 if (entry && base_alignment < (*entry)->base_alignment)
922 base_alignment = (*entry)->base_alignment;
923 base_misalignment = (*entry)->base_misalignment;
926 if (drb->offset_alignment < vector_alignment
927 || !step_preserves_misalignment_p
928 /* We need to know whether the step wrt the vectorized loop is
929 negative when computing the starting misalignment below. */
930 || TREE_CODE (drb->step) != INTEGER_CST)
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
934 "Unknown alignment for access: %T\n", ref);
935 return;
938 if (base_alignment < vector_alignment)
940 unsigned int max_alignment;
941 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
942 if (max_alignment < vector_alignment
943 || !vect_can_force_dr_alignment_p (base,
944 vector_alignment * BITS_PER_UNIT))
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location,
948 "can't force alignment of ref: %T\n", ref);
949 return;
952 /* Force the alignment of the decl.
953 NOTE: This is the only change to the code we make during
954 the analysis phase, before deciding to vectorize the loop. */
955 if (dump_enabled_p ())
956 dump_printf_loc (MSG_NOTE, vect_location,
957 "force alignment of %T\n", ref);
959 dr_info->base_decl = base;
960 dr_info->base_misaligned = true;
961 base_misalignment = 0;
963 poly_int64 misalignment
964 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
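  /* A worked example with illustrative numbers: with a 16-byte vector
     alignment, a base misalignment of 4 and a constant DR_INIT of 24, the
     value computed above is 4 + 24 = 28, which known_misalignment below
     reduces to 28 % 16 = 12 bytes (assuming a forward, positive-step access
     so that the backward-step adjustment just below does not apply).  */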
 966 /* If this is a backward running DR then the first access in the larger
 967 vectype actually is N-1 elements before the address in the DR.
968 Adjust misalign accordingly. */
969 if (tree_int_cst_sgn (drb->step) < 0)
970 /* PLUS because STEP is negative. */
971 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
972 * TREE_INT_CST_LOW (drb->step));
974 unsigned int const_misalignment;
975 if (!known_misalignment (misalignment, vector_alignment,
976 &const_misalignment))
978 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "Non-constant misalignment for access: %T\n", ref);
981 return;
984 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
988 "misalign = %d bytes of ref %T\n",
989 DR_MISALIGNMENT (dr_info), ref);
991 return;
994 /* Function vect_update_misalignment_for_peel.
995 Sets DR_INFO's misalignment
996 - to 0 if it has the same alignment as DR_PEEL_INFO,
 997 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
998 - to -1 (unknown) otherwise.
1000 DR_INFO - the data reference whose misalignment is to be adjusted.
1001 DR_PEEL_INFO - the data reference whose misalignment is being made
1002 zero in the vector loop by the peel.
1003 NPEEL - the number of iterations in the peel loop if the misalignment
1004 of DR_PEEL_INFO is known at compile time. */
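/* A worked example with made-up numbers for the "both misalignments known"
   path below: with a target alignment of 16 bytes, an element size of 4 and
   NPEEL = 3, a data reference that starts out misaligned by 8 bytes ends up
   (8 + 3 * 4) & 15 = 4 bytes misaligned after the peel, while one that
   starts out misaligned by 4 bytes ends up (4 + 3 * 4) & 15 = 0, i.e.
   aligned.  */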
1006 static void
1007 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1008 dr_vec_info *dr_peel_info, int npeel)
1010 unsigned int i;
1011 vec<dr_p> same_aligned_drs;
1012 struct data_reference *current_dr;
1013 int dr_size = vect_get_scalar_dr_size (dr_info);
1014 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1015 stmt_vec_info stmt_info = dr_info->stmt;
1016 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1018 /* For interleaved data accesses the step in the loop must be multiplied by
1019 the size of the interleaving group. */
1020 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1021 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1022 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1023 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1025 /* It can be assumed that the data refs with the same alignment as dr_peel
1026 are aligned in the vector loop. */
1027 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1028 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1030 if (current_dr != dr_info->dr)
1031 continue;
1032 gcc_assert (!known_alignment_for_access_p (dr_info)
1033 || !known_alignment_for_access_p (dr_peel_info)
1034 || (DR_MISALIGNMENT (dr_info) / dr_size
1035 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1036 SET_DR_MISALIGNMENT (dr_info, 0);
1037 return;
1040 if (known_alignment_for_access_p (dr_info)
1041 && known_alignment_for_access_p (dr_peel_info))
1043 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1044 size_zero_node) < 0;
1045 int misal = DR_MISALIGNMENT (dr_info);
1046 misal += negative ? -npeel * dr_size : npeel * dr_size;
1047 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1048 SET_DR_MISALIGNMENT (dr_info, misal);
1049 return;
1052 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1054 "to unknown (-1).\n");
1055 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1059 /* Function verify_data_ref_alignment
1061 Return TRUE if DR_INFO can be handled with respect to alignment. */
1063 static opt_result
1064 verify_data_ref_alignment (dr_vec_info *dr_info)
1066 enum dr_alignment_support supportable_dr_alignment
1067 = vect_supportable_dr_alignment (dr_info, false);
1068 if (!supportable_dr_alignment)
1069 return opt_result::failure_at
1070 (dr_info->stmt->stmt,
1071 DR_IS_READ (dr_info->dr)
1072 ? "not vectorized: unsupported unaligned load: %T\n"
1073 : "not vectorized: unsupported unaligned store: %T\n",
1074 DR_REF (dr_info->dr));
1076 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1077 dump_printf_loc (MSG_NOTE, vect_location,
1078 "Vectorizing an unaligned access.\n");
1080 return opt_result::success ();
1083 /* Function vect_verify_datarefs_alignment
1085 Return TRUE if all data references in the loop can be
1086 handled with respect to alignment. */
1088 opt_result
1089 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1091 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1092 struct data_reference *dr;
1093 unsigned int i;
1095 FOR_EACH_VEC_ELT (datarefs, i, dr)
1097 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1098 stmt_vec_info stmt_info = dr_info->stmt;
1100 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1101 continue;
1103 /* For interleaving, only the alignment of the first access matters. */
1104 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1105 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1106 continue;
1108 /* Strided accesses perform only component accesses, alignment is
1109 irrelevant for them. */
1110 if (STMT_VINFO_STRIDED_P (stmt_info)
1111 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1112 continue;
1114 opt_result res = verify_data_ref_alignment (dr_info);
1115 if (!res)
1116 return res;
1119 return opt_result::success ();
 1122 /* Given a memory reference EXP, return whether its alignment is less
1123 than its size. */
1125 static bool
1126 not_size_aligned (tree exp)
1128 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1129 return true;
1131 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1132 > get_object_alignment (exp));
1135 /* Function vector_alignment_reachable_p
1137 Return true if vector alignment for DR_INFO is reachable by peeling
1138 a few loop iterations. Return false otherwise. */
1140 static bool
1141 vector_alignment_reachable_p (dr_vec_info *dr_info)
1143 stmt_vec_info stmt_info = dr_info->stmt;
1144 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1146 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1148 /* For interleaved accesses we peel only if the number of iterations in
 1149 the prolog loop ({VF - misalignment}) is a multiple of the
 1150 number of interleaved accesses.
1151 int elem_size, mis_in_elements;
1153 /* FORNOW: handle only known alignment. */
1154 if (!known_alignment_for_access_p (dr_info))
1155 return false;
1157 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1158 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1159 elem_size = vector_element_size (vector_size, nelements);
1160 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1162 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1163 return false;
 1166 /* If the misalignment is known at compile time then allow peeling
1167 only if natural alignment is reachable through peeling. */
1168 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1170 HOST_WIDE_INT elmsize =
1171 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1172 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_NOTE, vect_location,
1175 "data size = %wd. misalignment = %d.\n", elmsize,
1176 DR_MISALIGNMENT (dr_info));
1178 if (DR_MISALIGNMENT (dr_info) % elmsize)
1180 if (dump_enabled_p ())
1181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1182 "data size does not divide the misalignment.\n");
1183 return false;
1187 if (!known_alignment_for_access_p (dr_info))
1189 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1190 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1191 if (dump_enabled_p ())
1192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 "Unknown misalignment, %snaturally aligned\n",
1194 is_packed ? "not " : "");
1195 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1198 return true;
1202 /* Calculate the cost of the memory access represented by DR_INFO. */
1204 static void
1205 vect_get_data_access_cost (dr_vec_info *dr_info,
1206 unsigned int *inside_cost,
1207 unsigned int *outside_cost,
1208 stmt_vector_for_cost *body_cost_vec,
1209 stmt_vector_for_cost *prologue_cost_vec)
1211 stmt_vec_info stmt_info = dr_info->stmt;
1212 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1213 int ncopies;
1215 if (PURE_SLP_STMT (stmt_info))
1216 ncopies = 1;
1217 else
1218 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1220 if (DR_IS_READ (dr_info->dr))
1221 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1222 prologue_cost_vec, body_cost_vec, false);
1223 else
1224 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "vect_get_data_access_cost: inside_cost = %d, "
1229 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1233 typedef struct _vect_peel_info
1235 dr_vec_info *dr_info;
1236 int npeel;
1237 unsigned int count;
1238 } *vect_peel_info;
1240 typedef struct _vect_peel_extended_info
1242 struct _vect_peel_info peel_info;
1243 unsigned int inside_cost;
1244 unsigned int outside_cost;
1245 } *vect_peel_extended_info;
1248 /* Peeling hashtable helpers. */
1250 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1252 static inline hashval_t hash (const _vect_peel_info *);
1253 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1256 inline hashval_t
1257 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1259 return (hashval_t) peel_info->npeel;
1262 inline bool
1263 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1265 return (a->npeel == b->npeel);
1269 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1271 static void
1272 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1273 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1274 int npeel)
1276 struct _vect_peel_info elem, *slot;
1277 _vect_peel_info **new_slot;
1278 bool supportable_dr_alignment
1279 = vect_supportable_dr_alignment (dr_info, true);
1281 elem.npeel = npeel;
1282 slot = peeling_htab->find (&elem);
1283 if (slot)
1284 slot->count++;
1285 else
1287 slot = XNEW (struct _vect_peel_info);
1288 slot->npeel = npeel;
1289 slot->dr_info = dr_info;
1290 slot->count = 1;
1291 new_slot = peeling_htab->find_slot (slot, INSERT);
1292 *new_slot = slot;
1295 if (!supportable_dr_alignment
1296 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1297 slot->count += VECT_MAX_COST;
1301 /* Traverse peeling hash table to find peeling option that aligns maximum
1302 number of data accesses. */
1305 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1306 _vect_peel_extended_info *max)
1308 vect_peel_info elem = *slot;
1310 if (elem->count > max->peel_info.count
1311 || (elem->count == max->peel_info.count
1312 && max->peel_info.npeel > elem->npeel))
1314 max->peel_info.npeel = elem->npeel;
1315 max->peel_info.count = elem->count;
1316 max->peel_info.dr_info = elem->dr_info;
1319 return 1;
1322 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1323 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1324 we assume DR0_INFO's misalignment will be zero after peeling. */
1326 static void
1327 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1328 dr_vec_info *dr0_info,
1329 unsigned int *inside_cost,
1330 unsigned int *outside_cost,
1331 stmt_vector_for_cost *body_cost_vec,
1332 stmt_vector_for_cost *prologue_cost_vec,
1333 unsigned int npeel,
1334 bool unknown_misalignment)
1336 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1337 unsigned i;
1338 data_reference *dr;
1340 FOR_EACH_VEC_ELT (datarefs, i, dr)
1342 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1343 stmt_vec_info stmt_info = dr_info->stmt;
1344 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1345 continue;
1347 /* For interleaving, only the alignment of the first access
1348 matters. */
1349 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1350 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1351 continue;
1353 /* Strided accesses perform only component accesses, alignment is
1354 irrelevant for them. */
1355 if (STMT_VINFO_STRIDED_P (stmt_info)
1356 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1357 continue;
1359 int save_misalignment;
1360 save_misalignment = DR_MISALIGNMENT (dr_info);
1361 if (npeel == 0)
1363 else if (unknown_misalignment && dr_info == dr0_info)
1364 SET_DR_MISALIGNMENT (dr_info, 0);
1365 else
1366 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1367 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1368 body_cost_vec, prologue_cost_vec);
1369 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1373 /* Traverse peeling hash table and calculate cost for each peeling option.
1374 Find the one with the lowest cost. */
1377 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1378 _vect_peel_extended_info *min)
1380 vect_peel_info elem = *slot;
1381 int dummy;
1382 unsigned int inside_cost = 0, outside_cost = 0;
1383 stmt_vec_info stmt_info = elem->dr_info->stmt;
1384 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1385 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1386 epilogue_cost_vec;
1388 prologue_cost_vec.create (2);
1389 body_cost_vec.create (2);
1390 epilogue_cost_vec.create (2);
1392 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1393 &outside_cost, &body_cost_vec,
1394 &prologue_cost_vec, elem->npeel, false);
1396 body_cost_vec.release ();
1398 outside_cost += vect_get_known_peeling_cost
1399 (loop_vinfo, elem->npeel, &dummy,
1400 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1401 &prologue_cost_vec, &epilogue_cost_vec);
1403 /* Prologue and epilogue costs are added to the target model later.
1404 These costs depend only on the scalar iteration cost, the
1405 number of peeling iterations finally chosen, and the number of
1406 misaligned statements. So discard the information found here. */
1407 prologue_cost_vec.release ();
1408 epilogue_cost_vec.release ();
1410 if (inside_cost < min->inside_cost
1411 || (inside_cost == min->inside_cost
1412 && outside_cost < min->outside_cost))
1414 min->inside_cost = inside_cost;
1415 min->outside_cost = outside_cost;
1416 min->peel_info.dr_info = elem->dr_info;
1417 min->peel_info.npeel = elem->npeel;
1418 min->peel_info.count = elem->count;
1421 return 1;
1425 /* Choose best peeling option by traversing peeling hash table and either
1426 choosing an option with the lowest cost (if cost model is enabled) or the
1427 option that aligns as many accesses as possible. */
1429 static struct _vect_peel_extended_info
1430 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1431 loop_vec_info loop_vinfo)
1433 struct _vect_peel_extended_info res;
1435 res.peel_info.dr_info = NULL;
1437 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1439 res.inside_cost = INT_MAX;
1440 res.outside_cost = INT_MAX;
1441 peeling_htab->traverse <_vect_peel_extended_info *,
1442 vect_peeling_hash_get_lowest_cost> (&res);
1444 else
1446 res.peel_info.count = 0;
1447 peeling_htab->traverse <_vect_peel_extended_info *,
1448 vect_peeling_hash_get_most_frequent> (&res);
1449 res.inside_cost = 0;
1450 res.outside_cost = 0;
1453 return res;
1456 /* Return true if the new peeling NPEEL is supported. */
1458 static bool
1459 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1460 unsigned npeel)
1462 unsigned i;
1463 struct data_reference *dr = NULL;
1464 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1465 enum dr_alignment_support supportable_dr_alignment;
1467 /* Ensure that all data refs can be vectorized after the peel. */
1468 FOR_EACH_VEC_ELT (datarefs, i, dr)
1470 int save_misalignment;
1472 if (dr == dr0_info->dr)
1473 continue;
1475 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1476 stmt_vec_info stmt_info = dr_info->stmt;
1477 /* For interleaving, only the alignment of the first access
1478 matters. */
1479 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1480 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1481 continue;
1483 /* Strided accesses perform only component accesses, alignment is
1484 irrelevant for them. */
1485 if (STMT_VINFO_STRIDED_P (stmt_info)
1486 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1487 continue;
1489 save_misalignment = DR_MISALIGNMENT (dr_info);
1490 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1491 supportable_dr_alignment
1492 = vect_supportable_dr_alignment (dr_info, false);
1493 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1495 if (!supportable_dr_alignment)
1496 return false;
1499 return true;
1502 /* Function vect_enhance_data_refs_alignment
1504 This pass will use loop versioning and loop peeling in order to enhance
1505 the alignment of data references in the loop.
1507 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1508 original loop is to be vectorized. Any other loops that are created by
 1509 the transformations performed in this pass are not supposed to be
1510 vectorized. This restriction will be relaxed.
1512 This pass will require a cost model to guide it whether to apply peeling
1513 or versioning or a combination of the two. For example, the scheme that
 1514 Intel uses when given a loop with several memory accesses, is as follows:
 1515 choose one memory access ('p') whose alignment you want to force by doing
1516 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1517 other accesses are not necessarily aligned, or (2) use loop versioning to
1518 generate one loop in which all accesses are aligned, and another loop in
1519 which only 'p' is necessarily aligned.
1521 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1522 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1523 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1525 Devising a cost model is the most critical aspect of this work. It will
1526 guide us on which access to peel for, whether to use loop versioning, how
1527 many versions to create, etc. The cost model will probably consist of
1528 generic considerations as well as target specific considerations (on
1529 powerpc for example, misaligned stores are more painful than misaligned
1530 loads).
1532 Here are the general steps involved in alignment enhancements:
1534 -- original loop, before alignment analysis:
1535 for (i=0; i<N; i++){
1536 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1537 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1540 -- After vect_compute_data_refs_alignment:
1541 for (i=0; i<N; i++){
1542 x = q[i]; # DR_MISALIGNMENT(q) = 3
1543 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1546 -- Possibility 1: we do loop versioning:
1547 if (p is aligned) {
1548 for (i=0; i<N; i++){ # loop 1A
1549 x = q[i]; # DR_MISALIGNMENT(q) = 3
1550 p[i] = y; # DR_MISALIGNMENT(p) = 0
1553 else {
1554 for (i=0; i<N; i++){ # loop 1B
1555 x = q[i]; # DR_MISALIGNMENT(q) = 3
1556 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1560 -- Possibility 2: we do loop peeling:
1561 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1562 x = q[i];
1563 p[i] = y;
1565 for (i = 3; i < N; i++){ # loop 2A
1566 x = q[i]; # DR_MISALIGNMENT(q) = 0
1567 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1570 -- Possibility 3: combination of loop peeling and versioning:
1571 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1572 x = q[i];
1573 p[i] = y;
1575 if (p is aligned) {
1576 for (i = 3; i<N; i++){ # loop 3A
1577 x = q[i]; # DR_MISALIGNMENT(q) = 0
1578 p[i] = y; # DR_MISALIGNMENT(p) = 0
1581 else {
1582 for (i = 3; i<N; i++){ # loop 3B
1583 x = q[i]; # DR_MISALIGNMENT(q) = 0
1584 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1588 These loops are later passed to loop_transform to be vectorized. The
1589 vectorizer will use the alignment information to guide the transformation
1590 (whether to generate regular loads/stores, or with special handling for
1591 misalignment). */
1593 opt_result
1594 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1596 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1597 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1598 enum dr_alignment_support supportable_dr_alignment;
1599 dr_vec_info *first_store = NULL;
1600 dr_vec_info *dr0_info = NULL;
1601 struct data_reference *dr;
1602 unsigned int i, j;
1603 bool do_peeling = false;
1604 bool do_versioning = false;
1605 unsigned int npeel = 0;
1606 bool one_misalignment_known = false;
1607 bool one_misalignment_unknown = false;
1608 bool one_dr_unsupportable = false;
1609 dr_vec_info *unsupportable_dr_info = NULL;
1610 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1611 unsigned possible_npeel_number = 1;
1612 tree vectype;
1613 unsigned int mis, same_align_drs_max = 0;
1614 hash_table<peel_info_hasher> peeling_htab (1);
1616 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1618 /* Reset data so we can safely be called multiple times. */
1619 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1620 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1622 /* While cost model enhancements are expected in the future, the high level
1623 view of the code at this time is as follows:
1625 A) If there is a misaligned access then see if peeling to align
1626 this access can make all data references satisfy
1627 vect_supportable_dr_alignment. If so, update data structures
1628 as needed and return true.
1630 B) If peeling wasn't possible and there is a data reference with an
1631 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1632 then see if loop versioning checks can be used to make all data
1633 references satisfy vect_supportable_dr_alignment. If so, update
1634 data structures as needed and return true.
1636 C) If neither peeling nor versioning were successful then return false if
1637 any data reference does not satisfy vect_supportable_dr_alignment.
1639 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1641 Note, Possibility 3 above (which is peeling and versioning together) is not
1642 being done at this time. */
1644 /* (1) Peeling to force alignment. */
1646 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1647 Considerations:
1648 + How many accesses will become aligned due to the peeling
1649 - How many accesses will become unaligned due to the peeling,
1650 and the cost of misaligned accesses.
1651 - The cost of peeling (the extra runtime checks, the increase
1652 in code size). */
1654 FOR_EACH_VEC_ELT (datarefs, i, dr)
1656 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1657 stmt_vec_info stmt_info = dr_info->stmt;
1659 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1660 continue;
1662 /* For interleaving, only the alignment of the first access
1663 matters. */
1664 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1665 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1666 continue;
1668 /* For scatter-gather or invariant accesses there is nothing
1669 to enhance. */
1670 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1671 || integer_zerop (DR_STEP (dr)))
1672 continue;
1674 /* Strided accesses perform only component accesses, alignment is
1675 irrelevant for them. */
1676 if (STMT_VINFO_STRIDED_P (stmt_info)
1677 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1678 continue;
1680 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1681 do_peeling = vector_alignment_reachable_p (dr_info);
1682 if (do_peeling)
1684 if (known_alignment_for_access_p (dr_info))
1686 unsigned int npeel_tmp = 0;
1687 bool negative = tree_int_cst_compare (DR_STEP (dr),
1688 size_zero_node) < 0;
1690 vectype = STMT_VINFO_VECTYPE (stmt_info);
1691 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1692 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1693 mis = (negative
1694 ? DR_MISALIGNMENT (dr_info)
1695 : -DR_MISALIGNMENT (dr_info));
1696 if (DR_MISALIGNMENT (dr_info) != 0)
1697 npeel_tmp = (mis & (target_align - 1)) / dr_size;
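              /* A worked example with illustrative numbers: for a forward
                 access with DR_MISALIGNMENT of 4 bytes, TARGET_ALIGN of 16
                 and DR_SIZE of 4, MIS is -4, so NPEEL_TMP becomes
                 (-4 & 15) / 4 = 12 / 4 = 3: peeling three scalar iterations
                 moves the access to a 16-byte boundary.  */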
1699 /* For multiple types, it is possible that the bigger type access
1700 will have more than one peeling option. E.g., a loop with two
1701 types: one of size (vector size / 4), and the other one of
 1702 size (vector size / 8). The vectorization factor will be 8. If both
 1703 accesses are misaligned by 3, the first one needs one scalar
 1704 iteration to be aligned, and the second one needs 5. But the
 1705 first one will also be aligned by peeling 5 scalar
1706 iterations, and in that case both accesses will be aligned.
1707 Hence, except for the immediate peeling amount, we also want
1708 to try to add full vector size, while we don't exceed
1709 vectorization factor.
1710 We do this automatically for cost model, since we calculate
1711 cost for every peeling option. */
1712 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1714 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1715 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1716 possible_npeel_number
1717 = vect_get_num_vectors (nscalars, vectype);
1719 /* NPEEL_TMP is 0 when there is no misalignment, but also
1720 allow peeling NELEMENTS. */
1721 if (DR_MISALIGNMENT (dr_info) == 0)
1722 possible_npeel_number++;
1725 /* Save info about DR in the hash table. Also include peeling
1726 amounts according to the explanation above. */
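/* For instance, assuming 4-byte elements, a 16-byte target alignment
and an initial NPEEL_TMP of 1, the loop below records the candidate
peeling amounts 1, 5, 9, ... until POSSIBLE_NPEEL_NUMBER entries
have been added. */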
1727 for (j = 0; j < possible_npeel_number; j++)
1729 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1730 dr_info, npeel_tmp);
1731 npeel_tmp += target_align / dr_size;
1734 one_misalignment_known = true;
1736 else
1738 /* If we don't know any misalignment values, we prefer
1739 peeling for the data-ref that has the maximum number of
1740 data-refs with the same alignment, unless the target prefers
1741 to align stores over loads. */
1742 unsigned same_align_drs
1743 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1744 if (!dr0_info
1745 || same_align_drs_max < same_align_drs)
1747 same_align_drs_max = same_align_drs;
1748 dr0_info = dr_info;
1750 /* For data-refs with the same number of related
1751 accesses prefer the one where the misalign
1752 computation will be invariant in the outermost loop. */
1753 else if (same_align_drs_max == same_align_drs)
1755 struct loop *ivloop0, *ivloop;
1756 ivloop0 = outermost_invariant_loop_for_expr
1757 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1758 ivloop = outermost_invariant_loop_for_expr
1759 (loop, DR_BASE_ADDRESS (dr));
1760 if ((ivloop && !ivloop0)
1761 || (ivloop && ivloop0
1762 && flow_loop_nested_p (ivloop, ivloop0)))
1763 dr0_info = dr_info;
1766 one_misalignment_unknown = true;
1768 /* Check for data refs with unsupportable alignment that
1769 can be peeled. */
1770 if (!supportable_dr_alignment)
1772 one_dr_unsupportable = true;
1773 unsupportable_dr_info = dr_info;
1776 if (!first_store && DR_IS_WRITE (dr))
1777 first_store = dr_info;
1780 else
1782 if (!aligned_access_p (dr_info))
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1786 "vector alignment may not be reachable\n");
1787 break;
1792 /* Check if we can possibly peel the loop. */
1793 if (!vect_can_advance_ivs_p (loop_vinfo)
1794 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1795 || loop->inner)
1796 do_peeling = false;
1798 struct _vect_peel_extended_info peel_for_known_alignment;
1799 struct _vect_peel_extended_info peel_for_unknown_alignment;
1800 struct _vect_peel_extended_info best_peel;
1802 peel_for_unknown_alignment.inside_cost = INT_MAX;
1803 peel_for_unknown_alignment.outside_cost = INT_MAX;
1804 peel_for_unknown_alignment.peel_info.count = 0;
1806 if (do_peeling
1807 && one_misalignment_unknown)
1809 /* Check whether the target prefers stores over loads, i.e., whether
1810 misaligned stores are more expensive than misaligned loads (taking
1811 drs with the same alignment into account). */
1812 unsigned int load_inside_cost = 0;
1813 unsigned int load_outside_cost = 0;
1814 unsigned int store_inside_cost = 0;
1815 unsigned int store_outside_cost = 0;
1816 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1818 stmt_vector_for_cost dummy;
1819 dummy.create (2);
1820 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1821 &load_inside_cost,
1822 &load_outside_cost,
1823 &dummy, &dummy, estimated_npeels, true);
1824 dummy.release ();
1826 if (first_store)
1828 dummy.create (2);
1829 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1830 &store_inside_cost,
1831 &store_outside_cost,
1832 &dummy, &dummy,
1833 estimated_npeels, true);
1834 dummy.release ();
1836 else
1838 store_inside_cost = INT_MAX;
1839 store_outside_cost = INT_MAX;
1842 if (load_inside_cost > store_inside_cost
1843 || (load_inside_cost == store_inside_cost
1844 && load_outside_cost > store_outside_cost))
1846 dr0_info = first_store;
1847 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1848 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1850 else
1852 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1853 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1856 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1857 prologue_cost_vec.create (2);
1858 epilogue_cost_vec.create (2);
1860 int dummy2;
1861 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1862 (loop_vinfo, estimated_npeels, &dummy2,
1863 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1864 &prologue_cost_vec, &epilogue_cost_vec);
1866 prologue_cost_vec.release ();
1867 epilogue_cost_vec.release ();
1869 peel_for_unknown_alignment.peel_info.count = 1
1870 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1873 peel_for_unknown_alignment.peel_info.npeel = 0;
1874 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1876 best_peel = peel_for_unknown_alignment;
1878 peel_for_known_alignment.inside_cost = INT_MAX;
1879 peel_for_known_alignment.outside_cost = INT_MAX;
1880 peel_for_known_alignment.peel_info.count = 0;
1881 peel_for_known_alignment.peel_info.dr_info = NULL;
1883 if (do_peeling && one_misalignment_known)
1885 /* Peeling is possible, but there is no data access that is not supported
1886 unless aligned. So we try to choose the best possible peeling from
1887 the hash table. */
1888 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1889 (&peeling_htab, loop_vinfo);
1892 /* Compare costs of peeling for known and unknown alignment. */
1893 if (peel_for_known_alignment.peel_info.dr_info != NULL
1894 && peel_for_unknown_alignment.inside_cost
1895 >= peel_for_known_alignment.inside_cost)
1897 best_peel = peel_for_known_alignment;
1899 /* If the best peeling for known alignment has NPEEL == 0, perform no
1900 peeling at all except if there is an unsupportable dr that we can
1901 align. */
1902 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1903 do_peeling = false;
1906 /* If there is an unsupportable data ref, prefer this over all choices so far
1907 since we'd have to discard a chosen peeling except when it accidentally
1908 aligned the unsupportable data ref. */
1909 if (one_dr_unsupportable)
1910 dr0_info = unsupportable_dr_info;
1911 else if (do_peeling)
1913 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1914 TODO: Use nopeel_outside_cost or get rid of it? */
1915 unsigned nopeel_inside_cost = 0;
1916 unsigned nopeel_outside_cost = 0;
1918 stmt_vector_for_cost dummy;
1919 dummy.create (2);
1920 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1921 &nopeel_outside_cost, &dummy, &dummy,
1922 0, false);
1923 dummy.release ();
1925 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1926 costs will be recorded. */
1927 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1928 prologue_cost_vec.create (2);
1929 epilogue_cost_vec.create (2);
1931 int dummy2;
1932 nopeel_outside_cost += vect_get_known_peeling_cost
1933 (loop_vinfo, 0, &dummy2,
1934 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1935 &prologue_cost_vec, &epilogue_cost_vec);
1937 prologue_cost_vec.release ();
1938 epilogue_cost_vec.release ();
1940 npeel = best_peel.peel_info.npeel;
1941 dr0_info = best_peel.peel_info.dr_info;
1943 /* If performing no peeling is not more expensive than the best
1944 peeling found so far, don't perform any peeling. */
1945 if (nopeel_inside_cost <= best_peel.inside_cost)
1946 do_peeling = false;
1949 if (do_peeling)
1951 stmt_vec_info stmt_info = dr0_info->stmt;
1952 vectype = STMT_VINFO_VECTYPE (stmt_info);
1954 if (known_alignment_for_access_p (dr0_info))
1956 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1957 size_zero_node) < 0;
1958 if (!npeel)
1960 /* Since it's known at compile time, compute the number of
1961 iterations in the peeled loop (the peeling factor) for use in
1962 updating DR_MISALIGNMENT values. The peeling factor is the
1963 vectorization factor minus the misalignment as an element
1964 count. */
1965 mis = (negative
1966 ? DR_MISALIGNMENT (dr0_info)
1967 : -DR_MISALIGNMENT (dr0_info));
1968 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1969 npeel = ((mis & (target_align - 1))
1970 / vect_get_scalar_dr_size (dr0_info));
1973 /* For interleaved data access every iteration accesses all the
1974 members of the group, therefore we divide the number of iterations
1975 by the group size. */
1976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1977 npeel /= DR_GROUP_SIZE (stmt_info);
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_NOTE, vect_location,
1981 "Try peeling by %d\n", npeel);
1984 /* Ensure that all datarefs can be vectorized after the peel. */
1985 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1986 do_peeling = false;
1988 /* Check if all datarefs are supportable and log. */
1989 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1991 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1992 if (!stat)
1993 do_peeling = false;
1994 else
1995 return stat;
1998 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1999 if (do_peeling)
2001 unsigned max_allowed_peel
2002 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2003 if (max_allowed_peel != (unsigned)-1)
2005 unsigned max_peel = npeel;
2006 if (max_peel == 0)
2008 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2009 max_peel = (target_align
2010 / vect_get_scalar_dr_size (dr0_info) - 1);
2012 if (max_peel > max_allowed_peel)
2014 do_peeling = false;
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "Disable peeling, max peels reached: %d\n", max_peel);
2022 /* Cost model #2 - if peeling may result in a remaining loop not
2023 iterating enough to be vectorized then do not peel. Since this
2024 is a cost heuristic rather than a correctness decision, use the
2025 most likely runtime value for variable vectorization factors. */
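/* For example, with an assumed VF of 4 and NPEEL of 0, MAX_PEEL is
4 - 1 = 3, so a known iteration count below 4 + 3 = 7 disables
peeling. */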
2026 if (do_peeling
2027 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2029 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2030 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2031 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2032 < assumed_vf + max_peel)
2033 do_peeling = false;
2036 if (do_peeling)
2038 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2039 If the misalignment of DR_i is identical to that of dr0 then set
2040 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2041 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2042 by the peeling factor times the element size of DR_i (MOD the
2043 vectorization factor times the size). Otherwise, the
2044 misalignment of DR_i must be set to unknown. */
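/* For instance, with NPEEL = 1 and 4-byte elements, a data-ref whose
misalignment was known to be 8 ends up with misalignment 8 + 1 * 4
(modulo the vectorization factor times the size), while a data-ref
with unknown misalignment stays unknown. */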
2045 FOR_EACH_VEC_ELT (datarefs, i, dr)
2046 if (dr != dr0_info->dr)
2048 /* Strided accesses perform only component accesses, alignment
2049 is irrelevant for them. */
2050 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2051 stmt_info = dr_info->stmt;
2052 if (STMT_VINFO_STRIDED_P (stmt_info)
2053 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2054 continue;
2056 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2059 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2060 if (npeel)
2061 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2062 else
2063 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2064 = DR_MISALIGNMENT (dr0_info);
2065 SET_DR_MISALIGNMENT (dr0_info, 0);
2066 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_NOTE, vect_location,
2069 "Alignment of access forced using peeling.\n");
2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "Peeling for alignment will be applied.\n");
2074 /* The inside-loop cost will be accounted for in vectorizable_load
2075 and vectorizable_store correctly with adjusted alignments.
2076 Drop the body_cst_vec on the floor here. */
2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078 gcc_assert (stat);
2079 return stat;
2083 /* (2) Versioning to force alignment. */
2085 /* Try versioning if:
2086 1) the loop is optimized for speed, and
2087 2) there is at least one unsupported misaligned data ref with an unknown
2088 misalignment, and
2089 3) all misaligned data refs with a known misalignment are supported, and
2090 4) the number of runtime alignment checks is within reason. */
2092 do_versioning =
2093 optimize_loop_nest_for_speed_p (loop)
2094 && (!loop->inner); /* FORNOW */
2096 if (do_versioning)
2098 FOR_EACH_VEC_ELT (datarefs, i, dr)
2100 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2101 stmt_vec_info stmt_info = dr_info->stmt;
2103 /* For interleaving, only the alignment of the first access
2104 matters. */
2105 if (aligned_access_p (dr_info)
2106 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2107 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2108 continue;
2110 if (STMT_VINFO_STRIDED_P (stmt_info))
2112 /* Strided loads perform only component accesses, alignment is
2113 irrelevant for them. */
2114 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115 continue;
2116 do_versioning = false;
2117 break;
2120 supportable_dr_alignment
2121 = vect_supportable_dr_alignment (dr_info, false);
2123 if (!supportable_dr_alignment)
2125 int mask;
2126 tree vectype;
2128 if (known_alignment_for_access_p (dr_info)
2129 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2130 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2132 do_versioning = false;
2133 break;
2136 vectype = STMT_VINFO_VECTYPE (stmt_info);
2137 gcc_assert (vectype);
2139 /* At present we don't support versioning for alignment
2140 with variable VF, since there's no guarantee that the
2141 VF is a power of two. We could relax this if we added
2142 a way of enforcing a power-of-two size. */
2143 unsigned HOST_WIDE_INT size;
2144 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2146 do_versioning = false;
2147 break;
2150 /* The rightmost bits of an aligned address must be zeros.
2151 Construct the mask needed for this test. For example,
2152 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2153 mask must be 15 = 0xf. */
2154 mask = size - 1;
2156 /* FORNOW: use the same mask to test all potentially unaligned
2157 references in the loop. The vectorizer currently supports
2158 a single vector size, see the reference to
2159 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2160 vectorization factor is computed. */
2161 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2162 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2163 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2164 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2168 /* Versioning requires at least one misaligned data reference. */
2169 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2170 do_versioning = false;
2171 else if (!do_versioning)
2172 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2175 if (do_versioning)
2177 vec<stmt_vec_info> may_misalign_stmts
2178 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2179 stmt_vec_info stmt_info;
2181 /* It can now be assumed that the data references in the statements
2182 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2183 of the loop being vectorized. */
2184 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2186 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2187 SET_DR_MISALIGNMENT (dr_info, 0);
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "Alignment of access forced using versioning.\n");
2193 if (dump_enabled_p ())
2194 dump_printf_loc (MSG_NOTE, vect_location,
2195 "Versioning for alignment will be applied.\n");
2197 /* Peeling and versioning can't be done together at this time. */
2198 gcc_assert (! (do_peeling && do_versioning));
2200 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2201 gcc_assert (stat);
2202 return stat;
2205 /* This point is reached if neither peeling nor versioning is being done. */
2206 gcc_assert (! (do_peeling || do_versioning));
2208 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2209 return stat;
2213 /* Function vect_find_same_alignment_drs.
2215 Update group and alignment relations in VINFO according to the chosen
2216 vectorization factor. */
2218 static void
2219 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2221 struct data_reference *dra = DDR_A (ddr);
2222 struct data_reference *drb = DDR_B (ddr);
2223 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2224 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2225 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2226 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2228 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2229 return;
2231 if (dra == drb)
2232 return;
2234 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2235 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2236 return;
2238 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2239 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2240 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2241 return;
2243 /* Two references with distance zero have the same alignment. */
2244 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2245 - wi::to_poly_offset (DR_INIT (drb)));
2246 if (maybe_ne (diff, 0))
2248 /* Get the wider of the two alignments. */
2249 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2250 / BITS_PER_UNIT);
2251 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2252 / BITS_PER_UNIT);
2253 unsigned int max_align = MAX (align_a, align_b);
2255 /* Require the gap to be a multiple of the larger vector alignment. */
2256 if (!multiple_p (diff, max_align))
2257 return;
2260 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2261 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_NOTE, vect_location,
2264 "accesses have the same alignment: %T and %T\n",
2265 DR_REF (dra), DR_REF (drb));
2269 /* Function vect_analyze_data_refs_alignment
2271 Analyze the alignment of the data-references in the loop.
2272 Return FALSE if a data reference is found that cannot be vectorized. */
2274 opt_result
2275 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2277 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2279 /* Mark groups of data references with same alignment using
2280 data dependence information. */
2281 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2282 struct data_dependence_relation *ddr;
2283 unsigned int i;
2285 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2286 vect_find_same_alignment_drs (vinfo, ddr);
2288 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2289 struct data_reference *dr;
2291 vect_record_base_alignments (vinfo);
2292 FOR_EACH_VEC_ELT (datarefs, i, dr)
2294 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2295 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2296 vect_compute_data_ref_alignment (dr_info);
2299 return opt_result::success ();
2303 /* Analyze alignment of DRs of stmts in NODE. */
2305 static bool
2306 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2308 /* We vectorize from the first scalar stmt in the node unless
2309 the node is permuted in which case we start from the first
2310 element in the group. */
2311 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2312 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2313 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2314 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2316 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2317 vect_compute_data_ref_alignment (dr_info);
2318 /* For creating the data-ref pointer we need alignment of the
2319 first element anyway. */
2320 if (dr_info != first_dr_info)
2321 vect_compute_data_ref_alignment (first_dr_info);
2322 if (! verify_data_ref_alignment (dr_info))
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "not vectorized: bad data alignment in basic "
2327 "block.\n");
2328 return false;
2331 return true;
2334 /* Function vect_slp_analyze_and_verify_instance_alignment
2336 Analyze the alignment of the data-references in the SLP instance.
2337 Return FALSE if a data reference is found that cannot be vectorized. */
2339 bool
2340 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2342 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2344 slp_tree node;
2345 unsigned i;
2346 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2347 if (! vect_slp_analyze_and_verify_node_alignment (node))
2348 return false;
2350 node = SLP_INSTANCE_TREE (instance);
2351 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2352 && ! vect_slp_analyze_and_verify_node_alignment
2353 (SLP_INSTANCE_TREE (instance)))
2354 return false;
2356 return true;
2360 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2361 accesses of legal size, step, etc. Detect gaps, single element
2362 interleaving, and other special cases. Set grouped access info.
2363 Collect groups of strided stores for further use in SLP analysis.
2364 Worker for vect_analyze_group_access. */
2366 static bool
2367 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2369 data_reference *dr = dr_info->dr;
2370 tree step = DR_STEP (dr);
2371 tree scalar_type = TREE_TYPE (DR_REF (dr));
2372 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2373 stmt_vec_info stmt_info = dr_info->stmt;
2374 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2375 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2376 HOST_WIDE_INT dr_step = -1;
2377 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2378 bool slp_impossible = false;
2380 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2381 size of the interleaving group (including gaps). */
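/* E.g. for 4-byte elements accessed as a[4*i] and a[4*i + 2] the step
is 16 bytes, so GROUPSIZE is 16 / 4 = 4 even though only two of the
four elements are actually accessed. */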
2382 if (tree_fits_shwi_p (step))
2384 dr_step = tree_to_shwi (step);
2385 /* Check that STEP is a multiple of type size. Otherwise there is
2386 a non-element-sized gap at the end of the group which we
2387 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2388 ??? As we can handle non-constant step fine here we should
2389 simply remove uses of DR_GROUP_GAP between the last and first
2390 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2391 simply not include that gap. */
2392 if ((dr_step % type_size) != 0)
2394 if (dump_enabled_p ())
2395 dump_printf_loc (MSG_NOTE, vect_location,
2396 "Step %T is not a multiple of the element size"
2397 " for %T\n",
2398 step, DR_REF (dr));
2399 return false;
2401 groupsize = absu_hwi (dr_step) / type_size;
2403 else
2404 groupsize = 0;
2406 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2407 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2409 /* Check whether this DR is part of an interleaving group and is the
2410 single element of that group accessed in the loop. */
2412 /* Gaps are supported only for loads. STEP must be a multiple of the type
2413 size. */
2414 if (DR_IS_READ (dr)
2415 && (dr_step % type_size) == 0
2416 && groupsize > 0)
2418 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2419 DR_GROUP_SIZE (stmt_info) = groupsize;
2420 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_NOTE, vect_location,
2423 "Detected single element interleaving %T"
2424 " step %T\n",
2425 DR_REF (dr), step);
2427 return true;
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "not consecutive access %G", stmt_info->stmt);
2434 if (bb_vinfo)
2436 /* Mark the statement as unvectorizable. */
2437 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2438 return true;
2441 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2442 STMT_VINFO_STRIDED_P (stmt_info) = true;
2443 return true;
2446 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2448 /* First stmt in the interleaving chain. Check the chain. */
2449 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2450 struct data_reference *data_ref = dr;
2451 unsigned int count = 1;
2452 tree prev_init = DR_INIT (data_ref);
2453 stmt_vec_info prev = stmt_info;
2454 HOST_WIDE_INT diff, gaps = 0;
2456 /* By construction, all group members have INTEGER_CST DR_INITs. */
2457 while (next)
2459 /* Skip same data-refs. In case two or more stmts share a
2460 data-ref (supported only for loads), we vectorize only the first
2461 stmt, and the rest get their vectorized loads from the first
2462 one. */
2463 if (!tree_int_cst_compare (DR_INIT (data_ref),
2464 DR_INIT (STMT_VINFO_DATA_REF (next))))
2466 if (DR_IS_WRITE (data_ref))
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2470 "Two store stmts share the same dr.\n");
2471 return false;
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2476 "Two or more load stmts share the same dr.\n");
2478 /* For loads use the same data-ref load. */
2479 DR_GROUP_SAME_DR_STMT (next) = prev;
2481 prev = next;
2482 next = DR_GROUP_NEXT_ELEMENT (next);
2483 continue;
2486 prev = next;
2487 data_ref = STMT_VINFO_DATA_REF (next);
2489 /* All group members have the same STEP by construction. */
2490 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2492 /* Check that the distance between two accesses is equal to the type
2493 size. Otherwise, we have gaps. */
2494 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2495 - TREE_INT_CST_LOW (prev_init)) / type_size;
2496 if (diff != 1)
2498 /* FORNOW: SLP of accesses with gaps is not supported. */
2499 slp_impossible = true;
2500 if (DR_IS_WRITE (data_ref))
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504 "interleaved store with gaps\n");
2505 return false;
2508 gaps += diff - 1;
2511 last_accessed_element += diff;
2513 /* Store the gap from the previous member of the group. If there is no
2514 gap in the access, DR_GROUP_GAP is always 1. */
2515 DR_GROUP_GAP (next) = diff;
2517 prev_init = DR_INIT (data_ref);
2518 next = DR_GROUP_NEXT_ELEMENT (next);
2519 /* Count the number of data-refs in the chain. */
2520 count++;
2523 if (groupsize == 0)
2524 groupsize = count + gaps;
2526 /* This could be UINT_MAX but as we are generating code in a very
2527 inefficient way we have to cap earlier. See PR78699 for example. */
2528 if (groupsize > 4096)
2530 if (dump_enabled_p ())
2531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2532 "group is too large\n");
2533 return false;
2536 /* Check that the size of the interleaving is equal to count for stores,
2537 i.e., that there are no gaps. */
2538 if (groupsize != count
2539 && !DR_IS_READ (dr))
2541 groupsize = count;
2542 STMT_VINFO_STRIDED_P (stmt_info) = true;
2545 /* If there is a gap after the last load in the group it is the
2546 difference between the groupsize and the last accessed
2547 element.
2548 When there is no gap, this difference should be 0. */
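/* E.g. for a group of 4-byte loads a[4*i] and a[4*i + 2], GROUPSIZE is
4 and the last accessed element is the third one, so the gap after
the group is 4 - 3 = 1 element. */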
2549 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2551 DR_GROUP_SIZE (stmt_info) = groupsize;
2552 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_NOTE, vect_location,
2555 "Detected interleaving ");
2556 if (DR_IS_READ (dr))
2557 dump_printf (MSG_NOTE, "load ");
2558 else if (STMT_VINFO_STRIDED_P (stmt_info))
2559 dump_printf (MSG_NOTE, "strided store ");
2560 else
2561 dump_printf (MSG_NOTE, "store ");
2562 dump_printf (MSG_NOTE, "of size %u starting with %G",
2563 (unsigned)groupsize, stmt_info->stmt);
2564 if (DR_GROUP_GAP (stmt_info) != 0)
2565 dump_printf_loc (MSG_NOTE, vect_location,
2566 "There is a gap of %u elements after the group\n",
2567 DR_GROUP_GAP (stmt_info));
2570 /* SLP: create an SLP data structure for every interleaving group of
2571 stores for further analysis in vect_analyze_slp. */
2572 if (DR_IS_WRITE (dr) && !slp_impossible)
2574 if (loop_vinfo)
2575 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2576 if (bb_vinfo)
2577 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2581 return true;
2584 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2585 accesses of legal size, step, etc. Detect gaps, single element
2586 interleaving, and other special cases. Set grouped access info.
2587 Collect groups of strided stores for further use in SLP analysis. */
2589 static bool
2590 vect_analyze_group_access (dr_vec_info *dr_info)
2592 if (!vect_analyze_group_access_1 (dr_info))
2594 /* Dissolve the group if present. */
2595 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2596 while (stmt_info)
2598 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2599 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2600 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2601 stmt_info = next;
2603 return false;
2605 return true;
2608 /* Analyze the access pattern of the data-reference DR_INFO.
2609 In case of non-consecutive accesses call vect_analyze_group_access() to
2610 analyze groups of accesses. */
2612 static bool
2613 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2615 data_reference *dr = dr_info->dr;
2616 tree step = DR_STEP (dr);
2617 tree scalar_type = TREE_TYPE (DR_REF (dr));
2618 stmt_vec_info stmt_info = dr_info->stmt;
2619 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2620 struct loop *loop = NULL;
2622 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2623 return true;
2625 if (loop_vinfo)
2626 loop = LOOP_VINFO_LOOP (loop_vinfo);
2628 if (loop_vinfo && !step)
2630 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2632 "bad data-ref access in loop\n");
2633 return false;
2636 /* Allow loads with zero step in inner-loop vectorization. */
2637 if (loop_vinfo && integer_zerop (step))
2639 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2640 if (!nested_in_vect_loop_p (loop, stmt_info))
2641 return DR_IS_READ (dr);
2642 /* Allow references with zero step only for outer loops marked
2643 with pragma omp simd, as it guarantees the absence of
2644 loop-carried dependencies between inner loop iterations. */
2645 if (loop->safelen < 2)
2647 if (dump_enabled_p ())
2648 dump_printf_loc (MSG_NOTE, vect_location,
2649 "zero step in inner loop of nest\n");
2650 return false;
2654 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2656 /* Interleaved accesses are not yet supported within outer-loop
2657 vectorization for references in the inner-loop. */
2658 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2660 /* For the rest of the analysis we use the outer-loop step. */
2661 step = STMT_VINFO_DR_STEP (stmt_info);
2662 if (integer_zerop (step))
2664 if (dump_enabled_p ())
2665 dump_printf_loc (MSG_NOTE, vect_location,
2666 "zero step in outer loop.\n");
2667 return DR_IS_READ (dr);
2671 /* Consecutive? */
2672 if (TREE_CODE (step) == INTEGER_CST)
2674 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2675 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2676 || (dr_step < 0
2677 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2679 /* Mark that it is not interleaving. */
2680 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2681 return true;
2685 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_NOTE, vect_location,
2689 "grouped access in outer loop.\n");
2690 return false;
2694 /* Assume this is a DR handled by the non-constant strided load case. */
2695 if (TREE_CODE (step) != INTEGER_CST)
2696 return (STMT_VINFO_STRIDED_P (stmt_info)
2697 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2698 || vect_analyze_group_access (dr_info)));
2700 /* Not a consecutive access - check if it's part of an interleaving group. */
2701 return vect_analyze_group_access (dr_info);
2704 /* Compare two data-references DRA and DRB in order to sort them into
2705 chunks suitable for grouping. */
2707 static int
2708 dr_group_sort_cmp (const void *dra_, const void *drb_)
2710 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2711 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2712 int cmp;
2714 /* Stabilize sort. */
2715 if (dra == drb)
2716 return 0;
2718 /* DRs in different loops never belong to the same group. */
2719 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2720 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2721 if (loopa != loopb)
2722 return loopa->num < loopb->num ? -1 : 1;
2724 /* Ordering of DRs according to base. */
2725 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2726 DR_BASE_ADDRESS (drb));
2727 if (cmp != 0)
2728 return cmp;
2730 /* And according to DR_OFFSET. */
2731 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2732 if (cmp != 0)
2733 return cmp;
2735 /* Put reads before writes. */
2736 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2737 return DR_IS_READ (dra) ? -1 : 1;
2739 /* Then sort by access size. */
2740 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2741 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2742 if (cmp != 0)
2743 return cmp;
2745 /* And by step. */
2746 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2747 if (cmp != 0)
2748 return cmp;
2750 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2751 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2752 if (cmp == 0)
2753 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2754 return cmp;
2757 /* If OP is the result of a conversion, return the unconverted value,
2758 otherwise return null. */
2760 static tree
2761 strip_conversion (tree op)
2763 if (TREE_CODE (op) != SSA_NAME)
2764 return NULL_TREE;
2765 gimple *stmt = SSA_NAME_DEF_STMT (op);
2766 if (!is_gimple_assign (stmt)
2767 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2768 return NULL_TREE;
2769 return gimple_assign_rhs1 (stmt);
2772 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2773 and STMT2_INFO being in a single group. */
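/* E.g. two ordinary assignments can be grouped, as can two
IFN_MASK_LOAD calls whose mask arguments are equal up to conversions;
a masked and an unmasked access are never grouped together. */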
2775 static bool
2776 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2778 if (gimple_assign_single_p (stmt1_info->stmt))
2779 return gimple_assign_single_p (stmt2_info->stmt);
2781 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2782 if (call1 && gimple_call_internal_p (call1))
2784 /* Check for two masked loads or two masked stores. */
2785 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2786 if (!call2 || !gimple_call_internal_p (call2))
2787 return false;
2788 internal_fn ifn = gimple_call_internal_fn (call1);
2789 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2790 return false;
2791 if (ifn != gimple_call_internal_fn (call2))
2792 return false;
2794 /* Check that the masks are the same. Cope with casts of masks,
2795 like those created by build_mask_conversion. */
2796 tree mask1 = gimple_call_arg (call1, 2);
2797 tree mask2 = gimple_call_arg (call2, 2);
2798 if (!operand_equal_p (mask1, mask2, 0))
2800 mask1 = strip_conversion (mask1);
2801 if (!mask1)
2802 return false;
2803 mask2 = strip_conversion (mask2);
2804 if (!mask2)
2805 return false;
2806 if (!operand_equal_p (mask1, mask2, 0))
2807 return false;
2809 return true;
2812 return false;
2815 /* Function vect_analyze_data_ref_accesses.
2817 Analyze the access pattern of all the data references in the loop.
2819 FORNOW: the only access pattern that is considered vectorizable is a
2820 simple step 1 (consecutive) access.
2822 FORNOW: handle only arrays and pointer accesses. */
2824 opt_result
2825 vect_analyze_data_ref_accesses (vec_info *vinfo)
2827 unsigned int i;
2828 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2829 struct data_reference *dr;
2831 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2833 if (datarefs.is_empty ())
2834 return opt_result::success ();
2836 /* Sort the array of datarefs to make building the interleaving chains
2837 linear. Don't modify the original vector's order; it is needed for
2838 determining what dependencies are reversed. */
2839 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2840 datarefs_copy.qsort (dr_group_sort_cmp);
2842 /* Build the interleaving chains. */
2843 for (i = 0; i < datarefs_copy.length () - 1;)
2845 data_reference_p dra = datarefs_copy[i];
2846 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2847 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2848 stmt_vec_info lastinfo = NULL;
2849 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2850 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2852 ++i;
2853 continue;
2855 for (i = i + 1; i < datarefs_copy.length (); ++i)
2857 data_reference_p drb = datarefs_copy[i];
2858 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2859 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2860 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2861 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2862 break;
2864 /* ??? Imperfect sorting (non-compatible types, non-modulo
2865 accesses, same accesses) can lead to a group being artificially
2866 split here as we don't just skip over those. If it really
2867 matters we can push those to a worklist and re-iterate
2868 over them. Then we can just skip ahead to the next DR here. */
2870 /* DRs in different loops should not be put into the same
2871 interleaving group. */
2872 if (gimple_bb (DR_STMT (dra))->loop_father
2873 != gimple_bb (DR_STMT (drb))->loop_father)
2874 break;
2876 /* Check that the data-refs have the same first location (except init)
2877 and that they are both either stores or loads (not one load and
2878 one store, and not a mix of masked and unmasked accesses). */
2879 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2880 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2881 DR_BASE_ADDRESS (drb)) != 0
2882 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2883 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2884 break;
2886 /* Check that the data-refs have the same constant size. */
2887 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2888 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2889 if (!tree_fits_uhwi_p (sza)
2890 || !tree_fits_uhwi_p (szb)
2891 || !tree_int_cst_equal (sza, szb))
2892 break;
2894 /* Check that the data-refs have the same step. */
2895 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2896 break;
2898 /* Check the types are compatible.
2899 ??? We don't distinguish this during sorting. */
2900 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2901 TREE_TYPE (DR_REF (drb))))
2902 break;
2904 /* Check that the DR_INITs are compile-time constants. */
2905 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2906 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2907 break;
2909 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2910 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2911 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2912 HOST_WIDE_INT init_prev
2913 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2914 gcc_assert (init_a <= init_b
2915 && init_a <= init_prev
2916 && init_prev <= init_b);
2918 /* Do not place the same access in the interleaving chain twice. */
2919 if (init_b == init_prev)
2921 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2922 < gimple_uid (DR_STMT (drb)));
2923 /* ??? For now we simply "drop" the later reference which is
2924 otherwise the same rather than finishing off this group.
2925 In the end we'd want to re-process duplicates forming
2926 multiple groups from the refs, likely by just collecting
2927 all candidates (including duplicates and split points
2928 below) in a vector and then processing them together. */
2929 continue;
2932 /* If init_b == init_a + the size of the type * k, we have an
2933 interleaving, and DRA is accessed before DRB. */
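/* For example, two 4-byte loads with DR_INITs 0 and 8 satisfy this
with k = 2 and may form one interleaving chain (with a gap), whereas
DR_INITs 0 and 6 would split the group here. */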
2934 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2935 if (type_size_a == 0
2936 || (init_b - init_a) % type_size_a != 0)
2937 break;
2939 /* If we have a store, the accesses are adjacent. This splits
2940 groups into chunks we support (we don't support vectorization
2941 of stores with gaps). */
2942 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2943 break;
2945 /* If the step (when constant and nonzero) is no greater than the
2946 difference between the data-refs' inits, start a new group here;
2947 this splits groups into suitable sizes. */
2948 if (tree_fits_shwi_p (DR_STEP (dra)))
2950 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2951 if (step != 0 && step <= (init_b - init_a))
2952 break;
2955 if (dump_enabled_p ())
2956 dump_printf_loc (MSG_NOTE, vect_location,
2957 DR_IS_READ (dra)
2958 ? "Detected interleaving load %T and %T\n"
2959 : "Detected interleaving store %T and %T\n",
2960 DR_REF (dra), DR_REF (drb));
2962 /* Link the found element into the group list. */
2963 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2965 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2966 lastinfo = stmtinfo_a;
2968 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2969 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2970 lastinfo = stmtinfo_b;
2974 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2976 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2977 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
2978 && !vect_analyze_data_ref_access (dr_info))
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "not vectorized: complicated access pattern.\n");
2984 if (is_a <bb_vec_info> (vinfo))
2986 /* Mark the statement as not vectorizable. */
2987 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
2988 continue;
2990 else
2992 datarefs_copy.release ();
2993 return opt_result::failure_at (dr_info->stmt->stmt,
2994 "not vectorized:"
2995 " complicated access pattern.\n");
3000 datarefs_copy.release ();
3001 return opt_result::success ();
3004 /* Function vect_vfa_segment_size.
3006 Input:
3007 DR_INFO: The data reference.
3008 LENGTH_FACTOR: segment length to consider.
3010 Return a value suitable for the dr_with_seg_len::seg_len field.
3011 This is the "distance travelled" by the pointer from the first
3012 iteration in the segment to the last. Note that it does not include
3013 the size of the access; in effect it only describes the first byte. */
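/* For example, a step of 4 bytes and a LENGTH_FACTOR of 8 iterations
give a segment length of 4 * (8 - 1) = 28 bytes; the bytes accessed
in the final iteration are covered separately by the access size. */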
3015 static tree
3016 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3018 length_factor = size_binop (MINUS_EXPR,
3019 fold_convert (sizetype, length_factor),
3020 size_one_node);
3021 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3022 length_factor);
3025 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3026 gives the worst-case number of bytes covered by the segment. */
3028 static unsigned HOST_WIDE_INT
3029 vect_vfa_access_size (dr_vec_info *dr_info)
3031 stmt_vec_info stmt_vinfo = dr_info->stmt;
3032 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3033 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3034 unsigned HOST_WIDE_INT access_size = ref_size;
3035 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3037 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3038 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3040 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3041 && (vect_supportable_dr_alignment (dr_info, false)
3042 == dr_explicit_realign_optimized))
3044 /* We might access a full vector's worth. */
3045 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3046 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3048 return access_size;
3051 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3052 describes. */
3054 static unsigned int
3055 vect_vfa_align (dr_vec_info *dr_info)
3057 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3060 /* Function vect_no_alias_p.
3062 Given data references A and B with equal base and offset, see whether
3063 the alias relation can be decided at compilation time. Return 1 if
3064 it can and the references alias, 0 if it can and the references do
3065 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3066 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3067 of dr_with_seg_len::{seg_len,access_size} for A and B. */
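/* For instance, 4-byte accesses starting at offsets 0 and 32 with
segment lengths of 28 bytes cover the byte ranges [0, 32) and
[32, 64); these provably do not overlap, so the function returns 0
and the runtime check can be dropped. */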
3069 static int
3070 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3071 tree segment_length_a, tree segment_length_b,
3072 unsigned HOST_WIDE_INT access_size_a,
3073 unsigned HOST_WIDE_INT access_size_b)
3075 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3076 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3077 poly_uint64 const_length_a;
3078 poly_uint64 const_length_b;
3080 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3081 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3082 [a, a+12) */
3083 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3085 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3086 offset_a = (offset_a + access_size_a) - const_length_a;
3088 else
3089 const_length_a = tree_to_poly_uint64 (segment_length_a);
3090 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3092 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3093 offset_b = (offset_b + access_size_b) - const_length_b;
3095 else
3096 const_length_b = tree_to_poly_uint64 (segment_length_b);
3098 const_length_a += access_size_a;
3099 const_length_b += access_size_b;
3101 if (ranges_known_overlap_p (offset_a, const_length_a,
3102 offset_b, const_length_b))
3103 return 1;
3105 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3106 offset_b, const_length_b))
3107 return 0;
3109 return -1;
3112 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3113 in DDR is >= VF. */
3115 static bool
3116 dependence_distance_ge_vf (data_dependence_relation *ddr,
3117 unsigned int loop_depth, poly_uint64 vf)
3119 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3120 || DDR_NUM_DIST_VECTS (ddr) == 0)
3121 return false;
3123 /* If the dependence is exact, we should have limited the VF instead. */
3124 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3126 unsigned int i;
3127 lambda_vector dist_v;
3128 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3130 HOST_WIDE_INT dist = dist_v[loop_depth];
3131 if (dist != 0
3132 && !(dist > 0 && DDR_REVERSED_P (ddr))
3133 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3134 return false;
3137 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "dependence distance between %T and %T is >= VF\n",
3140 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3142 return true;
3145 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3147 static void
3148 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3150 dump_printf (dump_kind, "%s (%T) >= ",
3151 lower_bound.unsigned_p ? "unsigned" : "abs",
3152 lower_bound.expr);
3153 dump_dec (dump_kind, lower_bound.min_value);
3156 /* Record that the vectorized loop requires the vec_lower_bound described
3157 by EXPR, UNSIGNED_P and MIN_VALUE. */
3159 static void
3160 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3161 poly_uint64 min_value)
3163 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3164 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3165 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3167 unsigned_p &= lower_bounds[i].unsigned_p;
3168 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3169 if (lower_bounds[i].unsigned_p != unsigned_p
3170 || maybe_lt (lower_bounds[i].min_value, min_value))
3172 lower_bounds[i].unsigned_p = unsigned_p;
3173 lower_bounds[i].min_value = min_value;
3174 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_NOTE, vect_location,
3177 "updating run-time check to ");
3178 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3179 dump_printf (MSG_NOTE, "\n");
3182 return;
3185 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3186 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3189 dump_lower_bound (MSG_NOTE, lower_bound);
3190 dump_printf (MSG_NOTE, "\n");
3192 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3195 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3196 will span fewer than GAP bytes. */
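/* For instance, with an estimated VF of 4, a group of size 2 and
4-byte scalar accesses, the vectorized step is expected to cover
4 * 2 * 4 = 32 bytes, so a GAP of up to 32 bytes counts as small. */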
3198 static bool
3199 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3200 poly_int64 gap)
3202 stmt_vec_info stmt_info = dr_info->stmt;
3203 HOST_WIDE_INT count
3204 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3205 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3206 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3207 return (estimated_poly_value (gap)
3208 <= count * vect_get_scalar_dr_size (dr_info));
3211 /* Return true if we know that there is no alias between DR_INFO_A and
3212 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3213 When returning true, set *LOWER_BOUND_OUT to this N. */
3215 static bool
3216 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3217 poly_uint64 *lower_bound_out)
3219 /* Check that there is a constant gap of known sign between DR_A
3220 and DR_B. */
3221 data_reference *dr_a = dr_info_a->dr;
3222 data_reference *dr_b = dr_info_b->dr;
3223 poly_int64 init_a, init_b;
3224 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3225 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3226 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3227 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3228 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3229 || !ordered_p (init_a, init_b))
3230 return false;
3232 /* Sort DR_A and DR_B by the address they access. */
3233 if (maybe_lt (init_b, init_a))
3235 std::swap (init_a, init_b);
3236 std::swap (dr_info_a, dr_info_b);
3237 std::swap (dr_a, dr_b);
3240 /* If the two accesses could be dependent within a scalar iteration,
3241 make sure that we'd retain their order. */
3242 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3243 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3244 return false;
3246 /* There is no alias if abs (DR_STEP) is greater than or equal to
3247 the bytes spanned by the combination of the two accesses. */
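/* E.g. for 4-byte accesses at byte offsets 0 and 8 within an
iteration the bound is 8 + 4 - 0 = 12, so |DR_STEP| >= 12 guarantees
the two accesses cannot alias. */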
3248 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3249 return true;
3252 /* Function vect_prune_runtime_alias_test_list.
3254 Prune a list of ddrs to be tested at run-time by versioning for alias.
3255 Merge several alias checks into one if possible.
3256 Return FALSE if the resulting list of ddrs is longer than allowed by
3257 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3259 opt_result
3260 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3262 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3263 hash_set <tree_pair_hash> compared_objects;
3265 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3266 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3267 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3268 vec<vec_object_pair> &check_unequal_addrs
3269 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3270 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3271 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3273 ddr_p ddr;
3274 unsigned int i;
3275 tree length_factor;
3277 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3279 /* Step values are irrelevant for aliasing if the number of vector
3280 iterations is equal to the number of scalar iterations (which can
3281 happen for fully-SLP loops). */
3282 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3284 if (!ignore_step_p)
3286 /* Convert the checks for nonzero steps into bound tests. */
3287 tree value;
3288 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3289 vect_check_lower_bound (loop_vinfo, value, true, 1);
3292 if (may_alias_ddrs.is_empty ())
3293 return opt_result::success ();
3295 comp_alias_ddrs.create (may_alias_ddrs.length ());
3297 unsigned int loop_depth
3298 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3299 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3301 /* First, we collect all data ref pairs for aliasing checks. */
3302 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3304 int comp_res;
3305 poly_uint64 lower_bound;
3306 tree segment_length_a, segment_length_b;
3307 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3308 unsigned int align_a, align_b;
3310 /* Ignore the alias if the VF we chose ended up being no greater
3311 than the dependence distance. */
3312 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3313 continue;
3315 if (DDR_OBJECT_A (ddr))
3317 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3318 if (!compared_objects.add (new_pair))
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_NOTE, vect_location,
3322 "checking that %T and %T"
3323 " have different addresses\n",
3324 new_pair.first, new_pair.second);
3325 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3327 continue;
3330 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3331 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3333 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3334 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3336 /* Skip the pair if inter-iteration dependencies are irrelevant
3337 and intra-iteration dependencies are guaranteed to be honored. */
3338 if (ignore_step_p
3339 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3340 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3341 &lower_bound)))
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "no need for alias check between "
3346 "%T and %T when VF is 1\n",
3347 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3348 continue;
3351 /* See whether we can handle the alias using a bounds check on
3352 the step, and whether that's likely to be the best approach.
3353 (It might not be, for example, if the minimum step is much larger
3354 than the number of bytes handled by one vector iteration.) */
3355 if (!ignore_step_p
3356 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3357 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3358 &lower_bound)
3359 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3360 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3362 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3363 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3366 "%T and %T when the step %T is outside ",
3367 DR_REF (dr_info_a->dr),
3368 DR_REF (dr_info_b->dr),
3369 DR_STEP (dr_info_a->dr));
3370 if (unsigned_p)
3371 dump_printf (MSG_NOTE, "[0");
3372 else
3374 dump_printf (MSG_NOTE, "(");
3375 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3377 dump_printf (MSG_NOTE, ", ");
3378 dump_dec (MSG_NOTE, lower_bound);
3379 dump_printf (MSG_NOTE, ")\n");
3381 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3382 unsigned_p, lower_bound);
3383 continue;
3386 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3387 if (dr_group_first_a)
3389 stmt_info_a = dr_group_first_a;
3390 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3393 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3394 if (dr_group_first_b)
3396 stmt_info_b = dr_group_first_b;
3397 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3400 if (ignore_step_p)
3402 segment_length_a = size_zero_node;
3403 segment_length_b = size_zero_node;
3405 else
3407 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3408 DR_STEP (dr_info_b->dr), 0))
3409 length_factor = scalar_loop_iters;
3410 else
3411 length_factor = size_int (vect_factor);
3412 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3413 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3415 access_size_a = vect_vfa_access_size (dr_info_a);
3416 access_size_b = vect_vfa_access_size (dr_info_b);
3417 align_a = vect_vfa_align (dr_info_a);
3418 align_b = vect_vfa_align (dr_info_b);
3420 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3421 DR_BASE_ADDRESS (dr_info_b->dr));
3422 if (comp_res == 0)
3423 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3424 DR_OFFSET (dr_info_b->dr));
3426 /* See whether the alias is known at compilation time. */
3427 if (comp_res == 0
3428 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3429 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3430 && poly_int_tree_p (segment_length_a)
3431 && poly_int_tree_p (segment_length_b))
3433 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3434 segment_length_a,
3435 segment_length_b,
3436 access_size_a,
3437 access_size_b);
3438 if (res >= 0 && dump_enabled_p ())
3440 dump_printf_loc (MSG_NOTE, vect_location,
3441 "can tell at compile time that %T and %T",
3442 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3443 if (res == 0)
3444 dump_printf (MSG_NOTE, " do not alias\n");
3445 else
3446 dump_printf (MSG_NOTE, " alias\n");
3449 if (res == 0)
3450 continue;
3452 if (res == 1)
3453 return opt_result::failure_at (stmt_info_b->stmt,
3454 "not vectorized:"
3455 " compilation time alias: %G%G",
3456 stmt_info_a->stmt,
3457 stmt_info_b->stmt);
3460 dr_with_seg_len_pair_t dr_with_seg_len_pair
3461 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3462 access_size_a, align_a),
3463 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3464 access_size_b, align_b));
3466 /* Canonicalize pairs by sorting the two DR members. */
3467 if (comp_res > 0)
3468 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3470 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3473 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3475 unsigned int count = (comp_alias_ddrs.length ()
3476 + check_unequal_addrs.length ());
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "improved number of alias checks from %d to %d\n",
3480 may_alias_ddrs.length (), count);
3481 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3482 return opt_result::failure_at
3483 (vect_location,
3484 "number of versioning for alias "
3485 "run-time tests exceeds %d "
3486 "(--param vect-max-version-for-alias-checks)\n",
3487 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3489 return opt_result::success ();
3492 /* Check whether we can use an internal function for a gather load
3493 or scatter store. READ_P is true for loads and false for stores.
3494 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3495 the type of the memory elements being loaded or stored. OFFSET_BITS
3496 is the number of bits in each scalar offset and OFFSET_SIGN is the
3497 sign of the offset. SCALE is the amount by which the offset should
3498 be multiplied *after* it has been converted to address width.
3500 Return true if the function is supported, storing the function
3501 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
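/* Illustrative example (an assumption, not taken from the original source):
   for an unmasked gather of 32-bit elements indexed by 32-bit signed
   offsets that are scaled by 4, a caller would do roughly

     internal_fn ifn;
     tree elt_type;
     if (vect_gather_scatter_fn_p (true, false, vectype, memory_type,
                                   32, SIGNED, 4, &ifn, &elt_type))
       ... ifn is IFN_GATHER_LOAD and elt_type is TREE_TYPE (vectype) ...

   provided the target advertises the combination through
   internal_gather_scatter_fn_supported_p.  */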
3503 bool
3504 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3505 tree memory_type, unsigned int offset_bits,
3506 signop offset_sign, int scale,
3507 internal_fn *ifn_out, tree *element_type_out)
3509 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3510 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3511 if (offset_bits > element_bits)
3512 /* Internal functions require the offset to be the same width as
3513 the vector elements. We can extend narrower offsets, but it isn't
3514 safe to truncate wider offsets. */
3515 return false;
3517 if (element_bits != memory_bits)
3518 /* For now the vector elements must be the same width as the
3519 memory elements. */
3520 return false;
3522 /* Work out which function we need. */
3523 internal_fn ifn;
3524 if (read_p)
3525 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3526 else
3527 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3529 /* Test whether the target supports this combination. */
3530 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3531 offset_sign, scale))
3532 return false;
3534 *ifn_out = ifn;
3535 *element_type_out = TREE_TYPE (vectype);
3536 return true;
3539 /* STMT_INFO is a call to an internal gather load or scatter store function.
3540 Describe the operation in INFO. */
3542 static void
3543 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3544 gather_scatter_info *info)
3546 gcall *call = as_a <gcall *> (stmt_info->stmt);
3547 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3548 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3550 info->ifn = gimple_call_internal_fn (call);
3551 info->decl = NULL_TREE;
3552 info->base = gimple_call_arg (call, 0);
3553 info->offset = gimple_call_arg (call, 1);
3554 info->offset_dt = vect_unknown_def_type;
3555 info->offset_vectype = NULL_TREE;
3556 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3557 info->element_type = TREE_TYPE (vectype);
3558 info->memory_type = TREE_TYPE (DR_REF (dr));
3561 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3562 gather load or scatter store. Describe the operation in *INFO if so. */
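/* A typical caller, mirroring vect_analyze_data_refs further below
   (illustrative sketch only):

     gather_scatter_info gs_info;
     if (vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
       ... use gs_info.base, gs_info.offset and gs_info.scale ...

   where the filled-in fields describe the loop-invariant base, the
   vectorizable offset and the scaling factor recovered by the analysis.  */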
3564 bool
3565 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3566 gather_scatter_info *info)
3568 HOST_WIDE_INT scale = 1;
3569 poly_int64 pbitpos, pbitsize;
3570 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3571 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3572 tree offtype = NULL_TREE;
3573 tree decl = NULL_TREE, base, off;
3574 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3575 tree memory_type = TREE_TYPE (DR_REF (dr));
3576 machine_mode pmode;
3577 int punsignedp, reversep, pvolatilep = 0;
3578 internal_fn ifn;
3579 tree element_type;
3580 bool masked_p = false;
3582 /* See whether this is already a call to a gather/scatter internal function.
3583 If not, see whether it's a masked load or store. */
3584 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3585 if (call && gimple_call_internal_p (call))
3587 ifn = gimple_call_internal_fn (call);
3588 if (internal_gather_scatter_fn_p (ifn))
3590 vect_describe_gather_scatter_call (stmt_info, info);
3591 return true;
3593 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3596 /* True if we should aim to use internal functions rather than
3597 built-in functions. */
3598 bool use_ifn_p = (DR_IS_READ (dr)
3599 ? supports_vec_gather_load_p ()
3600 : supports_vec_scatter_store_p ());
3602 base = DR_REF (dr);
3603 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3604 see if we can use the def stmt of the address. */
3605 if (masked_p
3606 && TREE_CODE (base) == MEM_REF
3607 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3608 && integer_zerop (TREE_OPERAND (base, 1))
3609 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3611 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3612 if (is_gimple_assign (def_stmt)
3613 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3614 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3617 /* The gather and scatter builtins need address of the form
3618 loop_invariant + vector * {1, 2, 4, 8}
3620 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3621 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3622 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3623 multiplications and additions in it. To get a vector, we need
3624 a single SSA_NAME that will be defined in the loop and will
3625 contain everything that is not loop invariant and that can be
3626 vectorized. The following code attempts to find such a preexisting
3627 SSA_NAME OFF and put the loop invariants into a tree BASE
3628 that can be gimplified before the loop. */
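/* Illustrative example (an assumption, not from the original source): for
   a read such as

     ... = a[k[i]];        with 'a' loop invariant and 'k[i]' varying,

   the decomposition below is expected to finish with BASE holding &a plus
   any constant byte offset, OFF being the SSA_NAME that carries k[i], and
   SCALE equal to the element size of 'a', i.e. exactly the
   loop_invariant + vector * scale form described above.  */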
3629 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3630 &punsignedp, &reversep, &pvolatilep);
3631 if (reversep)
3632 return false;
3634 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3636 if (TREE_CODE (base) == MEM_REF)
3638 if (!integer_zerop (TREE_OPERAND (base, 1)))
3640 if (off == NULL_TREE)
3641 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3642 else
3643 off = size_binop (PLUS_EXPR, off,
3644 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3646 base = TREE_OPERAND (base, 0);
3648 else
3649 base = build_fold_addr_expr (base);
3651 if (off == NULL_TREE)
3652 off = size_zero_node;
3654 /* If base is not loop invariant, then if off is 0 we start with just
3655 the constant offset in the loop invariant BASE and continue with base
3656 as OFF; otherwise give up.
3657 We could handle that case by gimplifying the addition of base + off
3658 into some SSA_NAME and using that as off, but for now punt. */
3659 if (!expr_invariant_in_loop_p (loop, base))
3661 if (!integer_zerop (off))
3662 return false;
3663 off = base;
3664 base = size_int (pbytepos);
3666 /* Otherwise put base + constant offset into the loop invariant BASE
3667 and continue with OFF. */
3668 else
3670 base = fold_convert (sizetype, base);
3671 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3674 /* OFF at this point may be either a SSA_NAME or some tree expression
3675 from get_inner_reference. Try to peel off loop invariants from it
3676 into BASE as long as possible. */
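/* For example (illustrative only): if OFF is (n + i) * 4 with 'n' loop
   invariant, the MULT_EXPR case below first records SCALE = 4 and
   continues with n + i; the PLUS_EXPR case then multiplies the invariant
   'n' by the scale, folds it into BASE and continues with just 'i',
   which is the part left for vectorization.  */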
3677 STRIP_NOPS (off);
3678 while (offtype == NULL_TREE)
3680 enum tree_code code;
3681 tree op0, op1, add = NULL_TREE;
3683 if (TREE_CODE (off) == SSA_NAME)
3685 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3687 if (expr_invariant_in_loop_p (loop, off))
3688 return false;
3690 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3691 break;
3693 op0 = gimple_assign_rhs1 (def_stmt);
3694 code = gimple_assign_rhs_code (def_stmt);
3695 op1 = gimple_assign_rhs2 (def_stmt);
3697 else
3699 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3700 return false;
3701 code = TREE_CODE (off);
3702 extract_ops_from_tree (off, &code, &op0, &op1);
3704 switch (code)
3706 case POINTER_PLUS_EXPR:
3707 case PLUS_EXPR:
3708 if (expr_invariant_in_loop_p (loop, op0))
3710 add = op0;
3711 off = op1;
3712 do_add:
3713 add = fold_convert (sizetype, add);
3714 if (scale != 1)
3715 add = size_binop (MULT_EXPR, add, size_int (scale));
3716 base = size_binop (PLUS_EXPR, base, add);
3717 continue;
3719 if (expr_invariant_in_loop_p (loop, op1))
3721 add = op1;
3722 off = op0;
3723 goto do_add;
3725 break;
3726 case MINUS_EXPR:
3727 if (expr_invariant_in_loop_p (loop, op1))
3729 add = fold_convert (sizetype, op1);
3730 add = size_binop (MINUS_EXPR, size_zero_node, add);
3731 off = op0;
3732 goto do_add;
3734 break;
3735 case MULT_EXPR:
3736 if (scale == 1 && tree_fits_shwi_p (op1))
3738 int new_scale = tree_to_shwi (op1);
3739 /* Only treat this as a scaling operation if the target
3740 supports it. */
3741 if (use_ifn_p
3742 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3743 vectype, memory_type, 1,
3744 TYPE_SIGN (TREE_TYPE (op0)),
3745 new_scale, &ifn,
3746 &element_type))
3747 break;
3748 scale = new_scale;
3749 off = op0;
3750 continue;
3752 break;
3753 case SSA_NAME:
3754 off = op0;
3755 continue;
3756 CASE_CONVERT:
3757 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3758 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3759 break;
3760 if (TYPE_PRECISION (TREE_TYPE (op0))
3761 == TYPE_PRECISION (TREE_TYPE (off)))
3763 off = op0;
3764 continue;
3767 /* The internal functions need the offset to be the same width
3768 as the elements of VECTYPE. Don't include operations that
3769 cast the offset from that width to a different width. */
3770 if (use_ifn_p
3771 && (int_size_in_bytes (TREE_TYPE (vectype))
3772 == int_size_in_bytes (TREE_TYPE (off))))
3773 break;
3775 if (TYPE_PRECISION (TREE_TYPE (op0))
3776 < TYPE_PRECISION (TREE_TYPE (off)))
3778 off = op0;
3779 offtype = TREE_TYPE (off);
3780 STRIP_NOPS (off);
3781 continue;
3783 break;
3784 default:
3785 break;
3787 break;
3790 /* If at the end OFF still isn't a SSA_NAME or isn't
3791 defined in the loop, punt. */
3792 if (TREE_CODE (off) != SSA_NAME
3793 || expr_invariant_in_loop_p (loop, off))
3794 return false;
3796 if (offtype == NULL_TREE)
3797 offtype = TREE_TYPE (off);
3799 if (use_ifn_p)
3801 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3802 memory_type, TYPE_PRECISION (offtype),
3803 TYPE_SIGN (offtype), scale, &ifn,
3804 &element_type))
3805 return false;
3807 else
3809 if (DR_IS_READ (dr))
3811 if (targetm.vectorize.builtin_gather)
3812 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3814 else
3816 if (targetm.vectorize.builtin_scatter)
3817 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3820 if (!decl)
3821 return false;
3823 ifn = IFN_LAST;
3824 element_type = TREE_TYPE (vectype);
3827 info->ifn = ifn;
3828 info->decl = decl;
3829 info->base = base;
3830 info->offset = off;
3831 info->offset_dt = vect_unknown_def_type;
3832 info->offset_vectype = NULL_TREE;
3833 info->scale = scale;
3834 info->element_type = element_type;
3835 info->memory_type = memory_type;
3836 return true;
3839 /* Find the data references in STMT, analyze them with respect to LOOP and
3840 append them to DATAREFS. Return false if datarefs in this stmt cannot
3841 be handled. */
3843 opt_result
3844 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3845 vec<data_reference_p> *datarefs)
3847 /* We can ignore clobbers for dataref analysis - they are removed during
3848 loop vectorization and BB vectorization checks dependences with a
3849 stmt walk. */
3850 if (gimple_clobber_p (stmt))
3851 return opt_result::success ();
3853 if (gimple_has_volatile_ops (stmt))
3854 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3855 stmt);
3857 if (stmt_can_throw_internal (cfun, stmt))
3858 return opt_result::failure_at (stmt,
3859 "not vectorized:"
3860 " statement can throw an exception: %G",
3861 stmt);
3863 auto_vec<data_reference_p, 2> refs;
3864 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3865 if (!res)
3866 return res;
3868 if (refs.is_empty ())
3869 return opt_result::success ();
3871 if (refs.length () > 1)
3872 return opt_result::failure_at (stmt,
3873 "not vectorized:"
3874 " more than one data ref in stmt: %G", stmt);
3876 if (gcall *call = dyn_cast <gcall *> (stmt))
3877 if (!gimple_call_internal_p (call)
3878 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3879 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3880 return opt_result::failure_at (stmt,
3881 "not vectorized: dr in a call %G", stmt);
3883 data_reference_p dr = refs.pop ();
3884 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3885 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3886 return opt_result::failure_at (stmt,
3887 "not vectorized:"
3888 " statement is bitfield access %G", stmt);
3890 if (DR_BASE_ADDRESS (dr)
3891 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3892 return opt_result::failure_at (stmt,
3893 "not vectorized:"
3894 " base addr of dr is a constant\n");
3896 /* Check whether this may be a SIMD lane access and adjust the
3897 DR to make it easier for us to handle it. */
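/* Illustrative sketch (an assumption, not from the original source): for
   a privatized array in a #pragma omp simd loop, the new DR's offset has
   the shape

     (sizetype) _lane * elt_size,  with _lane defined by IFN_GOMP_SIMD_LANE,

   and the code below recognizes that pattern, rewrites the DR to a zero
   offset with a step equal to the element size, and marks it as a
   simd-lane access via newdr->aux.  */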
3898 if (loop
3899 && loop->simduid
3900 && (!DR_BASE_ADDRESS (dr)
3901 || !DR_OFFSET (dr)
3902 || !DR_INIT (dr)
3903 || !DR_STEP (dr)))
3905 struct data_reference *newdr
3906 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3907 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3908 if (DR_BASE_ADDRESS (newdr)
3909 && DR_OFFSET (newdr)
3910 && DR_INIT (newdr)
3911 && DR_STEP (newdr)
3912 && integer_zerop (DR_STEP (newdr)))
3914 tree off = DR_OFFSET (newdr);
3915 STRIP_NOPS (off);
3916 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3917 && TREE_CODE (off) == MULT_EXPR
3918 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3920 tree step = TREE_OPERAND (off, 1);
3921 off = TREE_OPERAND (off, 0);
3922 STRIP_NOPS (off);
3923 if (CONVERT_EXPR_P (off)
3924 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
3925 < TYPE_PRECISION (TREE_TYPE (off))))
3926 off = TREE_OPERAND (off, 0);
3927 if (TREE_CODE (off) == SSA_NAME)
3929 gimple *def = SSA_NAME_DEF_STMT (off);
3930 tree reft = TREE_TYPE (DR_REF (newdr));
3931 if (is_gimple_call (def)
3932 && gimple_call_internal_p (def)
3933 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
3935 tree arg = gimple_call_arg (def, 0);
3936 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3937 arg = SSA_NAME_VAR (arg);
3938 if (arg == loop->simduid
3939 /* For now. */
3940 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
3942 DR_OFFSET (newdr) = ssize_int (0);
3943 DR_STEP (newdr) = step;
3944 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
3945 DR_STEP_ALIGNMENT (newdr)
3946 = highest_pow2_factor (step);
3947 /* Mark as simd-lane access. */
3948 newdr->aux = (void *)-1;
3949 free_data_ref (dr);
3950 datarefs->safe_push (newdr);
3951 return opt_result::success ();
3957 free_data_ref (newdr);
3960 datarefs->safe_push (dr);
3961 return opt_result::success ();
3964 /* Function vect_analyze_data_refs.
3966 Find all the data references in the loop or basic block.
3968 The general structure of the analysis of data refs in the vectorizer is as
3969 follows:
3970 1- vect_analyze_data_refs(loop/bb): call
3971 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3972 in the loop/bb and their dependences.
3973 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3974 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3975 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3979 opt_result
3980 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3982 struct loop *loop = NULL;
3983 unsigned int i;
3984 struct data_reference *dr;
3985 tree scalar_type;
3987 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
3989 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3990 loop = LOOP_VINFO_LOOP (loop_vinfo);
3992 /* Go through the data-refs, check that the analysis succeeded. Update
3993 pointer from stmt_vec_info struct to DR and vectype. */
3995 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3996 FOR_EACH_VEC_ELT (datarefs, i, dr)
3998 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3999 poly_uint64 vf;
4001 gcc_assert (DR_REF (dr));
4002 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4003 gcc_assert (!stmt_info->dr_aux.dr);
4004 stmt_info->dr_aux.dr = dr;
4005 stmt_info->dr_aux.stmt = stmt_info;
4007 /* Check that analysis of the data-ref succeeded. */
4008 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4009 || !DR_STEP (dr))
4011 bool maybe_gather
4012 = DR_IS_READ (dr)
4013 && !TREE_THIS_VOLATILE (DR_REF (dr))
4014 && (targetm.vectorize.builtin_gather != NULL
4015 || supports_vec_gather_load_p ());
4016 bool maybe_scatter
4017 = DR_IS_WRITE (dr)
4018 && !TREE_THIS_VOLATILE (DR_REF (dr))
4019 && (targetm.vectorize.builtin_scatter != NULL
4020 || supports_vec_scatter_store_p ());
4022 /* If target supports vector gather loads or scatter stores,
4023 see if they can be used. */
4024 if (is_a <loop_vec_info> (vinfo)
4025 && !nested_in_vect_loop_p (loop, stmt_info))
4027 if (maybe_gather || maybe_scatter)
4029 if (maybe_gather)
4030 gatherscatter = GATHER;
4031 else
4032 gatherscatter = SCATTER;
4036 if (gatherscatter == SG_NONE)
4038 if (dump_enabled_p ())
4039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4040 "not vectorized: data ref analysis "
4041 "failed %G", stmt_info->stmt);
4042 if (is_a <bb_vec_info> (vinfo))
4044 /* In BB vectorization the ref can still participate
4045 in dependence analysis, we just can't vectorize it. */
4046 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4047 continue;
4049 return opt_result::failure_at (stmt_info->stmt,
4050 "not vectorized:"
4051 " data ref analysis failed: %G",
4052 stmt_info->stmt);
4056 /* See if this was detected as a SIMD lane access. */
4057 if (dr->aux == (void *)-1)
4059 if (nested_in_vect_loop_p (loop, stmt_info))
4060 return opt_result::failure_at (stmt_info->stmt,
4061 "not vectorized:"
4062 " data ref analysis failed: %G",
4063 stmt_info->stmt);
4064 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4067 tree base = get_base_address (DR_REF (dr));
4068 if (base && VAR_P (base) && DECL_NONALIASED (base))
4070 if (dump_enabled_p ())
4071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4072 "not vectorized: base object not addressable "
4073 "for stmt: %G", stmt_info->stmt);
4074 if (is_a <bb_vec_info> (vinfo))
4076 /* In BB vectorization the ref can still participate
4077 in dependence analysis, we just can't vectorize it. */
4078 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4079 continue;
4081 return opt_result::failure_at (stmt_info->stmt,
4082 "not vectorized: base object not"
4083 " addressable for stmt: %G",
4084 stmt_info->stmt);
4087 if (is_a <loop_vec_info> (vinfo)
4088 && DR_STEP (dr)
4089 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4091 if (nested_in_vect_loop_p (loop, stmt_info))
4092 return opt_result::failure_at (stmt_info->stmt,
4093 "not vectorized:"
4094 "not suitable for strided load %G",
4095 stmt_info->stmt);
4096 STMT_VINFO_STRIDED_P (stmt_info) = true;
4099 /* Update DR field in stmt_vec_info struct. */
4101 /* If the dataref is in an inner-loop of the loop that is considered for
4102 vectorization, we also want to analyze the access relative to
4103 the outer-loop (DR contains information only relative to the
4104 inner-most enclosing loop). We do that by building a reference to the
4105 first location accessed by the inner-loop, and analyze it relative to
4106 the outer-loop. */
4107 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4109 /* Build a reference to the first location accessed by the
4110 inner loop: *(BASE + INIT + OFFSET). By construction,
4111 this address must be invariant in the inner loop, so we
4112 can consider it as being used in the outer loop. */
4113 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4114 tree offset = unshare_expr (DR_OFFSET (dr));
4115 tree init = unshare_expr (DR_INIT (dr));
4116 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4117 init, offset);
4118 tree init_addr = fold_build_pointer_plus (base, init_offset);
4119 tree init_ref = build_fold_indirect_ref (init_addr);
4121 if (dump_enabled_p ())
4122 dump_printf_loc (MSG_NOTE, vect_location,
4123 "analyze in outer loop: %T\n", init_ref);
4125 opt_result res
4126 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4127 init_ref, loop, stmt_info->stmt);
4128 if (!res)
4129 /* dr_analyze_innermost already explained the failure. */
4130 return res;
4132 if (dump_enabled_p ())
4133 dump_printf_loc (MSG_NOTE, vect_location,
4134 "\touter base_address: %T\n"
4135 "\touter offset from base address: %T\n"
4136 "\touter constant offset from base address: %T\n"
4137 "\touter step: %T\n"
4138 "\touter base alignment: %d\n\n"
4139 "\touter base misalignment: %d\n"
4140 "\touter offset alignment: %d\n"
4141 "\touter step alignment: %d\n",
4142 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4143 STMT_VINFO_DR_OFFSET (stmt_info),
4144 STMT_VINFO_DR_INIT (stmt_info),
4145 STMT_VINFO_DR_STEP (stmt_info),
4146 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4147 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4148 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4149 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4152 /* Set vectype for STMT. */
4153 scalar_type = TREE_TYPE (DR_REF (dr));
4154 STMT_VINFO_VECTYPE (stmt_info)
4155 = get_vectype_for_scalar_type (scalar_type);
4156 if (!STMT_VINFO_VECTYPE (stmt_info))
4158 if (dump_enabled_p ())
4160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4161 "not vectorized: no vectype for stmt: %G",
4162 stmt_info->stmt);
4163 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4164 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4165 scalar_type);
4166 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4169 if (is_a <bb_vec_info> (vinfo))
4171 /* No vector type is fine, the ref can still participate
4172 in dependence analysis, we just can't vectorize it. */
4173 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4174 continue;
4176 return opt_result::failure_at (stmt_info->stmt,
4177 "not vectorized:"
4178 " no vectype for stmt: %G"
4179 " scalar_type: %T\n",
4180 stmt_info->stmt, scalar_type);
4182 else
4184 if (dump_enabled_p ())
4185 dump_printf_loc (MSG_NOTE, vect_location,
4186 "got vectype for stmt: %G%T\n",
4187 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4190 /* Adjust the minimal vectorization factor according to the
4191 vector type. */
4192 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4193 *min_vf = upper_bound (*min_vf, vf);
4195 if (gatherscatter != SG_NONE)
4197 gather_scatter_info gs_info;
4198 if (!vect_check_gather_scatter (stmt_info,
4199 as_a <loop_vec_info> (vinfo),
4200 &gs_info)
4201 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4202 return opt_result::failure_at
4203 (stmt_info->stmt,
4204 (gatherscatter == GATHER) ?
4205 "not vectorized: not suitable for gather load %G" :
4206 "not vectorized: not suitable for scatter store %G",
4207 stmt_info->stmt);
4208 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4212 /* We used to stop processing and prune the list here. Verify we no
4213 longer need to. */
4214 gcc_assert (i == datarefs.length ());
4216 return opt_result::success ();
4220 /* Function vect_get_new_vect_var.
4222 Returns a name for a new variable. The current naming scheme prepends a
4223 prefix such as "vect_" or "vectp_" (depending on the value of VAR_KIND)
4224 to NAME when a name is provided, and uses just the prefix for the new
4225 variable otherwise. */
4227 tree
4228 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4230 const char *prefix;
4231 tree new_vect_var;
4233 switch (var_kind)
4235 case vect_simple_var:
4236 prefix = "vect";
4237 break;
4238 case vect_scalar_var:
4239 prefix = "stmp";
4240 break;
4241 case vect_mask_var:
4242 prefix = "mask";
4243 break;
4244 case vect_pointer_var:
4245 prefix = "vectp";
4246 break;
4247 default:
4248 gcc_unreachable ();
4251 if (name)
4253 char* tmp = concat (prefix, "_", name, NULL);
4254 new_vect_var = create_tmp_reg (type, tmp);
4255 free (tmp);
4257 else
4258 new_vect_var = create_tmp_reg (type, prefix);
4260 return new_vect_var;
4263 /* Like vect_get_new_vect_var but return an SSA name. */
4265 tree
4266 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4268 const char *prefix;
4269 tree new_vect_var;
4271 switch (var_kind)
4273 case vect_simple_var:
4274 prefix = "vect";
4275 break;
4276 case vect_scalar_var:
4277 prefix = "stmp";
4278 break;
4279 case vect_pointer_var:
4280 prefix = "vectp";
4281 break;
4282 default:
4283 gcc_unreachable ();
4286 if (name)
4288 char* tmp = concat (prefix, "_", name, NULL);
4289 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4290 free (tmp);
4292 else
4293 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4295 return new_vect_var;
4298 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4300 static void
4301 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4303 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4304 int misalign = DR_MISALIGNMENT (dr_info);
4305 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4306 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4307 else
4308 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4309 DR_TARGET_ALIGNMENT (dr_info), misalign);
4312 /* Function vect_create_addr_base_for_vector_ref.
4314 Create an expression that computes the address of the first memory location
4315 that will be accessed for a data reference.
4317 Input:
4318 STMT_INFO: The statement containing the data reference.
4319 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4320 OFFSET: Optional. If supplied, it is added to the initial address.
4321 LOOP: Specify the loop-nest relative to which the address should be computed.
4322 For example, when the dataref is in an inner-loop nested in an
4323 outer-loop that is now being vectorized, LOOP can be either the
4324 outer-loop, or the inner-loop. The first memory location accessed
4325 by the following dataref ('in' points to short):
4327 for (i=0; i<N; i++)
4328 for (j=0; j<M; j++)
4329 s += in[i+j]
4331 is as follows:
4332 if LOOP=i_loop: &in (relative to i_loop)
4333 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4334 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4335 initial address. Unlike OFFSET, which is a number of elements to
4336 be added, BYTE_OFFSET is measured in bytes.
4338 Output:
4339 1. Return an SSA_NAME whose value is the address of the memory location of
4340 the first vector of the data reference.
4341 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4342 these statement(s) which define the returned SSA_NAME.
4344 FORNOW: We are only handling array accesses with step 1. */
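/* Typical use (illustrative sketch, not from the original source):

     gimple_seq stmts = NULL;
     tree addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
                                                       NULL_TREE, NULL_TREE);
     if (stmts)
       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

   which mirrors how vect_create_data_ref_ptr below consumes the result.  */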
4346 tree
4347 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4348 gimple_seq *new_stmt_list,
4349 tree offset,
4350 tree byte_offset)
4352 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4353 struct data_reference *dr = dr_info->dr;
4354 const char *base_name;
4355 tree addr_base;
4356 tree dest;
4357 gimple_seq seq = NULL;
4358 tree vect_ptr_type;
4359 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4360 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4361 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4363 tree data_ref_base = unshare_expr (drb->base_address);
4364 tree base_offset = unshare_expr (drb->offset);
4365 tree init = unshare_expr (drb->init);
4367 if (loop_vinfo)
4368 base_name = get_name (data_ref_base);
4369 else
4371 base_offset = ssize_int (0);
4372 init = ssize_int (0);
4373 base_name = get_name (DR_REF (dr));
4376 /* Create base_offset */
4377 base_offset = size_binop (PLUS_EXPR,
4378 fold_convert (sizetype, base_offset),
4379 fold_convert (sizetype, init));
4381 if (offset)
4383 offset = fold_build2 (MULT_EXPR, sizetype,
4384 fold_convert (sizetype, offset), step);
4385 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4386 base_offset, offset);
4388 if (byte_offset)
4390 byte_offset = fold_convert (sizetype, byte_offset);
4391 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4392 base_offset, byte_offset);
4395 /* base + base_offset */
4396 if (loop_vinfo)
4397 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4398 else
4400 addr_base = build1 (ADDR_EXPR,
4401 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4402 unshare_expr (DR_REF (dr)));
4405 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4406 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4407 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4408 gimple_seq_add_seq (new_stmt_list, seq);
4410 if (DR_PTR_INFO (dr)
4411 && TREE_CODE (addr_base) == SSA_NAME
4412 && !SSA_NAME_PTR_INFO (addr_base))
4414 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4415 if (offset || byte_offset)
4416 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4422 return addr_base;
4426 /* Function vect_create_data_ref_ptr.
4428 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4429 location accessed in the loop by STMT_INFO, along with the def-use update
4430 chain to appropriately advance the pointer through the loop iterations.
4431 Also set aliasing information for the pointer. This pointer is used by
4432 the callers to this function to create a memory reference expression for
4433 vector load/store access.
4435 Input:
4436 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4437 GIMPLE_ASSIGN <name, data-ref> or
4438 GIMPLE_ASSIGN <data-ref, name>.
4439 2. AGGR_TYPE: the type of the reference, which should be either a vector
4440 or an array.
4441 3. AT_LOOP: the loop where the vector memref is to be created.
4442 4. OFFSET (optional): an offset to be added to the initial address accessed
4443 by the data-ref in STMT_INFO.
4444 5. BSI: location where the new stmts are to be placed if there is no loop
4445 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4446 pointing to the initial address.
4447 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4448 to the initial address accessed by the data-ref in STMT_INFO. This is
4449 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4450 in bytes.
4451 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4452 to the IV during each iteration of the loop. NULL says to move
4453 by one copy of AGGR_TYPE up or down, depending on the step of the
4454 data reference.
4456 Output:
4457 1. Declare a new ptr to vector_type, and have it point to the base of the
4458 data reference (initial address accessed by the data reference).
4459 For example, for vector of type V8HI, the following code is generated:
4461 v8hi *ap;
4462 ap = (v8hi *)initial_address;
4464 if OFFSET is not supplied:
4465 initial_address = &a[init];
4466 if OFFSET is supplied:
4467 initial_address = &a[init + OFFSET];
4468 if BYTE_OFFSET is supplied:
4469 initial_address = &a[init] + BYTE_OFFSET;
4471 Return the initial_address in INITIAL_ADDRESS.
4473 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4474 update the pointer in each iteration of the loop.
4476 Return the increment stmt that updates the pointer in PTR_INCR.
4478 3. Return the pointer. */
4480 tree
4481 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4482 struct loop *at_loop, tree offset,
4483 tree *initial_address, gimple_stmt_iterator *gsi,
4484 gimple **ptr_incr, bool only_init,
4485 tree byte_offset, tree iv_step)
4487 const char *base_name;
4488 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4489 struct loop *loop = NULL;
4490 bool nested_in_vect_loop = false;
4491 struct loop *containing_loop = NULL;
4492 tree aggr_ptr_type;
4493 tree aggr_ptr;
4494 tree new_temp;
4495 gimple_seq new_stmt_list = NULL;
4496 edge pe = NULL;
4497 basic_block new_bb;
4498 tree aggr_ptr_init;
4499 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4500 struct data_reference *dr = dr_info->dr;
4501 tree aptr;
4502 gimple_stmt_iterator incr_gsi;
4503 bool insert_after;
4504 tree indx_before_incr, indx_after_incr;
4505 gimple *incr;
4506 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4508 gcc_assert (iv_step != NULL_TREE
4509 || TREE_CODE (aggr_type) == ARRAY_TYPE
4510 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4512 if (loop_vinfo)
4514 loop = LOOP_VINFO_LOOP (loop_vinfo);
4515 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4516 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4517 pe = loop_preheader_edge (loop);
4519 else
4521 gcc_assert (bb_vinfo);
4522 only_init = true;
4523 *ptr_incr = NULL;
4526 /* Create an expression for the first address accessed by this load
4527 in LOOP. */
4528 base_name = get_name (DR_BASE_ADDRESS (dr));
4530 if (dump_enabled_p ())
4532 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4533 dump_printf_loc (MSG_NOTE, vect_location,
4534 "create %s-pointer variable to type: %T",
4535 get_tree_code_name (TREE_CODE (aggr_type)),
4536 aggr_type);
4537 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4538 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4539 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4540 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4541 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4542 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4543 else
4544 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4545 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4548 /* (1) Create the new aggregate-pointer variable.
4549 Vector and array types inherit the alias set of their component
4550 type by default so we need to use a ref-all pointer if the data
4551 reference does not conflict with the created aggregated data
4552 reference because it is not addressable. */
4553 bool need_ref_all = false;
4554 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4555 get_alias_set (DR_REF (dr))))
4556 need_ref_all = true;
4557 /* Likewise for any of the data references in the stmt group. */
4558 else if (DR_GROUP_SIZE (stmt_info) > 1)
4560 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4563 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4564 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4565 get_alias_set (DR_REF (sdr))))
4567 need_ref_all = true;
4568 break;
4570 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4572 while (sinfo);
4574 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4575 need_ref_all);
4576 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4579 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4580 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4581 def-use update cycles for the pointer: one relative to the outer-loop
4582 (LOOP), which is what steps (3) and (4) below do. The other is relative
4583 to the inner-loop (which is the inner-most loop containing the dataref),
4584 and this is done by step (5) below.
4586 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4587 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4588 redundant. Steps (3),(4) create the following:
4590 vp0 = &base_addr;
4591 LOOP: vp1 = phi(vp0,vp2)
4594 vp2 = vp1 + step
4595 goto LOOP
4597 If there is an inner-loop nested in loop, then step (5) will also be
4598 applied, and an additional update in the inner-loop will be created:
4600 vp0 = &base_addr;
4601 LOOP: vp1 = phi(vp0,vp2)
4603 inner: vp3 = phi(vp1,vp4)
4604 vp4 = vp3 + inner_step
4605 if () goto inner
4607 vp2 = vp1 + step
4608 if () goto LOOP */
4610 /* (2) Calculate the initial address of the aggregate-pointer, and set
4611 the aggregate-pointer to point to it before the loop. */
4613 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4615 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4616 offset, byte_offset);
4617 if (new_stmt_list)
4619 if (pe)
4621 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4622 gcc_assert (!new_bb);
4624 else
4625 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4628 *initial_address = new_temp;
4629 aggr_ptr_init = new_temp;
4631 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4632 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4633 inner-loop nested in LOOP (during outer-loop vectorization). */
4635 /* No update in loop is required. */
4636 if (only_init && (!loop_vinfo || at_loop == loop))
4637 aptr = aggr_ptr_init;
4638 else
4640 /* Accesses to invariant addresses should be handled specially
4641 by the caller. */
4642 tree step = vect_dr_behavior (dr_info)->step;
4643 gcc_assert (!integer_zerop (step));
4645 if (iv_step == NULL_TREE)
4647 /* The step of the aggregate pointer is the type size,
4648 negated for downward accesses. */
4649 iv_step = TYPE_SIZE_UNIT (aggr_type);
4650 if (tree_int_cst_sgn (step) == -1)
4651 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4654 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4656 create_iv (aggr_ptr_init,
4657 fold_convert (aggr_ptr_type, iv_step),
4658 aggr_ptr, loop, &incr_gsi, insert_after,
4659 &indx_before_incr, &indx_after_incr);
4660 incr = gsi_stmt (incr_gsi);
4661 loop_vinfo->add_stmt (incr);
4663 /* Copy the points-to information if it exists. */
4664 if (DR_PTR_INFO (dr))
4666 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4667 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4669 if (ptr_incr)
4670 *ptr_incr = incr;
4672 aptr = indx_before_incr;
4675 if (!nested_in_vect_loop || only_init)
4676 return aptr;
4679 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4680 nested in LOOP, if it exists. */
4682 gcc_assert (nested_in_vect_loop);
4683 if (!only_init)
4685 standard_iv_increment_position (containing_loop, &incr_gsi,
4686 &insert_after);
4687 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4688 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4689 &indx_after_incr);
4690 incr = gsi_stmt (incr_gsi);
4691 loop_vinfo->add_stmt (incr);
4693 /* Copy the points-to information if it exists. */
4694 if (DR_PTR_INFO (dr))
4696 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4697 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4699 if (ptr_incr)
4700 *ptr_incr = incr;
4702 return indx_before_incr;
4704 else
4705 gcc_unreachable ();
4709 /* Function bump_vector_ptr
4711 Increment a pointer (to a vector type) by vector-size. If requested,
4712 i.e. if PTR-INCR is given, then also connect the new increment stmt
4713 to the existing def-use update-chain of the pointer, by modifying
4714 the PTR_INCR as illustrated below:
4716 The pointer def-use update-chain before this function:
4717 DATAREF_PTR = phi (p_0, p_2)
4718 ....
4719 PTR_INCR: p_2 = DATAREF_PTR + step
4721 The pointer def-use update-chain after this function:
4722 DATAREF_PTR = phi (p_0, p_2)
4723 ....
4724 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4725 ....
4726 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4728 Input:
4729 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4730 in the loop.
4731 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4732 the loop. The increment amount across iterations is expected
4733 to be vector_size.
4734 BSI - location where the new update stmt is to be placed.
4735 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4736 BUMP - optional. The offset by which to bump the pointer. If not given,
4737 the offset is assumed to be vector_size.
4739 Output: Return NEW_DATAREF_PTR as illustrated above.
4743 tree
4744 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4745 stmt_vec_info stmt_info, tree bump)
4747 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4748 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4749 tree update = TYPE_SIZE_UNIT (vectype);
4750 gassign *incr_stmt;
4751 ssa_op_iter iter;
4752 use_operand_p use_p;
4753 tree new_dataref_ptr;
4755 if (bump)
4756 update = bump;
4758 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4759 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4760 else
4761 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4762 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4763 dataref_ptr, update);
4764 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4766 /* Copy the points-to information if it exists. */
4767 if (DR_PTR_INFO (dr))
4769 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4770 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4773 if (!ptr_incr)
4774 return new_dataref_ptr;
4776 /* Update the vector-pointer's cross-iteration increment. */
4777 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4779 tree use = USE_FROM_PTR (use_p);
4781 if (use == dataref_ptr)
4782 SET_USE (use_p, new_dataref_ptr);
4783 else
4784 gcc_assert (operand_equal_p (use, update, 0));
4787 return new_dataref_ptr;
4791 /* Copy memory reference info such as base/clique from the SRC reference
4792 to the DEST MEM_REF. */
4794 void
4795 vect_copy_ref_info (tree dest, tree src)
4797 if (TREE_CODE (dest) != MEM_REF)
4798 return;
4800 tree src_base = src;
4801 while (handled_component_p (src_base))
4802 src_base = TREE_OPERAND (src_base, 0);
4803 if (TREE_CODE (src_base) != MEM_REF
4804 && TREE_CODE (src_base) != TARGET_MEM_REF)
4805 return;
4807 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4808 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4812 /* Function vect_create_destination_var.
4814 Create a new temporary of type VECTYPE. */
4816 tree
4817 vect_create_destination_var (tree scalar_dest, tree vectype)
4819 tree vec_dest;
4820 const char *name;
4821 char *new_name;
4822 tree type;
4823 enum vect_var_kind kind;
4825 kind = vectype
4826 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4827 ? vect_mask_var
4828 : vect_simple_var
4829 : vect_scalar_var;
4830 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4832 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4834 name = get_name (scalar_dest);
4835 if (name)
4836 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4837 else
4838 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4839 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4840 free (new_name);
4842 return vec_dest;
4845 /* Function vect_grouped_store_supported.
4847 Returns TRUE if interleave high and interleave low permutations
4848 are supported, and FALSE otherwise. */
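/* For instance (illustrative only): with 4-element vectors, the two
   permutations whose support is queried below are

     interleave_high: { 0, 4, 1, 5 }
     interleave_low:  { 2, 6, 3, 7 }

   i.e. the encoding built from sel[i * 2] = i, sel[i * 2 + 1] = i + nelt
   and its variant shifted by nelt / 2.  */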
4850 bool
4851 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4853 machine_mode mode = TYPE_MODE (vectype);
4855 /* vect_permute_store_chain requires the group size to be equal to 3 or
4856 be a power of two. */
4857 if (count != 3 && exact_log2 (count) == -1)
4859 if (dump_enabled_p ())
4860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4861 "the size of the group of accesses"
4862 " is not a power of 2 or not equal to 3\n");
4863 return false;
4866 /* Check that the permutation is supported. */
4867 if (VECTOR_MODE_P (mode))
4869 unsigned int i;
4870 if (count == 3)
4872 unsigned int j0 = 0, j1 = 0, j2 = 0;
4873 unsigned int i, j;
4875 unsigned int nelt;
4876 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4878 if (dump_enabled_p ())
4879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4880 "cannot handle groups of 3 stores for"
4881 " variable-length vectors\n");
4882 return false;
4885 vec_perm_builder sel (nelt, nelt, 1);
4886 sel.quick_grow (nelt);
4887 vec_perm_indices indices;
4888 for (j = 0; j < 3; j++)
4890 int nelt0 = ((3 - j) * nelt) % 3;
4891 int nelt1 = ((3 - j) * nelt + 1) % 3;
4892 int nelt2 = ((3 - j) * nelt + 2) % 3;
4893 for (i = 0; i < nelt; i++)
4895 if (3 * i + nelt0 < nelt)
4896 sel[3 * i + nelt0] = j0++;
4897 if (3 * i + nelt1 < nelt)
4898 sel[3 * i + nelt1] = nelt + j1++;
4899 if (3 * i + nelt2 < nelt)
4900 sel[3 * i + nelt2] = 0;
4902 indices.new_vector (sel, 2, nelt);
4903 if (!can_vec_perm_const_p (mode, indices))
4905 if (dump_enabled_p ())
4906 dump_printf (MSG_MISSED_OPTIMIZATION,
4907 "permutation op not supported by target.\n");
4908 return false;
4911 for (i = 0; i < nelt; i++)
4913 if (3 * i + nelt0 < nelt)
4914 sel[3 * i + nelt0] = 3 * i + nelt0;
4915 if (3 * i + nelt1 < nelt)
4916 sel[3 * i + nelt1] = 3 * i + nelt1;
4917 if (3 * i + nelt2 < nelt)
4918 sel[3 * i + nelt2] = nelt + j2++;
4920 indices.new_vector (sel, 2, nelt);
4921 if (!can_vec_perm_const_p (mode, indices))
4923 if (dump_enabled_p ())
4924 dump_printf (MSG_MISSED_OPTIMIZATION,
4925 "permutation op not supported by target.\n");
4926 return false;
4929 return true;
4931 else
4933 /* If length is not equal to 3 then only a power of 2 is supported. */
4934 gcc_assert (pow2p_hwi (count));
4935 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4937 /* The encoding has 2 interleaved stepped patterns. */
4938 vec_perm_builder sel (nelt, 2, 3);
4939 sel.quick_grow (6);
4940 for (i = 0; i < 3; i++)
4942 sel[i * 2] = i;
4943 sel[i * 2 + 1] = i + nelt;
4945 vec_perm_indices indices (sel, 2, nelt);
4946 if (can_vec_perm_const_p (mode, indices))
4948 for (i = 0; i < 6; i++)
4949 sel[i] += exact_div (nelt, 2);
4950 indices.new_vector (sel, 2, nelt);
4951 if (can_vec_perm_const_p (mode, indices))
4952 return true;
4957 if (dump_enabled_p ())
4958 dump_printf (MSG_MISSED_OPTIMIZATION,
4959 "permutation op not supported by target.\n");
4960 return false;
4964 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
4965 type VECTYPE. MASKED_P says whether the masked form is needed. */
4967 bool
4968 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
4969 bool masked_p)
4971 if (masked_p)
4972 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
4973 vec_mask_store_lanes_optab,
4974 vectype, count);
4975 else
4976 return vect_lanes_optab_supported_p ("vec_store_lanes",
4977 vec_store_lanes_optab,
4978 vectype, count);
4982 /* Function vect_permute_store_chain.
4984 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4985 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4986 the data correctly for the stores. Return the final references for stores
4987 in RESULT_CHAIN.
4989 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4990 The input is 4 vectors each containing 8 elements. We assign a number to
4991 each element, the input sequence is:
4993 1st vec: 0 1 2 3 4 5 6 7
4994 2nd vec: 8 9 10 11 12 13 14 15
4995 3rd vec: 16 17 18 19 20 21 22 23
4996 4th vec: 24 25 26 27 28 29 30 31
4998 The output sequence should be:
5000 1st vec: 0 8 16 24 1 9 17 25
5001 2nd vec: 2 10 18 26 3 11 19 27
5002 3rd vec: 4 12 20 28 5 13 21 29
5003 4th vec: 6 14 22 30 7 15 23 31
5005 i.e., we interleave the contents of the four vectors in their order.
5007 We use interleave_high/low instructions to create such output. The input of
5008 each interleave_high/low operation is two vectors:
5009 1st vec 2nd vec
5010 0 1 2 3 4 5 6 7
5011 the even elements of the result vector are obtained left-to-right from the
5012 high/low elements of the first vector. The odd elements of the result are
5013 obtained left-to-right from the high/low elements of the second vector.
5014 The output of interleave_high will be: 0 4 1 5
5015 and of interleave_low: 2 6 3 7
5018 The permutation is done in log LENGTH stages. In each stage interleave_high
5019 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5020 where the first argument is taken from the first half of DR_CHAIN and the
5021 second argument from its second half.
5022 In our example,
5024 I1: interleave_high (1st vec, 3rd vec)
5025 I2: interleave_low (1st vec, 3rd vec)
5026 I3: interleave_high (2nd vec, 4th vec)
5027 I4: interleave_low (2nd vec, 4th vec)
5029 The output for the first stage is:
5031 I1: 0 16 1 17 2 18 3 19
5032 I2: 4 20 5 21 6 22 7 23
5033 I3: 8 24 9 25 10 26 11 27
5034 I4: 12 28 13 29 14 30 15 31
5036 The output of the second stage, i.e. the final result is:
5038 I1: 0 8 16 24 1 9 17 25
5039 I2: 2 10 18 26 3 11 19 27
5040 I3: 4 12 20 28 5 13 21 29
5041 I4: 6 14 22 30 7 15 23 31. */
5043 void
5044 vect_permute_store_chain (vec<tree> dr_chain,
5045 unsigned int length,
5046 stmt_vec_info stmt_info,
5047 gimple_stmt_iterator *gsi,
5048 vec<tree> *result_chain)
5050 tree vect1, vect2, high, low;
5051 gimple *perm_stmt;
5052 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5053 tree perm_mask_low, perm_mask_high;
5054 tree data_ref;
5055 tree perm3_mask_low, perm3_mask_high;
5056 unsigned int i, j, n, log_length = exact_log2 (length);
5058 result_chain->quick_grow (length);
5059 memcpy (result_chain->address (), dr_chain.address (),
5060 length * sizeof (tree));
5062 if (length == 3)
5064 /* vect_grouped_store_supported ensures that this is constant. */
5065 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5066 unsigned int j0 = 0, j1 = 0, j2 = 0;
5068 vec_perm_builder sel (nelt, nelt, 1);
5069 sel.quick_grow (nelt);
5070 vec_perm_indices indices;
5071 for (j = 0; j < 3; j++)
5073 int nelt0 = ((3 - j) * nelt) % 3;
5074 int nelt1 = ((3 - j) * nelt + 1) % 3;
5075 int nelt2 = ((3 - j) * nelt + 2) % 3;
5077 for (i = 0; i < nelt; i++)
5079 if (3 * i + nelt0 < nelt)
5080 sel[3 * i + nelt0] = j0++;
5081 if (3 * i + nelt1 < nelt)
5082 sel[3 * i + nelt1] = nelt + j1++;
5083 if (3 * i + nelt2 < nelt)
5084 sel[3 * i + nelt2] = 0;
5086 indices.new_vector (sel, 2, nelt);
5087 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5089 for (i = 0; i < nelt; i++)
5091 if (3 * i + nelt0 < nelt)
5092 sel[3 * i + nelt0] = 3 * i + nelt0;
5093 if (3 * i + nelt1 < nelt)
5094 sel[3 * i + nelt1] = 3 * i + nelt1;
5095 if (3 * i + nelt2 < nelt)
5096 sel[3 * i + nelt2] = nelt + j2++;
5098 indices.new_vector (sel, 2, nelt);
5099 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5101 vect1 = dr_chain[0];
5102 vect2 = dr_chain[1];
5104 /* Create interleaving stmt:
5105 low = VEC_PERM_EXPR <vect1, vect2,
5106 {j, nelt, *, j + 1, nelt + j + 1, *,
5107 j + 2, nelt + j + 2, *, ...}> */
5108 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5109 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5110 vect2, perm3_mask_low);
5111 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5113 vect1 = data_ref;
5114 vect2 = dr_chain[2];
5115 /* Create interleaving stmt:
5116 low = VEC_PERM_EXPR <vect1, vect2,
5117 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5118 6, 7, nelt + j + 2, ...}> */
5119 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5120 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5121 vect2, perm3_mask_high);
5122 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5123 (*result_chain)[j] = data_ref;
5126 else
5128 /* If length is not equal to 3 then only a power of 2 is supported. */
5129 gcc_assert (pow2p_hwi (length));
5131 /* The encoding has 2 interleaved stepped patterns. */
5132 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5133 vec_perm_builder sel (nelt, 2, 3);
5134 sel.quick_grow (6);
5135 for (i = 0; i < 3; i++)
5137 sel[i * 2] = i;
5138 sel[i * 2 + 1] = i + nelt;
5140 vec_perm_indices indices (sel, 2, nelt);
5141 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5143 for (i = 0; i < 6; i++)
5144 sel[i] += exact_div (nelt, 2);
5145 indices.new_vector (sel, 2, nelt);
5146 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5148 for (i = 0, n = log_length; i < n; i++)
5150 for (j = 0; j < length/2; j++)
5152 vect1 = dr_chain[j];
5153 vect2 = dr_chain[j+length/2];
5155 /* Create interleaving stmt:
5156 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5157 ...}> */
5158 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5159 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5160 vect2, perm_mask_high);
5161 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5162 (*result_chain)[2*j] = high;
5164 /* Create interleaving stmt:
5165 low = VEC_PERM_EXPR <vect1, vect2,
5166 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5167 ...}> */
5168 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5169 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5170 vect2, perm_mask_low);
5171 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5172 (*result_chain)[2*j+1] = low;
5174 memcpy (dr_chain.address (), result_chain->address (),
5175 length * sizeof (tree));
5180 /* Function vect_setup_realignment
5182 This function is called when vectorizing an unaligned load using
5183 the dr_explicit_realign[_optimized] scheme.
5184 This function generates the following code at the loop prolog:
5186 p = initial_addr;
5187 x msq_init = *(floor(p)); # prolog load
5188 realignment_token = call target_builtin;
5189 loop:
5190 x msq = phi (msq_init, ---)
5192 The stmts marked with x are generated only for the case of
5193 dr_explicit_realign_optimized.
5195 The code above sets up a new (vector) pointer, pointing to the first
5196 location accessed by STMT_INFO, and a "floor-aligned" load using that
5197 pointer. It also generates code to compute the "realignment-token"
5198 (if the relevant target hook was defined), and creates a phi-node at the
5199 loop-header bb whose arguments are the result of the prolog-load (created
5200 by this function) and the result of a load that takes place in the loop
5201 (to be created by the caller to this function).
5203 For the case of dr_explicit_realign_optimized:
5204 The caller to this function uses the phi-result (msq) to create the
5205 realignment code inside the loop, and sets up the missing phi argument,
5206 as follows:
5207 loop:
5208 msq = phi (msq_init, lsq)
5209 lsq = *(floor(p')); # load in loop
5210 result = realign_load (msq, lsq, realignment_token);
5212 For the case of dr_explicit_realign:
5213 loop:
5214 msq = *(floor(p)); # load in loop
5215 p' = p + (VS-1);
5216 lsq = *(floor(p')); # load in loop
5217 result = realign_load (msq, lsq, realignment_token);
5219 Input:
5220 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5221 a memory location that may be unaligned.
5222 BSI - place where new code is to be inserted.
5223 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5224 is used.
5226 Output:
5227 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5228 target hook, if defined.
5229 Return value - the result of the loop-header phi node. */
5231 tree
5232 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5233 tree *realignment_token,
5234 enum dr_alignment_support alignment_support_scheme,
5235 tree init_addr,
5236 struct loop **at_loop)
5238 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5239 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5240 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5241 struct data_reference *dr = dr_info->dr;
5242 struct loop *loop = NULL;
5243 edge pe = NULL;
5244 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5245 tree vec_dest;
5246 gimple *inc;
5247 tree ptr;
5248 tree data_ref;
5249 basic_block new_bb;
5250 tree msq_init = NULL_TREE;
5251 tree new_temp;
5252 gphi *phi_stmt;
5253 tree msq = NULL_TREE;
5254 gimple_seq stmts = NULL;
5255 bool compute_in_loop = false;
5256 bool nested_in_vect_loop = false;
5257 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5258 struct loop *loop_for_initial_load = NULL;
5260 if (loop_vinfo)
5262 loop = LOOP_VINFO_LOOP (loop_vinfo);
5263 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5266 gcc_assert (alignment_support_scheme == dr_explicit_realign
5267 || alignment_support_scheme == dr_explicit_realign_optimized);
5269 /* We need to generate three things:
5270 1. the misalignment computation
5271 2. the extra vector load (for the optimized realignment scheme).
5272 3. the phi node for the two vectors from which the realignment is
5273 done (for the optimized realignment scheme). */
5275 /* 1. Determine where to generate the misalignment computation.
5277 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5278 calculation will be generated by this function, outside the loop (in the
5279 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5280 caller, inside the loop.
5282 Background: If the misalignment remains fixed throughout the iterations of
5283 the loop, then both realignment schemes are applicable, and also the
5284 misalignment computation can be done outside LOOP. This is because we are
5285 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5286 are a multiple of VS (the Vector Size), and therefore the misalignment in
5287 different vectorized LOOP iterations is always the same.
5288 The problem arises only if the memory access is in an inner-loop nested
5289 inside LOOP, which is now being vectorized using outer-loop vectorization.
5290 This is the only case when the misalignment of the memory access may not
5291 remain fixed throughout the iterations of the inner-loop (as explained in
5292 detail in vect_supportable_dr_alignment). In this case, not only is the
5293 optimized realignment scheme not applicable, but also the misalignment
5294 computation (and generation of the realignment token that is passed to
5295 REALIGN_LOAD) have to be done inside the loop.
5297 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5298 or not, which in turn determines if the misalignment is computed inside
5299 the inner-loop, or outside LOOP. */
5301 if (init_addr != NULL_TREE || !loop_vinfo)
5303 compute_in_loop = true;
5304 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5308 /* 2. Determine where to generate the extra vector load.
5310 For the optimized realignment scheme, instead of generating two vector
5311 loads in each iteration, we generate a single extra vector load in the
5312 preheader of the loop, and in each iteration reuse the result of the
5313 vector load from the previous iteration. In case the memory access is in
5314 an inner-loop nested inside LOOP, which is now being vectorized using
5315 outer-loop vectorization, we need to determine whether this initial vector
5316 load should be generated at the preheader of the inner-loop, or can be
5317 generated at the preheader of LOOP. If the memory access has no evolution
5318 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5319 to be generated inside LOOP (in the preheader of the inner-loop). */
5321 if (nested_in_vect_loop)
5323 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5324 bool invariant_in_outerloop =
5325 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5326 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5328 else
5329 loop_for_initial_load = loop;
5330 if (at_loop)
5331 *at_loop = loop_for_initial_load;
5333 if (loop_for_initial_load)
5334 pe = loop_preheader_edge (loop_for_initial_load);
5336 /* 3. For the case of the optimized realignment, create the first vector
5337 load at the loop preheader. */
5339 if (alignment_support_scheme == dr_explicit_realign_optimized)
5341 /* Create msq_init = *(floor(p1)) in the loop preheader */
5342 gassign *new_stmt;
5344 gcc_assert (!compute_in_loop);
5345 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5346 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5347 loop_for_initial_load, NULL_TREE,
5348 &init_addr, NULL, &inc, true);
5349 if (TREE_CODE (ptr) == SSA_NAME)
5350 new_temp = copy_ssa_name (ptr);
5351 else
5352 new_temp = make_ssa_name (TREE_TYPE (ptr));
5353 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5354 new_stmt = gimple_build_assign
5355 (new_temp, BIT_AND_EXPR, ptr,
5356 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5357 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5358 gcc_assert (!new_bb);
5359 data_ref
5360 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5361 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5362 vect_copy_ref_info (data_ref, DR_REF (dr));
5363 new_stmt = gimple_build_assign (vec_dest, data_ref);
5364 new_temp = make_ssa_name (vec_dest, new_stmt);
5365 gimple_assign_set_lhs (new_stmt, new_temp);
5366 if (pe)
5368 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5369 gcc_assert (!new_bb);
5371 else
5372 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5374 msq_init = gimple_assign_lhs (new_stmt);
5377 /* 4. Create realignment token using a target builtin, if available.
5378 It is done either inside the containing loop, or before LOOP (as
5379 determined above). */
5381 if (targetm.vectorize.builtin_mask_for_load)
5383 gcall *new_stmt;
5384 tree builtin_decl;
5386 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5387 if (!init_addr)
5389 /* Generate the INIT_ADDR computation outside LOOP. */
5390 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5391 NULL_TREE);
5392 if (loop)
5394 pe = loop_preheader_edge (loop);
5395 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5396 gcc_assert (!new_bb);
5398 else
5399 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5402 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5403 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5404 vec_dest =
5405 vect_create_destination_var (scalar_dest,
5406 gimple_call_return_type (new_stmt));
5407 new_temp = make_ssa_name (vec_dest, new_stmt);
5408 gimple_call_set_lhs (new_stmt, new_temp);
5410 if (compute_in_loop)
5411 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5412 else
5414 /* Generate the misalignment computation outside LOOP. */
5415 pe = loop_preheader_edge (loop);
5416 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5417 gcc_assert (!new_bb);
5420 *realignment_token = gimple_call_lhs (new_stmt);
5422 /* The result of the CALL_EXPR to this builtin is determined from
5423 the value of the parameter and no global variables are touched,
5424 which makes the builtin a "const" function. Requiring the
5425 builtin to have the "const" attribute makes it unnecessary
5426 to call mark_call_clobbered. */
5427 gcc_assert (TREE_READONLY (builtin_decl));
5430 if (alignment_support_scheme == dr_explicit_realign)
5431 return msq;
5433 gcc_assert (!compute_in_loop);
5434 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5437 /* 5. Create msq = phi <msq_init, lsq> in loop */
5439 pe = loop_preheader_edge (containing_loop);
5440 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5441 msq = make_ssa_name (vec_dest);
5442 phi_stmt = create_phi_node (msq, containing_loop->header);
5443 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5445 return msq;
5449 /* Function vect_grouped_load_supported.
5451 COUNT is the size of the load group (the number of statements plus the
5452 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5453 only one statement, with a gap of COUNT - 1.
5455 Returns true if a suitable permute exists. */
5457 bool
5458 vect_grouped_load_supported (tree vectype, bool single_element_p,
5459 unsigned HOST_WIDE_INT count)
5461 machine_mode mode = TYPE_MODE (vectype);
5463 /* If this is single-element interleaving with an element distance
5464 that leaves unused vector loads around, punt - we would at least
5465 create very sub-optimal code in that case (and blow up memory;
5466 see PR65518). */
5467 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5469 if (dump_enabled_p ())
5470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5471 "single-element interleaving not supported "
5472 "for not adjacent vector loads\n");
5473 return false;
5476 /* vect_permute_load_chain requires the group size to be equal to 3 or
5477 be a power of two. */
5478 if (count != 3 && exact_log2 (count) == -1)
5480 if (dump_enabled_p ())
5481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5482 "the size of the group of accesses"
5483 " is not a power of 2 or not equal to 3\n");
5484 return false;
5487 /* Check that the permutation is supported. */
5488 if (VECTOR_MODE_P (mode))
5490 unsigned int i, j;
5491 if (count == 3)
5493 unsigned int nelt;
5494 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5498 "cannot handle groups of 3 loads for"
5499 " variable-length vectors\n");
5500 return false;
5503 vec_perm_builder sel (nelt, nelt, 1);
5504 sel.quick_grow (nelt);
5505 vec_perm_indices indices;
5506 unsigned int k;
5507 for (k = 0; k < 3; k++)
5509 for (i = 0; i < nelt; i++)
5510 if (3 * i + k < 2 * nelt)
5511 sel[i] = 3 * i + k;
5512 else
5513 sel[i] = 0;
5514 indices.new_vector (sel, 2, nelt);
5515 if (!can_vec_perm_const_p (mode, indices))
5517 if (dump_enabled_p ())
5518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5519 "shuffle of 3 loads is not supported by"
5520 " target\n");
5521 return false;
5523 for (i = 0, j = 0; i < nelt; i++)
5524 if (3 * i + k < 2 * nelt)
5525 sel[i] = i;
5526 else
5527 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5528 indices.new_vector (sel, 2, nelt);
5529 if (!can_vec_perm_const_p (mode, indices))
5531 if (dump_enabled_p ())
5532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5533 "shuffle of 3 loads is not supported by"
5534 " target\n");
5535 return false;
5538 return true;
5540 else
5542 /* If length is not equal to 3 then only power of 2 is supported. */
5543 gcc_assert (pow2p_hwi (count));
5544 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5546 /* The encoding has a single stepped pattern. */
5547 vec_perm_builder sel (nelt, 1, 3);
5548 sel.quick_grow (3);
5549 for (i = 0; i < 3; i++)
5550 sel[i] = i * 2;
5551 vec_perm_indices indices (sel, 2, nelt);
5552 if (can_vec_perm_const_p (mode, indices))
5554 for (i = 0; i < 3; i++)
5555 sel[i] = i * 2 + 1;
5556 indices.new_vector (sel, 2, nelt);
5557 if (can_vec_perm_const_p (mode, indices))
5558 return true;
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "extract even/odd not supported by target\n");
5566 return false;
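/* Illustrative sketch (kept under #if 0, never compiled): the two
   selectors whose support is checked above for the COUNT == 3 case,
   expanded in plain C for a hypothetical NELT of 8 and applied to three
   vectors holding 0..23.  The vec_perm helper stands in for
   VEC_PERM_EXPR; the printed rows are {0 3 6 ...}, {1 4 7 ...} and
   {2 5 8 ...}.  */
#if 0
#include <stdio.h>

#define NELT 8	/* hypothetical number of elements per vector */

/* Emulate VEC_PERM_EXPR on the concatenation of V1 and V2.  */
static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  /* Three loaded vectors holding the elements 0..23.  */
  int chain[3][NELT];
  for (unsigned j = 0; j < 3; j++)
    for (unsigned i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (unsigned k = 0; k < 3; k++)
    {
      unsigned sel[NELT], i, j;
      int low[NELT], out[NELT];

      /* First selector: {k, 3 + k, 6 + k, ...}, padded with 0.  */
      for (i = 0; i < NELT; i++)
	sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      vec_perm (chain[0], chain[1], sel, low);

      /* Second selector: keep the gathered prefix, take the remaining
	 elements from the third vector.  */
      for (i = 0, j = 0; i < NELT; i++)
	sel[i] = (3 * i + k < 2 * NELT
		  ? i : NELT + ((NELT + k) % 3) + 3 * (j++));
      vec_perm (low, chain[2], sel, out);

      for (i = 0; i < NELT; i++)
	printf ("%2d ", out[i]);
      printf ("\n");
    }
  return 0;
}
#endif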
5569 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5570 type VECTYPE. MASKED_P says whether the masked form is needed. */
5572 bool
5573 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5574 bool masked_p)
5576 if (masked_p)
5577 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5578 vec_mask_load_lanes_optab,
5579 vectype, count);
5580 else
5581 return vect_lanes_optab_supported_p ("vec_load_lanes",
5582 vec_load_lanes_optab,
5583 vectype, count);
5586 /* Function vect_permute_load_chain.
5588 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5589 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5590 the input data correctly. Return the final references for loads in
5591 RESULT_CHAIN.
5593 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5594 The input is 4 vectors each containing 8 elements. We assign a number to
5595 each element; the input sequence is:
5597 1st vec: 0 1 2 3 4 5 6 7
5598 2nd vec: 8 9 10 11 12 13 14 15
5599 3rd vec: 16 17 18 19 20 21 22 23
5600 4th vec: 24 25 26 27 28 29 30 31
5602 The output sequence should be:
5604 1st vec: 0 4 8 12 16 20 24 28
5605 2nd vec: 1 5 9 13 17 21 25 29
5606 3rd vec: 2 6 10 14 18 22 26 30
5607 4th vec: 3 7 11 15 19 23 27 31
5609 i.e., the first output vector should contain the first elements of each
5610 interleaving group, etc.
5612 We use extract_even/odd instructions to create such output. The input of
5613 each extract_even/odd operation is two vectors
5614 1st vec 2nd vec
5615 0 1 2 3 4 5 6 7
5617 and the output is the vector of extracted even/odd elements. The output of
5618 extract_even will be: 0 2 4 6
5619 and of extract_odd: 1 3 5 7
5622 The permutation is done in log LENGTH stages. In each stage extract_even
5623 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5624 their order. In our example,
5626 E1: extract_even (1st vec, 2nd vec)
5627 E2: extract_odd (1st vec, 2nd vec)
5628 E3: extract_even (3rd vec, 4th vec)
5629 E4: extract_odd (3rd vec, 4th vec)
5631 The output for the first stage will be:
5633 E1: 0 2 4 6 8 10 12 14
5634 E2: 1 3 5 7 9 11 13 15
5635 E3: 16 18 20 22 24 26 28 30
5636 E4: 17 19 21 23 25 27 29 31
5638 In order to proceed and create the correct sequence for the next stage (or
5639 for the correct output, if the second stage is the last one, as in our
5640 example), we first put the output of extract_even operation and then the
5641 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5642 The input for the second stage is:
5644 1st vec (E1): 0 2 4 6 8 10 12 14
5645 2nd vec (E3): 16 18 20 22 24 26 28 30
5646 3rd vec (E2): 1 3 5 7 9 11 13 15
5647 4th vec (E4): 17 19 21 23 25 27 29 31
5649 The output of the second stage:
5651 E1: 0 4 8 12 16 20 24 28
5652 E2: 2 6 10 14 18 22 26 30
5653 E3: 1 5 9 13 17 21 25 29
5654 E4: 3 7 11 15 19 23 27 31
5656 And RESULT_CHAIN after reordering:
5658 1st vec (E1): 0 4 8 12 16 20 24 28
5659 2nd vec (E3): 1 5 9 13 17 21 25 29
5660 3rd vec (E2): 2 6 10 14 18 22 26 30
5661 4th vec (E4): 3 7 11 15 19 23 27 31. */
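/* Illustrative sketch (kept under #if 0, never compiled): a plain-C run
   of the extract_even/extract_odd stages described above for the 4 x 8
   worked example, assuming hypothetical NELT == 8 and LEN == 4.  The
   rows printed at the end are exactly the reordered RESULT_CHAIN shown
   in the comment.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8	/* elements per vector (the shorts of the example) */
#define LEN  4	/* number of vectors in the load chain             */

/* Emulate VEC_PERM_EXPR on the concatenation of V1 and V2.  */
static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  unsigned sel_even[NELT], sel_odd[NELT];
  for (unsigned i = 0; i < NELT; i++)
    {
      sel_even[i] = i * 2;	/* {0, 2, ..., 2*NELT-2} */
      sel_odd[i] = i * 2 + 1;	/* {1, 3, ..., 2*NELT-1} */
    }

  /* DR_CHAIN as in the comment above: elements numbered 0..31.  */
  int chain[LEN][NELT], result[LEN][NELT];
  for (unsigned j = 0; j < LEN; j++)
    for (unsigned i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  /* log2 (LEN) == 2 stages of even/odd extraction.  */
  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < LEN; j += 2)
	{
	  vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
	  vec_perm (chain[j], chain[j + 1], sel_odd,
		    result[j / 2 + LEN / 2]);
	}
      memcpy (chain, result, sizeof chain);
    }

  /* Prints 0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 ... 30 / 3 7 ... 31.  */
  for (unsigned j = 0; j < LEN; j++)
    {
      for (unsigned i = 0; i < NELT; i++)
	printf ("%2d ", chain[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif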
5663 static void
5664 vect_permute_load_chain (vec<tree> dr_chain,
5665 unsigned int length,
5666 stmt_vec_info stmt_info,
5667 gimple_stmt_iterator *gsi,
5668 vec<tree> *result_chain)
5670 tree data_ref, first_vect, second_vect;
5671 tree perm_mask_even, perm_mask_odd;
5672 tree perm3_mask_low, perm3_mask_high;
5673 gimple *perm_stmt;
5674 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5675 unsigned int i, j, log_length = exact_log2 (length);
5677 result_chain->quick_grow (length);
5678 memcpy (result_chain->address (), dr_chain.address (),
5679 length * sizeof (tree));
5681 if (length == 3)
5683 /* vect_grouped_load_supported ensures that this is constant. */
5684 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5685 unsigned int k;
5687 vec_perm_builder sel (nelt, nelt, 1);
5688 sel.quick_grow (nelt);
5689 vec_perm_indices indices;
5690 for (k = 0; k < 3; k++)
5692 for (i = 0; i < nelt; i++)
5693 if (3 * i + k < 2 * nelt)
5694 sel[i] = 3 * i + k;
5695 else
5696 sel[i] = 0;
5697 indices.new_vector (sel, 2, nelt);
5698 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5700 for (i = 0, j = 0; i < nelt; i++)
5701 if (3 * i + k < 2 * nelt)
5702 sel[i] = i;
5703 else
5704 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5705 indices.new_vector (sel, 2, nelt);
5706 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5708 first_vect = dr_chain[0];
5709 second_vect = dr_chain[1];
5711 /* Create interleaving stmt (low part of):
5712 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5713 ...}> */
5714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5716 second_vect, perm3_mask_low);
5717 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5719 /* Create interleaving stmt (high part of):
5720 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5721 ...}> */
5722 first_vect = data_ref;
5723 second_vect = dr_chain[2];
5724 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5725 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5726 second_vect, perm3_mask_high);
5727 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5728 (*result_chain)[k] = data_ref;
5731 else
5733 /* If length is not equal to 3 then only power of 2 is supported. */
5734 gcc_assert (pow2p_hwi (length));
5736 /* The encoding has a single stepped pattern. */
5737 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5738 vec_perm_builder sel (nelt, 1, 3);
5739 sel.quick_grow (3);
5740 for (i = 0; i < 3; ++i)
5741 sel[i] = i * 2;
5742 vec_perm_indices indices (sel, 2, nelt);
5743 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5745 for (i = 0; i < 3; ++i)
5746 sel[i] = i * 2 + 1;
5747 indices.new_vector (sel, 2, nelt);
5748 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5750 for (i = 0; i < log_length; i++)
5752 for (j = 0; j < length; j += 2)
5754 first_vect = dr_chain[j];
5755 second_vect = dr_chain[j+1];
5757 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5760 first_vect, second_vect,
5761 perm_mask_even);
5762 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5763 (*result_chain)[j/2] = data_ref;
5765 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5766 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5767 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5768 first_vect, second_vect,
5769 perm_mask_odd);
5770 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5771 (*result_chain)[j/2+length/2] = data_ref;
5773 memcpy (dr_chain.address (), result_chain->address (),
5774 length * sizeof (tree));
5779 /* Function vect_shift_permute_load_chain.
5781 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5782 sequence of stmts to reorder the input data accordingly.
5783 Return the final references for loads in RESULT_CHAIN.
5784 Return true if it succeeded, false otherwise.
5786 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5787 The input is 3 vectors each containing 8 elements. We assign a
5788 number to each element; the input sequence is:
5790 1st vec: 0 1 2 3 4 5 6 7
5791 2nd vec: 8 9 10 11 12 13 14 15
5792 3rd vec: 16 17 18 19 20 21 22 23
5794 The output sequence should be:
5796 1st vec: 0 3 6 9 12 15 18 21
5797 2nd vec: 1 4 7 10 13 16 19 22
5798 3rd vec: 2 5 8 11 14 17 20 23
5800 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5802 First we shuffle all 3 vectors to get correct elements order:
5804 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5805 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5806 3rd vec: (16 19 22) (17 20 23) (18 21)
5808 Next we unite and shift vector 3 times:
5810 1st step:
5811 shift right by 6 the concatenation of:
5812 "1st vec" and "2nd vec"
5813 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5814 "2nd vec" and "3rd vec"
5815 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5816 "3rd vec" and "1st vec"
5817 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5818 | New vectors |
5820 So that now new vectors are:
5822 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5823 2nd vec: (10 13) (16 19 22) (17 20 23)
5824 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5826 2nd step:
5827 shift right by 5 the concatenation of:
5828 "1st vec" and "3rd vec"
5829 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5830 "2nd vec" and "1st vec"
5831 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5832 "3rd vec" and "2nd vec"
5833 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5834 | New vectors |
5836 So that now new vectors are:
5838 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5839 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5840 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5842 3rd step:
5843 shift right by 5 the concatenation of:
5844 "1st vec" and "1st vec"
5845 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5846 shift right by 3 the concatenation of:
5847 "2nd vec" and "2nd vec"
5848 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5849 | New vectors |
5851 So that now all vectors are READY:
5852 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5853 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5854 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5856 This algorithm is faster than the one in vect_permute_load_chain if:
5857 1. "shift of a concatenation" is faster than a general permutation.
5858 This is usually so.
5859 2. The TARGET machine can't execute vector instructions in parallel.
5860 This is because each step of the algorithm depends on the previous one;
5861 the algorithm in vect_permute_load_chain is much more parallel.
5863 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. */
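/* Illustrative sketch (kept under #if 0, never compiled): the selector
   values that the LENGTH == 3 branch below constructs, recomputed in
   plain C for a hypothetical NELT of 8.  The printed rows match the
   constants quoted in the comments: {0 3 6 1 4 7 2 5}, {6..13}, {5..12},
   {3..10} and {5..12}.  */
#if 0
#include <stdio.h>

#define NELT 8	/* hypothetical number of elements per vector */

static void
print_sel (const char *name, const unsigned *sel)
{
  printf ("%s:", name);
  for (unsigned i = 0; i < NELT; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned sel[NELT];
  unsigned i, k = 0, l = 0;

  /* The shuffle that puts each vector's elements in the right order.  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3 ", sel);

  /* The four shift selectors used by the "unite and shift" steps.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1", sel);

  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2", sel);

  for (i = 0; i < NELT; i++)
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift3", sel);

  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift4", sel);
  return 0;
}
#endif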
5866 static bool
5867 vect_shift_permute_load_chain (vec<tree> dr_chain,
5868 unsigned int length,
5869 stmt_vec_info stmt_info,
5870 gimple_stmt_iterator *gsi,
5871 vec<tree> *result_chain)
5873 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5874 tree perm2_mask1, perm2_mask2, perm3_mask;
5875 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5876 gimple *perm_stmt;
5878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5879 unsigned int i;
5880 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5882 unsigned HOST_WIDE_INT nelt, vf;
5883 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5884 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5885 /* Not supported for variable-length vectors. */
5886 return false;
5888 vec_perm_builder sel (nelt, nelt, 1);
5889 sel.quick_grow (nelt);
5891 result_chain->quick_grow (length);
5892 memcpy (result_chain->address (), dr_chain.address (),
5893 length * sizeof (tree));
5895 if (pow2p_hwi (length) && vf > 4)
5897 unsigned int j, log_length = exact_log2 (length);
5898 for (i = 0; i < nelt / 2; ++i)
5899 sel[i] = i * 2;
5900 for (i = 0; i < nelt / 2; ++i)
5901 sel[nelt / 2 + i] = i * 2 + 1;
5902 vec_perm_indices indices (sel, 2, nelt);
5903 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5905 if (dump_enabled_p ())
5906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5907 "shuffle of 2 fields structure is not \
5908 supported by target\n");
5909 return false;
5911 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5913 for (i = 0; i < nelt / 2; ++i)
5914 sel[i] = i * 2 + 1;
5915 for (i = 0; i < nelt / 2; ++i)
5916 sel[nelt / 2 + i] = i * 2;
5917 indices.new_vector (sel, 2, nelt);
5918 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5922 "shuffle of 2 fields structure is not \
5923 supported by target\n");
5924 return false;
5926 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5928 /* Generating permutation constant to shift all elements.
5929 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5930 for (i = 0; i < nelt; i++)
5931 sel[i] = nelt / 2 + i;
5932 indices.new_vector (sel, 2, nelt);
5933 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5935 if (dump_enabled_p ())
5936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5937 "shift permutation is not supported by target\n");
5938 return false;
5940 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5942 /* Generating permutation constant to select vector from 2.
5943 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5944 for (i = 0; i < nelt / 2; i++)
5945 sel[i] = i;
5946 for (i = nelt / 2; i < nelt; i++)
5947 sel[i] = nelt + i;
5948 indices.new_vector (sel, 2, nelt);
5949 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5951 if (dump_enabled_p ())
5952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5953 "select is not supported by target\n");
5954 return false;
5956 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5958 for (i = 0; i < log_length; i++)
5960 for (j = 0; j < length; j += 2)
5962 first_vect = dr_chain[j];
5963 second_vect = dr_chain[j + 1];
5965 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5966 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5967 first_vect, first_vect,
5968 perm2_mask1);
5969 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5970 vect[0] = data_ref;
5972 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5973 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5974 second_vect, second_vect,
5975 perm2_mask2);
5976 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5977 vect[1] = data_ref;
5979 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5980 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5981 vect[0], vect[1], shift1_mask);
5982 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5983 (*result_chain)[j/2 + length/2] = data_ref;
5985 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5986 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5987 vect[0], vect[1], select_mask);
5988 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5989 (*result_chain)[j/2] = data_ref;
5991 memcpy (dr_chain.address (), result_chain->address (),
5992 length * sizeof (tree));
5994 return true;
5996 if (length == 3 && vf > 2)
5998 unsigned int k = 0, l = 0;
6000 /* Generating permutation constant to get all elements in right order.
6001 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6002 for (i = 0; i < nelt; i++)
6004 if (3 * k + (l % 3) >= nelt)
6006 k = 0;
6007 l += (3 - (nelt % 3));
6009 sel[i] = 3 * k + (l % 3);
6010 k++;
6012 vec_perm_indices indices (sel, 2, nelt);
6013 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6015 if (dump_enabled_p ())
6016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6017 "shuffle of 3 fields structure is not \
6018 supported by target\n");
6019 return false;
6021 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6023 /* Generating permutation constant to shift all elements.
6024 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6025 for (i = 0; i < nelt; i++)
6026 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6027 indices.new_vector (sel, 2, nelt);
6028 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6030 if (dump_enabled_p ())
6031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6032 "shift permutation is not supported by target\n");
6033 return false;
6035 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6037 /* Generating permutation constant to shift all elements.
6038 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6039 for (i = 0; i < nelt; i++)
6040 sel[i] = 2 * (nelt / 3) + 1 + i;
6041 indices.new_vector (sel, 2, nelt);
6042 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6044 if (dump_enabled_p ())
6045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6046 "shift permutation is not supported by target\n");
6047 return false;
6049 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6051 /* Generating permutation constant to shift all elements.
6052 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6053 for (i = 0; i < nelt; i++)
6054 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6055 indices.new_vector (sel, 2, nelt);
6056 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6058 if (dump_enabled_p ())
6059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6060 "shift permutation is not supported by target\n");
6061 return false;
6063 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6065 /* Generating permutation constant to shift all elements.
6066 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6067 for (i = 0; i < nelt; i++)
6068 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6069 indices.new_vector (sel, 2, nelt);
6070 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6072 if (dump_enabled_p ())
6073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6074 "shift permutation is not supported by target\n");
6075 return false;
6077 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6079 for (k = 0; k < 3; k++)
6081 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6082 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6083 dr_chain[k], dr_chain[k],
6084 perm3_mask);
6085 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6086 vect[k] = data_ref;
6089 for (k = 0; k < 3; k++)
6091 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6092 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6093 vect[k % 3], vect[(k + 1) % 3],
6094 shift1_mask);
6095 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6096 vect_shift[k] = data_ref;
6099 for (k = 0; k < 3; k++)
6101 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6102 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6103 vect_shift[(4 - k) % 3],
6104 vect_shift[(3 - k) % 3],
6105 shift2_mask);
6106 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6107 vect[k] = data_ref;
6110 (*result_chain)[3 - (nelt % 3)] = vect[2];
6112 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6113 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6114 vect[0], shift3_mask);
6115 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6116 (*result_chain)[nelt % 3] = data_ref;
6118 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6119 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6120 vect[1], shift4_mask);
6121 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6122 (*result_chain)[0] = data_ref;
6123 return true;
6125 return false;
6128 /* Function vect_transform_grouped_load.
6130 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6131 to perform their permutation and ascribe the result vectorized statements to
6132 the scalar statements. */
6135 void
6136 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6137 int size, gimple_stmt_iterator *gsi)
6139 machine_mode mode;
6140 vec<tree> result_chain = vNULL;
6142 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6143 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6144 vectors, that are ready for vector computation. */
6145 result_chain.create (size);
6147 /* If the reassociation width for the vector type is 2 or greater, the target
6148 machine can execute 2 or more vector instructions in parallel. Otherwise
6149 try to get the chain for the load group using vect_shift_permute_load_chain. */
6150 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6151 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6152 || pow2p_hwi (size)
6153 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6154 gsi, &result_chain))
6155 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6156 vect_record_grouped_load_vectors (stmt_info, result_chain);
6157 result_chain.release ();
6160 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6161 generated as part of the vectorization of STMT_INFO. Assign the statement
6162 for each vector to the associated scalar statement. */
6164 void
6165 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6166 vec<tree> result_chain)
6168 vec_info *vinfo = stmt_info->vinfo;
6169 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6170 unsigned int i, gap_count;
6171 tree tmp_data_ref;
6173 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6174 Since we scan the chain starting from its first node, their order
6175 corresponds to the order of data-refs in RESULT_CHAIN. */
6176 stmt_vec_info next_stmt_info = first_stmt_info;
6177 gap_count = 1;
6178 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6180 if (!next_stmt_info)
6181 break;
6183 /* Skip the gaps. Loads created for the gaps will be removed by a later
6184 dead code elimination pass. No need to check for the first stmt in
6185 the group, since it always exists.
6186 DR_GROUP_GAP is the number of steps in elements from the previous
6187 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6188 correspond to the gaps. */
6189 if (next_stmt_info != first_stmt_info
6190 && gap_count < DR_GROUP_GAP (next_stmt_info))
6192 gap_count++;
6193 continue;
6196 while (next_stmt_info)
6198 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6199 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6200 copies, and we put the new vector statement in the first available
6201 RELATED_STMT. */
6202 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6203 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6204 else
6206 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6208 stmt_vec_info prev_stmt_info
6209 = STMT_VINFO_VEC_STMT (next_stmt_info);
6210 stmt_vec_info rel_stmt_info
6211 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6212 while (rel_stmt_info)
6214 prev_stmt_info = rel_stmt_info;
6215 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6218 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6222 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6223 gap_count = 1;
6224 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6225 put the same TMP_DATA_REF as its vectorized statement; otherwise
6226 get the next data-ref from RESULT_CHAIN. */
6227 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6228 break;
6233 /* Function vect_can_force_dr_alignment_p.
6235 Returns whether the alignment of DECL can be forced to be on an
6236 ALIGNMENT-bit boundary. */
6238 bool
6239 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6241 if (!VAR_P (decl))
6242 return false;
6244 if (decl_in_symtab_p (decl)
6245 && !symtab_node::get (decl)->can_increase_alignment_p ())
6246 return false;
6248 if (TREE_STATIC (decl))
6249 return (alignment <= MAX_OFILE_ALIGNMENT);
6250 else
6251 return (alignment <= MAX_STACK_ALIGNMENT);
6255 /* Return whether the data reference DR_INFO is supported with respect to its
6256 alignment.
6257 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6258 if it is aligned, i.e., check if it is possible to vectorize it with a
6259 different alignment. */
6261 enum dr_alignment_support
6262 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6263 bool check_aligned_accesses)
6265 data_reference *dr = dr_info->dr;
6266 stmt_vec_info stmt_info = dr_info->stmt;
6267 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6268 machine_mode mode = TYPE_MODE (vectype);
6269 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6270 struct loop *vect_loop = NULL;
6271 bool nested_in_vect_loop = false;
6273 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6274 return dr_aligned;
6276 /* For now assume all conditional loads/stores support unaligned
6277 access without any special code. */
6278 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6279 if (gimple_call_internal_p (stmt)
6280 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6281 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6282 return dr_unaligned_supported;
6284 if (loop_vinfo)
6286 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6287 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6290 /* Possibly unaligned access. */
6292 /* We can choose between using the implicit realignment scheme (generating
6293 a misaligned_move stmt) and the explicit realignment scheme (generating
6294 aligned loads with a REALIGN_LOAD). There are two variants to the
6295 explicit realignment scheme: optimized, and unoptimized.
6296 We can optimize the realignment only if the step between consecutive
6297 vector loads is equal to the vector size. Since the vector memory
6298 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6299 is guaranteed that the misalignment amount remains the same throughout the
6300 execution of the vectorized loop. Therefore, we can create the
6301 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6302 at the loop preheader.
6304 However, in the case of outer-loop vectorization, when vectorizing a
6305 memory access in the inner-loop nested within the LOOP that is now being
6306 vectorized, while it is guaranteed that the misalignment of the
6307 vectorized memory access will remain the same in different outer-loop
6308 iterations, it is *not* guaranteed that it will remain the same throughout
6309 the execution of the inner-loop. This is because the inner-loop advances
6310 with the original scalar step (and not in steps of VS). If the inner-loop
6311 step happens to be a multiple of VS, then the misalignment remains fixed
6312 and we can use the optimized realignment scheme. For example:
6314 for (i=0; i<N; i++)
6315 for (j=0; j<M; j++)
6316 s += a[i+j];
6318 When vectorizing the i-loop in the above example, the step between
6319 consecutive vector loads is 1, and so the misalignment does not remain
6320 fixed across the execution of the inner-loop, and the realignment cannot
6321 be optimized (as illustrated in the following pseudo vectorized loop):
6323 for (i=0; i<N; i+=4)
6324 for (j=0; j<M; j++){
6325 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6326 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6327 // (assuming that we start from an aligned address).
6330 We therefore have to use the unoptimized realignment scheme:
6332 for (i=0; i<N; i+=4)
6333 for (j=k; j<M; j+=4)
6334 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6335 // that the misalignment of the initial address is
6336 // 0).
6338 The loop can then be vectorized as follows:
6340 for (k=0; k<4; k++){
6341 rt = get_realignment_token (&vp[k]);
6342 for (i=0; i<N; i+=4){
6343 v1 = vp[i+k];
6344 for (j=k; j<M; j+=4){
6345 v2 = vp[i+j+VS-1];
6346 va = REALIGN_LOAD <v1,v2,rt>;
6347 vs += va;
6348 v1 = v2;
6351 } */
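/* Illustrative sketch (kept under #if 0, never compiled): the misalignment
   arithmetic behind the two pseudo loops above, assuming a hypothetical
   VS of 4 elements and an aligned base.  The first line cycles
   0 1 2 3 0 1 2 3 ... (inner loop advancing by the scalar step); the
   second stays at a fixed K (inner loop advancing by VS), which is why
   the optimized scheme applies only in the latter case.  */
#if 0
#include <stdio.h>

#define VS 4	/* hypothetical vector size, in elements */

int
main (void)
{
  /* Misalignment of &vp[i+j], in elements, when j advances by 1.  */
  for (int j = 0; j < 2 * VS; j++)
    printf ("%d ", j % VS);
  printf ("\n");

  /* Misalignment of &vp[i+j] when j advances by VS starting at K:
     it stays equal to K, so the realignment token can be hoisted.  */
  int k = 2;
  for (int j = k; j < k + 8 * VS; j += VS)
    printf ("%d ", j % VS);
  printf ("\n");
  return 0;
}
#endif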
6353 if (DR_IS_READ (dr))
6355 bool is_packed = false;
6356 tree type = (TREE_TYPE (DR_REF (dr)));
6358 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6359 && (!targetm.vectorize.builtin_mask_for_load
6360 || targetm.vectorize.builtin_mask_for_load ()))
6362 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6364 /* If we are doing SLP then the accesses need not have the
6365 same alignment, instead it depends on the SLP group size. */
6366 if (loop_vinfo
6367 && STMT_SLP_TYPE (stmt_info)
6368 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6369 * (DR_GROUP_SIZE
6370 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6371 TYPE_VECTOR_SUBPARTS (vectype)))
6373 else if (!loop_vinfo
6374 || (nested_in_vect_loop
6375 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6376 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6377 return dr_explicit_realign;
6378 else
6379 return dr_explicit_realign_optimized;
6381 if (!known_alignment_for_access_p (dr_info))
6382 is_packed = not_size_aligned (DR_REF (dr));
6384 if (targetm.vectorize.support_vector_misalignment
6385 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6386 /* Can't software pipeline the loads, but can at least do them. */
6387 return dr_unaligned_supported;
6389 else
6391 bool is_packed = false;
6392 tree type = (TREE_TYPE (DR_REF (dr)));
6394 if (!known_alignment_for_access_p (dr_info))
6395 is_packed = not_size_aligned (DR_REF (dr));
6397 if (targetm.vectorize.support_vector_misalignment
6398 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6399 return dr_unaligned_supported;
6402 /* Unsupported. */
6403 return dr_unaligned_unsupported;