1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
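/* Prefer a dedicated array mode from the target for COUNT copies of MODE;
   if none exists, fall back to an integer mode wide enough to hold all
   COUNT vectors.  */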
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
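/* For widening operations and conversions the smaller type is on the rhs,
   so also look at the first operand's type and keep the smaller of the
   two.  */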
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the last scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Compute that position and check whether the resulting order matches
216 the current one. */
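/* For instance, if DR_INFO_A's group consists of the loads a[0] and a[1]
   and DR_INFO_B is a single store to b placed between them, the vectorized
   load is emitted at the position of a[1], i.e. after the store, whereas
   the scalar load a[0] preceded it; the order of the pair (a[0], store b)
   is therefore not preserved and we return false.  */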
217 stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
218 if (last_a)
219 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
220 s = DR_GROUP_NEXT_ELEMENT (s))
221 last_a = get_later_stmt (last_a, s);
222 else
223 last_a = stmtinfo_a;
224 stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
225 if (last_b)
226 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
227 s = DR_GROUP_NEXT_ELEMENT (s))
228 last_b = get_later_stmt (last_b, s);
229 else
230 last_b = stmtinfo_b;
231 return ((get_later_stmt (last_a, last_b) == last_a)
232 == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
235 /* A subroutine of vect_analyze_data_ref_dependence. Handle
236 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
237 distances. These distances are conservatively correct but they don't
238 reflect a guaranteed dependence.
240 Return true if this function does all the work necessary to avoid
241 an alias or false if the caller should use the dependence distances
242 to limit the vectorization factor in the usual way. LOOP_DEPTH is
243 the depth of the loop described by LOOP_VINFO and the other arguments
244 are as for vect_analyze_data_ref_dependence. */
246 static bool
247 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
248 loop_vec_info loop_vinfo,
249 int loop_depth, unsigned int *max_vf)
251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
252 lambda_vector dist_v;
253 unsigned int i;
254 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
256 int dist = dist_v[loop_depth];
257 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
259 /* If the user asserted safelen >= DIST consecutive iterations
260 can be executed concurrently, assume independence.
262 ??? An alternative would be to add the alias check even
263 in this case, and vectorize the fallback loop with the
264 maximum VF set to safelen. However, if the user has
265 explicitly given a length, it's less likely that that
266 would be a win. */
267 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
269 if ((unsigned int) loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 continue;
275 /* For dependence distances of 2 or more, we have the option
276 of limiting VF or checking for an alias at runtime.
277 Prefer to check at runtime if we can, to avoid limiting
278 the VF unnecessarily when the bases are in fact independent.
280 Note that the alias checks will be removed if the VF ends up
281 being small enough. */
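/* Runtime alias checks are not used for gather/scatter accesses, so for
   those we tell the caller to fall back to limiting the VF by the
   dependence distance instead.  */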
282 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
283 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
284 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
285 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
286 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
289 return true;
293 /* Function vect_analyze_data_ref_dependence.
295 FIXME: I needed to change the sense of the returned flag.
297 Return FALSE if there (might) exist a dependence between a memory-reference
298 DRA and a memory-reference DRB. When versioning for alias may check a
299 dependence at run-time, return TRUE. Adjust *MAX_VF according to
300 the data dependence. */
302 static opt_result
303 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
304 loop_vec_info loop_vinfo,
305 unsigned int *max_vf)
307 unsigned int i;
308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 struct data_reference *dra = DDR_A (ddr);
310 struct data_reference *drb = DDR_B (ddr);
311 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
312 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
313 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
314 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
315 lambda_vector dist_v;
316 unsigned int loop_depth;
318 /* In loop analysis all data references should be vectorizable. */
319 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
320 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
321 gcc_unreachable ();
323 /* Independent data accesses. */
324 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
325 return opt_result::success ();
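/* A reference never conflicts with itself, and two reads never create a
   dependence that matters for vectorization.  */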
327 if (dra == drb
328 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
329 return opt_result::success ();
331 /* We do not have to consider dependences between accesses that belong
332 to the same group, unless the stride could be smaller than the
333 group size. */
334 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
335 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
336 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
337 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
338 return opt_result::success ();
340 /* Even if we have an anti-dependence then, as the vectorized loop covers at
341 least two scalar iterations, there is always also a true dependence.
342 As the vectorizer does not re-order loads and stores we can ignore
343 the anti-dependence if TBAA can disambiguate both DRs similar to the
344 case with known negative distance anti-dependences (positive
345 distance anti-dependences would violate TBAA constraints). */
346 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
347 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
348 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
349 get_alias_set (DR_REF (drb))))
350 return opt_result::success ();
352 /* Unknown data dependence. */
353 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
355 /* If user asserted safelen consecutive iterations can be
356 executed concurrently, assume independence. */
357 if (loop->safelen >= 2)
359 if ((unsigned int) loop->safelen < *max_vf)
360 *max_vf = loop->safelen;
361 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
362 return opt_result::success ();
365 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
366 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
367 return opt_result::failure_at
368 (stmtinfo_a->stmt,
369 "versioning for alias not supported for: "
370 "can't determine dependence between %T and %T\n",
371 DR_REF (dra), DR_REF (drb));
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
375 "versioning for alias required: "
376 "can't determine dependence between %T and %T\n",
377 DR_REF (dra), DR_REF (drb));
379 /* Add to list of ddrs that need to be tested at run-time. */
380 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 /* Known data dependence. */
384 if (DDR_NUM_DIST_VECTS (ddr) == 0)
386 /* If user asserted safelen consecutive iterations can be
387 executed concurrently, assume independence. */
388 if (loop->safelen >= 2)
390 if ((unsigned int) loop->safelen < *max_vf)
391 *max_vf = loop->safelen;
392 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
393 return opt_result::success ();
396 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
397 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
398 return opt_result::failure_at
399 (stmtinfo_a->stmt,
400 "versioning for alias not supported for: "
401 "bad dist vector for %T and %T\n",
402 DR_REF (dra), DR_REF (drb));
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
406 "versioning for alias required: "
407 "bad dist vector for %T and %T\n",
408 DR_REF (dra), DR_REF (drb));
409 /* Add to list of ddrs that need to be tested at run-time. */
410 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
413 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
415 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
416 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
417 loop_depth, max_vf))
418 return opt_result::success ();
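/* Otherwise examine the dependence distance at this loop depth in every
   distance vector and decide whether it forbids vectorization, merely
   limits the vectorization factor, or can be ignored.  */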
420 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
422 int dist = dist_v[loop_depth];
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "dependence distance = %d.\n", dist);
428 if (dist == 0)
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_NOTE, vect_location,
432 "dependence distance == 0 between %T and %T\n",
433 DR_REF (dra), DR_REF (drb));
435 /* When we perform grouped accesses and perform implicit CSE
436 by detecting equal accesses and doing disambiguation with
437 runtime alias tests like for
438 .. = a[i];
439 .. = a[i+1];
440 a[i] = ..;
441 a[i+1] = ..;
442 *p = ..;
443 .. = a[i];
444 .. = a[i+1];
445 where we will end up loading { a[i], a[i+1] } once, make
446 sure that inserting group loads before the first load and
447 stores after the last store will do the right thing.
448 Similar for groups like
449 a[i] = ...;
450 ... = a[i];
451 a[i+1] = ...;
452 where loads from the group interleave with the store. */
453 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
454 return opt_result::failure_at (stmtinfo_a->stmt,
455 "READ_WRITE dependence"
456 " in interleaving.\n");
458 if (loop->safelen < 2)
460 tree indicator = dr_zero_step_indicator (dra);
461 if (!indicator || integer_zerop (indicator))
462 return opt_result::failure_at (stmtinfo_a->stmt,
463 "access also has a zero step\n");
464 else if (TREE_CODE (indicator) != INTEGER_CST)
465 vect_check_nonzero_value (loop_vinfo, indicator);
467 continue;
470 if (dist > 0 && DDR_REVERSED_P (ddr))
472 /* If DDR_REVERSED_P the order of the data-refs in DDR was
473 reversed (to make distance vector positive), and the actual
474 distance is negative. */
475 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "dependence distance negative.\n");
478 /* Record a negative dependence distance to later limit the
479 amount of stmt copying / unrolling we can perform.
480 Only need to handle read-after-write dependence. */
481 if (DR_IS_READ (drb)
482 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
483 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
484 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
485 continue;
488 unsigned int abs_dist = abs (dist);
489 if (abs_dist >= 2 && abs_dist < *max_vf)
491 /* The dependence distance requires reduction of the maximal
492 vectorization factor. */
493 *max_vf = abs (dist);
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "adjusting maximal vectorization factor to %i\n",
497 *max_vf);
500 if (abs_dist >= *max_vf)
502 /* Dependence distance does not create dependence, as far as
503 vectorization is concerned, in this case. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "dependence distance >= VF.\n");
507 continue;
510 return opt_result::failure_at (stmtinfo_a->stmt,
511 "not vectorized, possible dependence "
512 "between data-refs %T and %T\n",
513 DR_REF (dra), DR_REF (drb));
516 return opt_result::success ();
519 /* Function vect_analyze_data_ref_dependences.
521 Examine all the data references in the loop, and make sure there do not
522 exist any data dependences between them. Set *MAX_VF according to
523 the maximum vectorization factor the data dependences allow. */
525 opt_result
526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
527 unsigned int *max_vf)
529 unsigned int i;
530 struct data_dependence_relation *ddr;
532 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
534 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
536 LOOP_VINFO_DDRS (loop_vinfo)
537 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
538 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
539 /* We need read-read dependences to compute
540 STMT_VINFO_SAME_ALIGN_REFS. */
541 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
542 &LOOP_VINFO_DDRS (loop_vinfo),
543 LOOP_VINFO_LOOP_NEST (loop_vinfo),
544 true);
545 gcc_assert (res);
548 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
550 /* For epilogues we either have no aliases or alias versioning
551 was applied to the original loop. Therefore we may just use the
552 maximum vectorization factor of the original loop. */
553 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
554 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
555 else
556 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
558 opt_result res
559 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
560 if (!res)
561 return res;
564 return opt_result::success ();
568 /* Function vect_slp_analyze_data_ref_dependence.
570 Return TRUE if there (might) exist a dependence between a memory-reference
571 DRA and a memory-reference DRB for VINFO. When versioning for alias
572 may check a dependence at run-time, return FALSE. */
575 static bool
576 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
577 struct data_dependence_relation *ddr)
579 struct data_reference *dra = DDR_A (ddr);
580 struct data_reference *drb = DDR_B (ddr);
581 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
582 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
584 /* We need to check dependences of statements marked as unvectorizable
585 as well, they still can prohibit vectorization. */
587 /* Independent data accesses. */
588 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
589 return false;
591 if (dra == drb)
592 return false;
594 /* Read-read is OK. */
595 if (DR_IS_READ (dra) && DR_IS_READ (drb))
596 return false;
598 /* If dra and drb are part of the same interleaving chain consider
599 them independent. */
600 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
601 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
602 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
603 return false;
605 /* Unknown data dependence. */
606 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 "can't determine dependence between %T and %T\n",
611 DR_REF (dra), DR_REF (drb));
613 else if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE, vect_location,
615 "determined dependence between %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
618 return true;
622 /* Analyze dependences involved in the transform of SLP NODE. STORES
623 contain the vector of scalar stores of this instance if we are
624 disambiguating the loads. */
626 static bool
627 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
628 vec<stmt_vec_info> stores,
629 stmt_vec_info last_store_info)
631 /* This walks over all stmts involved in the SLP load/store done
632 in NODE verifying we can sink them up to the last stmt in the
633 group. */
634 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
635 vec_info *vinfo = last_access_info->vinfo;
636 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
638 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
639 if (access_info == last_access_info)
640 continue;
641 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
642 ao_ref ref;
643 bool ref_initialized_p = false;
644 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
645 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
647 gimple *stmt = gsi_stmt (gsi);
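/* Statements that do not touch memory cannot interfere; when sinking a
   load we additionally only care about statements that may write to
   memory.  */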
648 if (! gimple_vuse (stmt)
649 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
650 continue;
652 /* If we couldn't record a (single) data reference for this
653 stmt we have to resort to the alias oracle. */
654 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
655 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
656 if (!dr_b)
658 /* We are moving a store or sinking a load - this means
659 we cannot use TBAA for disambiguation. */
660 if (!ref_initialized_p)
661 ao_ref_init (&ref, DR_REF (dr_a));
662 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
663 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
664 return false;
665 continue;
668 bool dependent = false;
669 /* If we run into a store of this same instance (we've just
670 marked those) then delay dependence checking until we run
671 into the last store because this is where it will have
672 been sunk to (and we verify if we can do that as well). */
673 if (gimple_visited_p (stmt))
675 if (stmt_info != last_store_info)
676 continue;
677 unsigned i;
678 stmt_vec_info store_info;
679 FOR_EACH_VEC_ELT (stores, i, store_info)
681 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
682 ddr_p ddr = initialize_data_dependence_relation
683 (dr_a, store_dr, vNULL);
684 dependent
685 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
687 if (dependent)
688 break;
691 else
693 ddr_p ddr = initialize_data_dependence_relation (dr_a,
694 dr_b, vNULL);
695 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
696 free_dependence_relation (ddr);
698 if (dependent)
699 return false;
702 return true;
706 /* Function vect_slp_analyze_instance_dependence.
708 Examine all the data references in the SLP instance INSTANCE and make sure
709 there do not exist any data dependences that would prevent the loads and
710 stores from being sunk to their vectorized positions. Return TRUE if that is the case. */
712 bool
713 vect_slp_analyze_instance_dependence (slp_instance instance)
715 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
717 /* The stores of this instance are at the root of the SLP tree. */
718 slp_tree store = SLP_INSTANCE_TREE (instance);
719 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
720 store = NULL;
722 /* Verify we can sink stores to the vectorized stmt insert location. */
723 stmt_vec_info last_store_info = NULL;
724 if (store)
726 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
727 return false;
729 /* Mark stores in this instance and remember the last one. */
730 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
731 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
732 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
735 bool res = true;
737 /* Verify we can sink loads to the vectorized stmt insert location,
738 special-casing stores of this instance. */
739 slp_tree load;
740 unsigned int i;
741 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
742 if (! vect_slp_analyze_node_dependences (instance, load,
743 store
744 ? SLP_TREE_SCALAR_STMTS (store)
745 : vNULL, last_store_info))
747 res = false;
748 break;
751 /* Unset the visited flag. */
752 if (store)
753 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
754 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
756 return res;
759 /* Record the base alignment guarantee given by DRB, which occurs
760 in STMT_INFO. */
762 static void
763 vect_record_base_alignment (stmt_vec_info stmt_info,
764 innermost_loop_behavior *drb)
766 vec_info *vinfo = stmt_info->vinfo;
767 bool existed;
768 innermost_loop_behavior *&entry
769 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
770 if (!existed || entry->base_alignment < drb->base_alignment)
772 entry = drb;
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "recording new base alignment for %T\n"
776 " alignment: %d\n"
777 " misalignment: %d\n"
778 " based on: %G",
779 drb->base_address,
780 drb->base_alignment,
781 drb->base_misalignment,
782 stmt_info->stmt);
786 /* If the region we're going to vectorize is reached, all unconditional
787 data references occur at least once. We can therefore pool the base
788 alignment guarantees from each unconditional reference. Do this by
789 going through all the data references in VINFO and checking whether
790 the containing statement makes the reference unconditionally. If so,
791 record the alignment of the base address in VINFO so that it can be
792 used for all other references with the same base. */
794 void
795 vect_record_base_alignments (vec_info *vinfo)
797 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
798 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
799 data_reference *dr;
800 unsigned int i;
801 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
803 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
804 stmt_vec_info stmt_info = dr_info->stmt;
805 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
806 && STMT_VINFO_VECTORIZABLE (stmt_info)
807 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
809 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
811 /* If DR is nested in the loop that is being vectorized, we can also
812 record the alignment of the base wrt the outer loop. */
813 if (loop && nested_in_vect_loop_p (loop, stmt_info))
814 vect_record_base_alignment
815 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
820 /* Return the target alignment for the vectorized form of DR_INFO. */
822 static unsigned int
823 vect_calculate_target_alignment (dr_vec_info *dr_info)
825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
826 return targetm.vectorize.preferred_vector_alignment (vectype);
829 /* Function vect_compute_data_ref_alignment
831 Compute the misalignment of the data reference DR_INFO.
833 Output:
834 1. DR_MISALIGNMENT (DR_INFO) is defined.
836 FOR NOW: No analysis is actually performed. Misalignment is calculated
837 only for trivial cases. TODO. */
839 static void
840 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
842 stmt_vec_info stmt_info = dr_info->stmt;
843 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
844 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
845 struct loop *loop = NULL;
846 tree ref = DR_REF (dr_info->dr);
847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "vect_compute_data_ref_alignment:\n");
853 if (loop_vinfo)
854 loop = LOOP_VINFO_LOOP (loop_vinfo);
856 /* Initialize misalignment to unknown. */
857 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
859 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
860 return;
862 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
863 bool step_preserves_misalignment_p;
865 unsigned HOST_WIDE_INT vector_alignment
866 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
867 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
869 /* No step for BB vectorization. */
870 if (!loop)
872 gcc_assert (integer_zerop (drb->step));
873 step_preserves_misalignment_p = true;
876 /* In case the dataref is in an inner-loop of the loop that is being
877 vectorized (LOOP), we use the base and misalignment information
878 relative to the outer-loop (LOOP). This is ok only if the misalignment
879 stays the same throughout the execution of the inner-loop, which is why
880 we have to check that the stride of the dataref in the inner-loop evenly
881 divides by the vector alignment. */
882 else if (nested_in_vect_loop_p (loop, stmt_info))
884 step_preserves_misalignment_p
885 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
887 if (dump_enabled_p ())
889 if (step_preserves_misalignment_p)
890 dump_printf_loc (MSG_NOTE, vect_location,
891 "inner step divides the vector alignment.\n");
892 else
893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
894 "inner step doesn't divide the vector"
895 " alignment.\n");
899 /* Similarly we can only use base and misalignment information relative to
900 an innermost loop if the misalignment stays the same throughout the
901 execution of the loop. As above, this is the case if the stride of
902 the dataref evenly divides by the alignment. */
903 else
905 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
906 step_preserves_misalignment_p
907 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
909 if (!step_preserves_misalignment_p && dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "step doesn't divide the vector alignment.\n");
914 unsigned int base_alignment = drb->base_alignment;
915 unsigned int base_misalignment = drb->base_misalignment;
917 /* Calculate the maximum of the pooled base address alignment and the
918 alignment that we can compute for DR itself. */
919 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
920 if (entry && base_alignment < (*entry)->base_alignment)
922 base_alignment = (*entry)->base_alignment;
923 base_misalignment = (*entry)->base_misalignment;
926 if (drb->offset_alignment < vector_alignment
927 || !step_preserves_misalignment_p
928 /* We need to know whether the step wrt the vectorized loop is
929 negative when computing the starting misalignment below. */
930 || TREE_CODE (drb->step) != INTEGER_CST)
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
934 "Unknown alignment for access: %T\n", ref);
935 return;
938 if (base_alignment < vector_alignment)
940 unsigned int max_alignment;
941 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
942 if (max_alignment < vector_alignment
943 || !vect_can_force_dr_alignment_p (base,
944 vector_alignment * BITS_PER_UNIT))
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location,
948 "can't force alignment of ref: %T\n", ref);
949 return;
952 /* Force the alignment of the decl.
953 NOTE: This is the only change to the code we make during
954 the analysis phase, before deciding to vectorize the loop. */
955 if (dump_enabled_p ())
956 dump_printf_loc (MSG_NOTE, vect_location,
957 "force alignment of %T\n", ref);
959 dr_info->base_decl = base;
960 dr_info->base_misaligned = true;
961 base_misalignment = 0;
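/* The misalignment of the access is the base misalignment plus the
   constant offset of the first accessed element, reduced modulo the
   target alignment below.  */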
963 poly_int64 misalignment
964 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
966 /* If this is a backward running DR then first access in the larger
967 vectype actually is N-1 elements before the address in the DR.
968 Adjust misalign accordingly. */
969 if (tree_int_cst_sgn (drb->step) < 0)
970 /* PLUS because STEP is negative. */
971 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
972 * TREE_INT_CST_LOW (drb->step));
974 unsigned int const_misalignment;
975 if (!known_misalignment (misalignment, vector_alignment,
976 &const_misalignment))
978 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "Non-constant misalignment for access: %T\n", ref);
981 return;
984 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
988 "misalign = %d bytes of ref %T\n",
989 DR_MISALIGNMENT (dr_info), ref);
991 return;
994 /* Function vect_update_misalignment_for_peel.
995 Sets DR_INFO's misalignment
996 - to 0 if it has the same alignment as DR_PEEL_INFO,
997 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
998 - to -1 (unknown) otherwise.
1000 DR_INFO - the data reference whose misalignment is to be adjusted.
1001 DR_PEEL_INFO - the data reference whose misalignment is being made
1002 zero in the vector loop by the peel.
1003 NPEEL - the number of iterations in the peel loop if the misalignment
1004 of DR_PEEL_INFO is known at compile time. */
1006 static void
1007 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1008 dr_vec_info *dr_peel_info, int npeel)
1010 unsigned int i;
1011 vec<dr_p> same_aligned_drs;
1012 struct data_reference *current_dr;
1013 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1015 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1016 it is aligned in the vector loop. */
1017 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1018 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1020 if (current_dr != dr_info->dr)
1021 continue;
1022 gcc_assert (!known_alignment_for_access_p (dr_info)
1023 || !known_alignment_for_access_p (dr_peel_info)
1024 || (DR_MISALIGNMENT (dr_info)
1025 == DR_MISALIGNMENT (dr_peel_info)));
1026 SET_DR_MISALIGNMENT (dr_info, 0);
1027 return;
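/* Otherwise, if both misalignments are known at compile time, advance
   DR_INFO's misalignment by NPEEL scalar steps and reduce it modulo the
   target alignment.  E.g. with a 4-byte step, NPEEL == 3 and a 16-byte
   target alignment, a misalignment of 8 becomes (8 + 12) & 15 == 4.  */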
1030 if (known_alignment_for_access_p (dr_info)
1031 && known_alignment_for_access_p (dr_peel_info))
1033 int misal = DR_MISALIGNMENT (dr_info);
1034 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1035 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1036 SET_DR_MISALIGNMENT (dr_info, misal);
1037 return;
1040 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1042 "to unknown (-1).\n");
1043 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1047 /* Function verify_data_ref_alignment
1049 Return TRUE if DR_INFO can be handled with respect to alignment. */
1051 static opt_result
1052 verify_data_ref_alignment (dr_vec_info *dr_info)
1054 enum dr_alignment_support supportable_dr_alignment
1055 = vect_supportable_dr_alignment (dr_info, false);
1056 if (!supportable_dr_alignment)
1057 return opt_result::failure_at
1058 (dr_info->stmt->stmt,
1059 DR_IS_READ (dr_info->dr)
1060 ? "not vectorized: unsupported unaligned load: %T\n"
1061 : "not vectorized: unsupported unaligned store: %T\n",
1062 DR_REF (dr_info->dr));
1064 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1065 dump_printf_loc (MSG_NOTE, vect_location,
1066 "Vectorizing an unaligned access.\n");
1068 return opt_result::success ();
1071 /* Function vect_verify_datarefs_alignment
1073 Return TRUE if all data references in the loop can be
1074 handled with respect to alignment. */
1076 opt_result
1077 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1079 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1080 struct data_reference *dr;
1081 unsigned int i;
1083 FOR_EACH_VEC_ELT (datarefs, i, dr)
1085 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1086 stmt_vec_info stmt_info = dr_info->stmt;
1088 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1089 continue;
1091 /* For interleaving, only the alignment of the first access matters. */
1092 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1093 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1094 continue;
1096 /* Strided accesses perform only component accesses, alignment is
1097 irrelevant for them. */
1098 if (STMT_VINFO_STRIDED_P (stmt_info)
1099 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1100 continue;
1102 opt_result res = verify_data_ref_alignment (dr_info);
1103 if (!res)
1104 return res;
1107 return opt_result::success ();
1110 /* Given a memory reference EXP return whether its alignment is less
1111 than its size. */
1113 static bool
1114 not_size_aligned (tree exp)
1116 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1117 return true;
1119 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1120 > get_object_alignment (exp));
1123 /* Function vector_alignment_reachable_p
1125 Return true if vector alignment for DR_INFO is reachable by peeling
1126 a few loop iterations. Return false otherwise. */
1128 static bool
1129 vector_alignment_reachable_p (dr_vec_info *dr_info)
1131 stmt_vec_info stmt_info = dr_info->stmt;
1132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1134 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1136 /* For interleaved access we peel only if number of iterations in
1137 the prolog loop ({VF - misalignment}), is a multiple of the
1138 number of the interleaved accesses. */
1139 int elem_size, mis_in_elements;
1141 /* FORNOW: handle only known alignment. */
1142 if (!known_alignment_for_access_p (dr_info))
1143 return false;
1145 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1146 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1147 elem_size = vector_element_size (vector_size, nelements);
1148 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1150 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1151 return false;
1154 /* If misalignment is known at the compile time then allow peeling
1155 only if natural alignment is reachable through peeling. */
1156 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1158 HOST_WIDE_INT elmsize =
1159 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1160 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_NOTE, vect_location,
1163 "data size = %wd. misalignment = %d.\n", elmsize,
1164 DR_MISALIGNMENT (dr_info));
1166 if (DR_MISALIGNMENT (dr_info) % elmsize)
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "data size does not divide the misalignment.\n");
1171 return false;
1175 if (!known_alignment_for_access_p (dr_info))
1177 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1178 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 "Unknown misalignment, %snaturally aligned\n",
1182 is_packed ? "not " : "");
1183 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1186 return true;
1190 /* Calculate the cost of the memory access represented by DR_INFO. */
1192 static void
1193 vect_get_data_access_cost (dr_vec_info *dr_info,
1194 unsigned int *inside_cost,
1195 unsigned int *outside_cost,
1196 stmt_vector_for_cost *body_cost_vec,
1197 stmt_vector_for_cost *prologue_cost_vec)
1199 stmt_vec_info stmt_info = dr_info->stmt;
1200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1201 int ncopies;
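/* Pure SLP statements are costed once here; for loop vectorization the
   number of vector copies follows from the vectorization factor and the
   vector type.  */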
1203 if (PURE_SLP_STMT (stmt_info))
1204 ncopies = 1;
1205 else
1206 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1208 if (DR_IS_READ (dr_info->dr))
1209 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1210 prologue_cost_vec, body_cost_vec, false);
1211 else
1212 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_NOTE, vect_location,
1216 "vect_get_data_access_cost: inside_cost = %d, "
1217 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1221 typedef struct _vect_peel_info
1223 dr_vec_info *dr_info;
1224 int npeel;
1225 unsigned int count;
1226 } *vect_peel_info;
1228 typedef struct _vect_peel_extended_info
1230 struct _vect_peel_info peel_info;
1231 unsigned int inside_cost;
1232 unsigned int outside_cost;
1233 } *vect_peel_extended_info;
1236 /* Peeling hashtable helpers. */
1238 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1240 static inline hashval_t hash (const _vect_peel_info *);
1241 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1244 inline hashval_t
1245 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1247 return (hashval_t) peel_info->npeel;
1250 inline bool
1251 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1253 return (a->npeel == b->npeel);
1257 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1259 static void
1260 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1261 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1262 int npeel)
1264 struct _vect_peel_info elem, *slot;
1265 _vect_peel_info **new_slot;
1266 bool supportable_dr_alignment
1267 = vect_supportable_dr_alignment (dr_info, true);
1269 elem.npeel = npeel;
1270 slot = peeling_htab->find (&elem);
1271 if (slot)
1272 slot->count++;
1273 else
1275 slot = XNEW (struct _vect_peel_info);
1276 slot->npeel = npeel;
1277 slot->dr_info = dr_info;
1278 slot->count = 1;
1279 new_slot = peeling_htab->find_slot (slot, INSERT);
1280 *new_slot = slot;
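/* Under the unlimited cost model, strongly bias the choice towards
   peeling amounts that make an otherwise unsupportable access aligned.  */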
1283 if (!supportable_dr_alignment
1284 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1285 slot->count += VECT_MAX_COST;
1289 /* Traverse peeling hash table to find peeling option that aligns maximum
1290 number of data accesses. */
1292 static int
1293 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1294 _vect_peel_extended_info *max)
1296 vect_peel_info elem = *slot;
1298 if (elem->count > max->peel_info.count
1299 || (elem->count == max->peel_info.count
1300 && max->peel_info.npeel > elem->npeel))
1302 max->peel_info.npeel = elem->npeel;
1303 max->peel_info.count = elem->count;
1304 max->peel_info.dr_info = elem->dr_info;
1307 return 1;
1310 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1311 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1312 we assume DR0_INFO's misalignment will be zero after peeling. */
1314 static void
1315 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1316 dr_vec_info *dr0_info,
1317 unsigned int *inside_cost,
1318 unsigned int *outside_cost,
1319 stmt_vector_for_cost *body_cost_vec,
1320 stmt_vector_for_cost *prologue_cost_vec,
1321 unsigned int npeel,
1322 bool unknown_misalignment)
1324 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1325 unsigned i;
1326 data_reference *dr;
1328 FOR_EACH_VEC_ELT (datarefs, i, dr)
1330 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1331 stmt_vec_info stmt_info = dr_info->stmt;
1332 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1333 continue;
1335 /* For interleaving, only the alignment of the first access
1336 matters. */
1337 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1338 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1339 continue;
1341 /* Strided accesses perform only component accesses, alignment is
1342 irrelevant for them. */
1343 if (STMT_VINFO_STRIDED_P (stmt_info)
1344 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1345 continue;
1347 int save_misalignment;
1348 save_misalignment = DR_MISALIGNMENT (dr_info);
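/* Temporarily set the misalignment to what it would be after peeling
   NPEEL iterations for DR0_INFO (leaving it untouched when NPEEL is
   zero), cost the access, and then restore the saved value.  */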
1349 if (npeel == 0)
1350 ;
1351 else if (unknown_misalignment && dr_info == dr0_info)
1352 SET_DR_MISALIGNMENT (dr_info, 0);
1353 else
1354 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1355 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1356 body_cost_vec, prologue_cost_vec);
1357 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1361 /* Traverse peeling hash table and calculate cost for each peeling option.
1362 Find the one with the lowest cost. */
1364 static int
1365 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1366 _vect_peel_extended_info *min)
1368 vect_peel_info elem = *slot;
1369 int dummy;
1370 unsigned int inside_cost = 0, outside_cost = 0;
1371 stmt_vec_info stmt_info = elem->dr_info->stmt;
1372 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1373 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1374 epilogue_cost_vec;
1376 prologue_cost_vec.create (2);
1377 body_cost_vec.create (2);
1378 epilogue_cost_vec.create (2);
1380 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1381 &outside_cost, &body_cost_vec,
1382 &prologue_cost_vec, elem->npeel, false);
1384 body_cost_vec.release ();
1386 outside_cost += vect_get_known_peeling_cost
1387 (loop_vinfo, elem->npeel, &dummy,
1388 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1389 &prologue_cost_vec, &epilogue_cost_vec);
1391 /* Prologue and epilogue costs are added to the target model later.
1392 These costs depend only on the scalar iteration cost, the
1393 number of peeling iterations finally chosen, and the number of
1394 misaligned statements. So discard the information found here. */
1395 prologue_cost_vec.release ();
1396 epilogue_cost_vec.release ();
1398 if (inside_cost < min->inside_cost
1399 || (inside_cost == min->inside_cost
1400 && outside_cost < min->outside_cost))
1402 min->inside_cost = inside_cost;
1403 min->outside_cost = outside_cost;
1404 min->peel_info.dr_info = elem->dr_info;
1405 min->peel_info.npeel = elem->npeel;
1406 min->peel_info.count = elem->count;
1409 return 1;
1413 /* Choose best peeling option by traversing peeling hash table and either
1414 choosing an option with the lowest cost (if cost model is enabled) or the
1415 option that aligns as many accesses as possible. */
1417 static struct _vect_peel_extended_info
1418 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1419 loop_vec_info loop_vinfo)
1421 struct _vect_peel_extended_info res;
1423 res.peel_info.dr_info = NULL;
1425 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1427 res.inside_cost = INT_MAX;
1428 res.outside_cost = INT_MAX;
1429 peeling_htab->traverse <_vect_peel_extended_info *,
1430 vect_peeling_hash_get_lowest_cost> (&res);
1432 else
1434 res.peel_info.count = 0;
1435 peeling_htab->traverse <_vect_peel_extended_info *,
1436 vect_peeling_hash_get_most_frequent> (&res);
1437 res.inside_cost = 0;
1438 res.outside_cost = 0;
1441 return res;
1444 /* Return true if the new peeling NPEEL is supported. */
1446 static bool
1447 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1448 unsigned npeel)
1450 unsigned i;
1451 struct data_reference *dr = NULL;
1452 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1453 enum dr_alignment_support supportable_dr_alignment;
1455 /* Ensure that all data refs can be vectorized after the peel. */
1456 FOR_EACH_VEC_ELT (datarefs, i, dr)
1458 int save_misalignment;
1460 if (dr == dr0_info->dr)
1461 continue;
1463 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1464 stmt_vec_info stmt_info = dr_info->stmt;
1465 /* For interleaving, only the alignment of the first access
1466 matters. */
1467 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1468 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1469 continue;
1471 /* Strided accesses perform only component accesses, alignment is
1472 irrelevant for them. */
1473 if (STMT_VINFO_STRIDED_P (stmt_info)
1474 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1475 continue;
1477 save_misalignment = DR_MISALIGNMENT (dr_info);
1478 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1479 supportable_dr_alignment
1480 = vect_supportable_dr_alignment (dr_info, false);
1481 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1483 if (!supportable_dr_alignment)
1484 return false;
1487 return true;
1490 /* Function vect_enhance_data_refs_alignment
1492 This pass will use loop versioning and loop peeling in order to enhance
1493 the alignment of data references in the loop.
1495 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1496 original loop is to be vectorized. Any other loops that are created by
1497 the transformations performed in this pass - are not supposed to be
1498 vectorized. This restriction will be relaxed.
1500 This pass will require a cost model to guide it whether to apply peeling
1501 or versioning or a combination of the two. For example, the scheme that
1502 intel uses when given a loop with several memory accesses, is as follows:
1503 choose one memory access ('p') whose alignment you want to force by doing
1504 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1505 other accesses are not necessarily aligned, or (2) use loop versioning to
1506 generate one loop in which all accesses are aligned, and another loop in
1507 which only 'p' is necessarily aligned.
1509 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1510 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1511 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1513 Devising a cost model is the most critical aspect of this work. It will
1514 guide us on which access to peel for, whether to use loop versioning, how
1515 many versions to create, etc. The cost model will probably consist of
1516 generic considerations as well as target specific considerations (on
1517 powerpc for example, misaligned stores are more painful than misaligned
1518 loads).
1520 Here are the general steps involved in alignment enhancements:
1522 -- original loop, before alignment analysis:
1523 for (i=0; i<N; i++){
1524 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1525 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1528 -- After vect_compute_data_refs_alignment:
1529 for (i=0; i<N; i++){
1530 x = q[i]; # DR_MISALIGNMENT(q) = 3
1531 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1534 -- Possibility 1: we do loop versioning:
1535 if (p is aligned) {
1536 for (i=0; i<N; i++){ # loop 1A
1537 x = q[i]; # DR_MISALIGNMENT(q) = 3
1538 p[i] = y; # DR_MISALIGNMENT(p) = 0
1541 else {
1542 for (i=0; i<N; i++){ # loop 1B
1543 x = q[i]; # DR_MISALIGNMENT(q) = 3
1544 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1548 -- Possibility 2: we do loop peeling:
1549 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1550 x = q[i];
1551 p[i] = y;
1553 for (i = 3; i < N; i++){ # loop 2A
1554 x = q[i]; # DR_MISALIGNMENT(q) = 0
1555 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1558 -- Possibility 3: combination of loop peeling and versioning:
1559 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1560 x = q[i];
1561 p[i] = y;
1563 if (p is aligned) {
1564 for (i = 3; i<N; i++){ # loop 3A
1565 x = q[i]; # DR_MISALIGNMENT(q) = 0
1566 p[i] = y; # DR_MISALIGNMENT(p) = 0
1569 else {
1570 for (i = 3; i<N; i++){ # loop 3B
1571 x = q[i]; # DR_MISALIGNMENT(q) = 0
1572 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1576 These loops are later passed to loop_transform to be vectorized. The
1577 vectorizer will use the alignment information to guide the transformation
1578 (whether to generate regular loads/stores, or with special handling for
1579 misalignment). */
1581 opt_result
1582 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1584 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1585 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1586 enum dr_alignment_support supportable_dr_alignment;
1587 dr_vec_info *first_store = NULL;
1588 dr_vec_info *dr0_info = NULL;
1589 struct data_reference *dr;
1590 unsigned int i, j;
1591 bool do_peeling = false;
1592 bool do_versioning = false;
1593 unsigned int npeel = 0;
1594 bool one_misalignment_known = false;
1595 bool one_misalignment_unknown = false;
1596 bool one_dr_unsupportable = false;
1597 dr_vec_info *unsupportable_dr_info = NULL;
1598 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1599 unsigned possible_npeel_number = 1;
1600 tree vectype;
1601 unsigned int mis, same_align_drs_max = 0;
1602 hash_table<peel_info_hasher> peeling_htab (1);
1604 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1606 /* Reset data so we can safely be called multiple times. */
1607 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1608 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1610 /* While cost model enhancements are expected in the future, the high level
1611 view of the code at this time is as follows:
1613 A) If there is a misaligned access then see if peeling to align
1614 this access can make all data references satisfy
1615 vect_supportable_dr_alignment. If so, update data structures
1616 as needed and return true.
1618 B) If peeling wasn't possible and there is a data reference with an
1619 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1620 then see if loop versioning checks can be used to make all data
1621 references satisfy vect_supportable_dr_alignment. If so, update
1622 data structures as needed and return true.
1624 C) If neither peeling nor versioning were successful then return false if
1625 any data reference does not satisfy vect_supportable_dr_alignment.
1627 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1629 Note, Possibility 3 above (which is peeling and versioning together) is not
1630 being done at this time. */
1632 /* (1) Peeling to force alignment. */
1634 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1635 Considerations:
1636 + How many accesses will become aligned due to the peeling
1637 - How many accesses will become unaligned due to the peeling,
1638 and the cost of misaligned accesses.
1639 - The cost of peeling (the extra runtime checks, the increase
1640 in code size). */
1642 FOR_EACH_VEC_ELT (datarefs, i, dr)
1644 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1645 stmt_vec_info stmt_info = dr_info->stmt;
1647 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1648 continue;
1650 /* For interleaving, only the alignment of the first access
1651 matters. */
1652 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1653 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1654 continue;
1656 /* For scatter-gather or invariant accesses there is nothing
1657 to enhance. */
1658 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1659 || integer_zerop (DR_STEP (dr)))
1660 continue;
1662 /* Strided accesses perform only component accesses, alignment is
1663 irrelevant for them. */
1664 if (STMT_VINFO_STRIDED_P (stmt_info)
1665 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1666 continue;
1668 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1669 do_peeling = vector_alignment_reachable_p (dr_info);
1670 if (do_peeling)
1672 if (known_alignment_for_access_p (dr_info))
1674 unsigned int npeel_tmp = 0;
1675 bool negative = tree_int_cst_compare (DR_STEP (dr),
1676 size_zero_node) < 0;
1678 vectype = STMT_VINFO_VECTYPE (stmt_info);
1679 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1680 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1681 mis = (negative
1682 ? DR_MISALIGNMENT (dr_info)
1683 : -DR_MISALIGNMENT (dr_info));
1684 if (DR_MISALIGNMENT (dr_info) != 0)
1685 npeel_tmp = (mis & (target_align - 1)) / dr_size;
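/* E.g. with 4-byte elements, a 16-byte target alignment and a forward
   access misaligned by 8 bytes, (-8 & 15) / 4 == 2 scalar iterations
   must be peeled to reach alignment.  */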
1687 /* For multiple types, it is possible that the bigger type access
1688 will have more than one peeling option. E.g., a loop with two
1689 types: one of size (vector size / 4), and the other one of
1690 size (vector size / 8). The vectorization factor will be 8. If both
1691 accesses are misaligned by 3, the first one needs one scalar
1692 iteration to be aligned, and the second one needs 5. But the
1693 first one will be aligned also by peeling 5 scalar
1694 iterations, and in that case both accesses will be aligned.
1695 Hence, except for the immediate peeling amount, we also want
1696 to try to add full vector size, while we don't exceed
1697 vectorization factor.
1698 We do this automatically for cost model, since we calculate
1699 cost for every peeling option. */
1700 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1702 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1703 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1704 possible_npeel_number
1705 = vect_get_num_vectors (nscalars, vectype);
1707 /* NPEEL_TMP is 0 when there is no misalignment, but also
1708 allow peeling NELEMENTS. */
1709 if (DR_MISALIGNMENT (dr_info) == 0)
1710 possible_npeel_number++;
1713 /* Save info about DR in the hash table. Also include peeling
1714 amounts according to the explanation above. */
1715 for (j = 0; j < possible_npeel_number; j++)
1717 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1718 dr_info, npeel_tmp);
1719 npeel_tmp += target_align / dr_size;
1722 one_misalignment_known = true;
1724 else
1726 /* If we don't know any misalignment values, we prefer
1727 peeling for the data-ref that has the maximum number of data-refs
1728 with the same alignment, unless the target prefers to align
1729 stores over loads. */
1730 unsigned same_align_drs
1731 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1732 if (!dr0_info
1733 || same_align_drs_max < same_align_drs)
1735 same_align_drs_max = same_align_drs;
1736 dr0_info = dr_info;
1738 /* For data-refs with the same number of related
1739 accesses prefer the one where the misalign
1740 computation will be invariant in the outermost loop. */
1741 else if (same_align_drs_max == same_align_drs)
1743 struct loop *ivloop0, *ivloop;
1744 ivloop0 = outermost_invariant_loop_for_expr
1745 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1746 ivloop = outermost_invariant_loop_for_expr
1747 (loop, DR_BASE_ADDRESS (dr));
1748 if ((ivloop && !ivloop0)
1749 || (ivloop && ivloop0
1750 && flow_loop_nested_p (ivloop, ivloop0)))
1751 dr0_info = dr_info;
1754 one_misalignment_unknown = true;
1756 /* Check for data refs with unsupportable alignment that
1757 can be peeled. */
1758 if (!supportable_dr_alignment)
1760 one_dr_unsupportable = true;
1761 unsupportable_dr_info = dr_info;
1764 if (!first_store && DR_IS_WRITE (dr))
1765 first_store = dr_info;
1768 else
1770 if (!aligned_access_p (dr_info))
1772 if (dump_enabled_p ())
1773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1774 "vector alignment may not be reachable\n");
1775 break;
1780 /* Check if we can possibly peel the loop. */
1781 if (!vect_can_advance_ivs_p (loop_vinfo)
1782 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1783 || loop->inner)
1784 do_peeling = false;
1786 struct _vect_peel_extended_info peel_for_known_alignment;
1787 struct _vect_peel_extended_info peel_for_unknown_alignment;
1788 struct _vect_peel_extended_info best_peel;
1790 peel_for_unknown_alignment.inside_cost = INT_MAX;
1791 peel_for_unknown_alignment.outside_cost = INT_MAX;
1792 peel_for_unknown_alignment.peel_info.count = 0;
1794 if (do_peeling
1795 && one_misalignment_unknown)
1797 /* Check whether the target prefers stores over loads, i.e., whether
1798 misaligned stores are more expensive than misaligned loads (taking
1799 drs with same alignment into account). */
1800 unsigned int load_inside_cost = 0;
1801 unsigned int load_outside_cost = 0;
1802 unsigned int store_inside_cost = 0;
1803 unsigned int store_outside_cost = 0;
1804 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1806 stmt_vector_for_cost dummy;
1807 dummy.create (2);
1808 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1809 &load_inside_cost,
1810 &load_outside_cost,
1811 &dummy, &dummy, estimated_npeels, true);
1812 dummy.release ();
1814 if (first_store)
1816 dummy.create (2);
1817 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1818 &store_inside_cost,
1819 &store_outside_cost,
1820 &dummy, &dummy,
1821 estimated_npeels, true);
1822 dummy.release ();
1824 else
1826 store_inside_cost = INT_MAX;
1827 store_outside_cost = INT_MAX;
1830 if (load_inside_cost > store_inside_cost
1831 || (load_inside_cost == store_inside_cost
1832 && load_outside_cost > store_outside_cost))
1834 dr0_info = first_store;
1835 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1836 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1838 else
1840 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1841 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1844 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1845 prologue_cost_vec.create (2);
1846 epilogue_cost_vec.create (2);
1848 int dummy2;
1849 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1850 (loop_vinfo, estimated_npeels, &dummy2,
1851 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1852 &prologue_cost_vec, &epilogue_cost_vec);
1854 prologue_cost_vec.release ();
1855 epilogue_cost_vec.release ();
1857 peel_for_unknown_alignment.peel_info.count = 1
1858 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1861 peel_for_unknown_alignment.peel_info.npeel = 0;
1862 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1864 best_peel = peel_for_unknown_alignment;
1866 peel_for_known_alignment.inside_cost = INT_MAX;
1867 peel_for_known_alignment.outside_cost = INT_MAX;
1868 peel_for_known_alignment.peel_info.count = 0;
1869 peel_for_known_alignment.peel_info.dr_info = NULL;
1871 if (do_peeling && one_misalignment_known)
1873 /* Peeling is possible, but no data access strictly requires alignment
1874 in order to be supported. So we try to choose the best possible
1875 peeling from the hash table. */
1876 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1877 (&peeling_htab, loop_vinfo);
1880 /* Compare costs of peeling for known and unknown alignment. */
1881 if (peel_for_known_alignment.peel_info.dr_info != NULL
1882 && peel_for_unknown_alignment.inside_cost
1883 >= peel_for_known_alignment.inside_cost)
1885 best_peel = peel_for_known_alignment;
1887 /* If the best peeling for known alignment has NPEEL == 0, perform no
1888 peeling at all except if there is an unsupportable dr that we can
1889 align. */
1890 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1891 do_peeling = false;
1894 /* If there is an unsupportable data ref, prefer this over all choices so far
1895 since we'd have to discard a chosen peeling except when it accidentally
1896 aligned the unsupportable data ref. */
1897 if (one_dr_unsupportable)
1898 dr0_info = unsupportable_dr_info;
1899 else if (do_peeling)
1901 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1902 TODO: Use nopeel_outside_cost or get rid of it? */
1903 unsigned nopeel_inside_cost = 0;
1904 unsigned nopeel_outside_cost = 0;
1906 stmt_vector_for_cost dummy;
1907 dummy.create (2);
1908 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1909 &nopeel_outside_cost, &dummy, &dummy,
1910 0, false);
1911 dummy.release ();
1913 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1914 costs will be recorded. */
1915 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1916 prologue_cost_vec.create (2);
1917 epilogue_cost_vec.create (2);
1919 int dummy2;
1920 nopeel_outside_cost += vect_get_known_peeling_cost
1921 (loop_vinfo, 0, &dummy2,
1922 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1923 &prologue_cost_vec, &epilogue_cost_vec);
1925 prologue_cost_vec.release ();
1926 epilogue_cost_vec.release ();
1928 npeel = best_peel.peel_info.npeel;
1929 dr0_info = best_peel.peel_info.dr_info;
1931 /* If not peeling is no more expensive than the best peeling we
1932 have found so far, don't perform any peeling. */
1933 if (nopeel_inside_cost <= best_peel.inside_cost)
1934 do_peeling = false;
1937 if (do_peeling)
1939 stmt_vec_info stmt_info = dr0_info->stmt;
1940 vectype = STMT_VINFO_VECTYPE (stmt_info);
1942 if (known_alignment_for_access_p (dr0_info))
1944 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1945 size_zero_node) < 0;
1946 if (!npeel)
1948 /* Since it's known at compile time, compute the number of
1949 iterations in the peeled loop (the peeling factor) for use in
1950 updating DR_MISALIGNMENT values. The peeling factor is the
1951 vectorization factor minus the misalignment as an element
1952 count. */
1953 mis = (negative
1954 ? DR_MISALIGNMENT (dr0_info)
1955 : -DR_MISALIGNMENT (dr0_info));
1956 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1957 npeel = ((mis & (target_align - 1))
1958 / vect_get_scalar_dr_size (dr0_info));
1961 /* For interleaved data access every iteration accesses all the
1962 members of the group, therefore we divide the number of iterations
1963 by the group size. */
1964 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1965 npeel /= DR_GROUP_SIZE (stmt_info);
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "Try peeling by %d\n", npeel);
1972 /* Ensure that all datarefs can be vectorized after the peel. */
1973 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1974 do_peeling = false;
1976 /* Check if all datarefs are supportable and log. */
1977 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1979 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1980 if (!stat)
1981 do_peeling = false;
1982 else
1983 return stat;
1986 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1987 if (do_peeling)
1989 unsigned max_allowed_peel
1990 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1991 if (max_allowed_peel != (unsigned)-1)
1993 unsigned max_peel = npeel;
1994 if (max_peel == 0)
1996 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1997 max_peel = (target_align
1998 / vect_get_scalar_dr_size (dr0_info) - 1);
2000 if (max_peel > max_allowed_peel)
2002 do_peeling = false;
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_NOTE, vect_location,
2005 "Disable peeling, max peels reached: %d\n", max_peel);
2010 /* Cost model #2 - if peeling may result in a remaining loop not
2011 iterating enough to be vectorized then do not peel. Since this
2012 is a cost heuristic rather than a correctness decision, use the
2013 most likely runtime value for variable vectorization factors. */
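/* For example, with an assumed VF of 4 and NPEEL of 3, a loop known to
   iterate only 6 times is left unpeeled: 6 < 4 + 3, so after peeling
   the remaining loop might not reach even one full vector iteration.  */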
2014 if (do_peeling
2015 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2017 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2018 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2019 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2020 < assumed_vf + max_peel)
2021 do_peeling = false;
2024 if (do_peeling)
2026 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2027 If the misalignment of DR_i is identical to that of dr0 then set
2028 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2029 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2030 by the peeling factor times the element size of DR_i (MOD the
2031 vectorization factor times the size). Otherwise, the
2032 misalignment of DR_i must be set to unknown. */
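/* For instance, assuming a positive step, a 16-byte alignment
   granularity and NPEEL = 2 with 4-byte elements, a data reference
   whose known misalignment was 4 bytes ends up with misalignment
   (4 + 2 * 4) & 15 = 12.  */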
2033 FOR_EACH_VEC_ELT (datarefs, i, dr)
2034 if (dr != dr0_info->dr)
2036 /* Strided accesses perform only component accesses, alignment
2037 is irrelevant for them. */
2038 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2039 stmt_info = dr_info->stmt;
2040 if (STMT_VINFO_STRIDED_P (stmt_info)
2041 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2042 continue;
2044 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2047 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2048 if (npeel)
2049 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2050 else
2051 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2052 = DR_MISALIGNMENT (dr0_info);
2053 SET_DR_MISALIGNMENT (dr0_info, 0);
2054 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "Alignment of access forced using peeling.\n");
2058 dump_printf_loc (MSG_NOTE, vect_location,
2059 "Peeling for alignment will be applied.\n");
2062 /* The inside-loop cost will be accounted for in vectorizable_load
2063 and vectorizable_store correctly with adjusted alignments.
2064 Drop the body_cst_vec on the floor here. */
2065 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2066 gcc_assert (stat);
2067 return stat;
2071 /* (2) Versioning to force alignment. */
2073 /* Try versioning if:
2074 1) optimize loop for speed
2075 2) there is at least one unsupported misaligned data ref with an unknown
2076 misalignment, and
2077 3) all misaligned data refs with a known misalignment are supported, and
2078 4) the number of runtime alignment checks is within reason. */
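/* Roughly speaking, the run-time test emitted for the versioned loop
   ORs together the addresses of all potentially misaligned data refs
   and tests the result against LOOP_VINFO_PTR_MASK:

       if (((addr_1 | addr_2 | ... | addr_n) & mask) == 0)
         ... vectorized loop ...
       else
         ... scalar loop ...

   which is why a single shared mask is required below.  */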
2080 do_versioning =
2081 optimize_loop_nest_for_speed_p (loop)
2082 && (!loop->inner); /* FORNOW */
2084 if (do_versioning)
2086 FOR_EACH_VEC_ELT (datarefs, i, dr)
2088 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2089 stmt_vec_info stmt_info = dr_info->stmt;
2091 /* For interleaving, only the alignment of the first access
2092 matters. */
2093 if (aligned_access_p (dr_info)
2094 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2095 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2096 continue;
2098 if (STMT_VINFO_STRIDED_P (stmt_info))
2100 /* Strided loads perform only component accesses, alignment is
2101 irrelevant for them. */
2102 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2103 continue;
2104 do_versioning = false;
2105 break;
2108 supportable_dr_alignment
2109 = vect_supportable_dr_alignment (dr_info, false);
2111 if (!supportable_dr_alignment)
2113 int mask;
2114 tree vectype;
2116 if (known_alignment_for_access_p (dr_info)
2117 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2118 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2120 do_versioning = false;
2121 break;
2124 vectype = STMT_VINFO_VECTYPE (stmt_info);
2125 gcc_assert (vectype);
2127 /* At present we don't support versioning for alignment
2128 with variable VF, since there's no guarantee that the
2129 VF is a power of two. We could relax this if we added
2130 a way of enforcing a power-of-two size. */
2131 unsigned HOST_WIDE_INT size;
2132 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2134 do_versioning = false;
2135 break;
2138 /* The rightmost bits of an aligned address must be zeros.
2139 Construct the mask needed for this test. For example,
2140 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2141 mask must be 15 = 0xf. */
2142 mask = size - 1;
2144 /* FORNOW: use the same mask to test all potentially unaligned
2145 references in the loop. The vectorizer currently supports
2146 a single vector size, see the reference to
2147 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2148 vectorization factor is computed. */
2149 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2150 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2151 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2152 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2156 /* Versioning requires at least one misaligned data reference. */
2157 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2158 do_versioning = false;
2159 else if (!do_versioning)
2160 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2163 if (do_versioning)
2165 vec<stmt_vec_info> may_misalign_stmts
2166 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2167 stmt_vec_info stmt_info;
2169 /* It can now be assumed that the data references in the statements
2170 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2171 of the loop being vectorized. */
2172 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2174 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2175 SET_DR_MISALIGNMENT (dr_info, 0);
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_NOTE, vect_location,
2178 "Alignment of access forced using versioning.\n");
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE, vect_location,
2183 "Versioning for alignment will be applied.\n");
2185 /* Peeling and versioning can't be done together at this time. */
2186 gcc_assert (! (do_peeling && do_versioning));
2188 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2189 gcc_assert (stat);
2190 return stat;
2193 /* This point is reached if neither peeling nor versioning is being done. */
2194 gcc_assert (! (do_peeling || do_versioning));
2196 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2197 return stat;
2201 /* Function vect_find_same_alignment_drs.
2203 Update group and alignment relations in VINFO according to the chosen
2204 vectorization factor. */
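/* For example, with 4-byte elements and a 16-byte vector alignment,
   a[i] and a[i + 4] are 16 bytes apart and therefore always share the
   same alignment, whereas a[i] and a[i + 1] do not.  */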
2206 static void
2207 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2209 struct data_reference *dra = DDR_A (ddr);
2210 struct data_reference *drb = DDR_B (ddr);
2211 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2212 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2213 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2214 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2216 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2217 return;
2219 if (dra == drb)
2220 return;
2222 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2223 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2224 return;
2226 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2227 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2228 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2229 return;
2231 /* Two references with distance zero have the same alignment. */
2232 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2233 - wi::to_poly_offset (DR_INIT (drb)));
2234 if (maybe_ne (diff, 0))
2236 /* Get the wider of the two alignments. */
2237 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2238 / BITS_PER_UNIT);
2239 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2240 / BITS_PER_UNIT);
2241 unsigned int max_align = MAX (align_a, align_b);
2243 /* Require the gap to be a multiple of the larger vector alignment. */
2244 if (!multiple_p (diff, max_align))
2245 return;
2248 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2249 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "accesses have the same alignment: %T and %T\n",
2253 DR_REF (dra), DR_REF (drb));
2257 /* Function vect_analyze_data_refs_alignment
2259 Analyze the alignment of the data-references in the loop.
2260 Return an opt_result indicating whether the analysis succeeded. */
2262 opt_result
2263 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2265 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2267 /* Mark groups of data references with same alignment using
2268 data dependence information. */
2269 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2270 struct data_dependence_relation *ddr;
2271 unsigned int i;
2273 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2274 vect_find_same_alignment_drs (vinfo, ddr);
2276 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2277 struct data_reference *dr;
2279 vect_record_base_alignments (vinfo);
2280 FOR_EACH_VEC_ELT (datarefs, i, dr)
2282 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2283 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2284 vect_compute_data_ref_alignment (dr_info);
2287 return opt_result::success ();
2291 /* Analyze alignment of DRs of stmts in NODE. */
2293 static bool
2294 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2296 /* We vectorize from the first scalar stmt in the node unless
2297 the node is permuted in which case we start from the first
2298 element in the group. */
2299 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2300 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2301 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2302 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2304 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2305 vect_compute_data_ref_alignment (dr_info);
2306 /* For creating the data-ref pointer we need alignment of the
2307 first element anyway. */
2308 if (dr_info != first_dr_info)
2309 vect_compute_data_ref_alignment (first_dr_info);
2310 if (! verify_data_ref_alignment (dr_info))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "not vectorized: bad data alignment in basic "
2315 "block.\n");
2316 return false;
2319 return true;
2322 /* Function vect_slp_analyze_and_verify_instance_alignment
2324 Analyze the alignment of the data-references in the SLP instance.
2325 Return FALSE if a data reference is found that cannot be vectorized. */
2327 bool
2328 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2330 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2332 slp_tree node;
2333 unsigned i;
2334 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2335 if (! vect_slp_analyze_and_verify_node_alignment (node))
2336 return false;
2338 node = SLP_INSTANCE_TREE (instance);
2339 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2340 && ! vect_slp_analyze_and_verify_node_alignment
2341 (SLP_INSTANCE_TREE (instance)))
2342 return false;
2344 return true;
2348 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2349 accesses of legal size, step, etc. Detect gaps, single element
2350 interleaving, and other special cases. Set grouped access info.
2351 Collect groups of strided stores for further use in SLP analysis.
2352 Worker for vect_analyze_group_access. */
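/* For example, the two stores in

       for (i = 0; i < N; i++)
         {
           p[2 * i]     = x;
           p[2 * i + 1] = y;
         }

   form one interleaving group of size 2 with no gaps: DR_STEP is
   2 * sizeof (*p) and the DR_INITs differ by sizeof (*p).  */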
2354 static bool
2355 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2357 data_reference *dr = dr_info->dr;
2358 tree step = DR_STEP (dr);
2359 tree scalar_type = TREE_TYPE (DR_REF (dr));
2360 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2361 stmt_vec_info stmt_info = dr_info->stmt;
2362 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2363 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2364 HOST_WIDE_INT dr_step = -1;
2365 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2366 bool slp_impossible = false;
2368 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2369 size of the interleaving group (including gaps). */
2370 if (tree_fits_shwi_p (step))
2372 dr_step = tree_to_shwi (step);
2373 /* Check that STEP is a multiple of type size. Otherwise there is
2374 a non-element-sized gap at the end of the group which we
2375 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2376 ??? As we can handle non-constant step fine here we should
2377 simply remove uses of DR_GROUP_GAP between the last and first
2378 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2379 simply not include that gap. */
2380 if ((dr_step % type_size) != 0)
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_NOTE, vect_location,
2384 "Step %T is not a multiple of the element size"
2385 " for %T\n",
2386 step, DR_REF (dr));
2387 return false;
2389 groupsize = absu_hwi (dr_step) / type_size;
2391 else
2392 groupsize = 0;
2394 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2395 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2397 /* Check if this DR is a part of an interleaving group, and is a single
2398 element of the group that is accessed in the loop. */
2400 /* Gaps are supported only for loads. STEP must be a multiple of the type
2401 size. */
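/* E.g. a read such as

       for (i = 0; i < N; i++)
         sum += a[3 * i];

   has DR_STEP == 3 * sizeof (*a), so GROUPSIZE is 3 and the group
   consists of this single load followed by a gap of 2 elements.  */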
2402 if (DR_IS_READ (dr)
2403 && (dr_step % type_size) == 0
2404 && groupsize > 0)
2406 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2407 DR_GROUP_SIZE (stmt_info) = groupsize;
2408 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2409 if (dump_enabled_p ())
2410 dump_printf_loc (MSG_NOTE, vect_location,
2411 "Detected single element interleaving %T"
2412 " step %T\n",
2413 DR_REF (dr), step);
2415 return true;
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2420 "not consecutive access %G", stmt_info->stmt);
2422 if (bb_vinfo)
2424 /* Mark the statement as unvectorizable. */
2425 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2426 return true;
2429 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2430 STMT_VINFO_STRIDED_P (stmt_info) = true;
2431 return true;
2434 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2436 /* First stmt in the interleaving chain. Check the chain. */
2437 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2438 struct data_reference *data_ref = dr;
2439 unsigned int count = 1;
2440 tree prev_init = DR_INIT (data_ref);
2441 stmt_vec_info prev = stmt_info;
2442 HOST_WIDE_INT diff, gaps = 0;
2444 /* By construction, all group members have INTEGER_CST DR_INITs. */
2445 while (next)
2447 /* Skip same data-refs. In case two or more stmts share a
2448 data-ref (supported only for loads), we vectorize only the first
2449 stmt, and the rest get their vectorized loads from the first
2450 one. */
2451 if (!tree_int_cst_compare (DR_INIT (data_ref),
2452 DR_INIT (STMT_VINFO_DATA_REF (next))))
2454 if (DR_IS_WRITE (data_ref))
2456 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 "Two store stmts share the same dr.\n");
2459 return false;
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_NOTE, vect_location,
2464 "Two or more load stmts share the same dr.\n");
2466 /* For loads, use the same data-ref load. */
2467 DR_GROUP_SAME_DR_STMT (next) = prev;
2469 prev = next;
2470 next = DR_GROUP_NEXT_ELEMENT (next);
2471 continue;
2474 prev = next;
2475 data_ref = STMT_VINFO_DATA_REF (next);
2477 /* All group members have the same STEP by construction. */
2478 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2480 /* Check that the distance between two accesses is equal to the type
2481 size. Otherwise, we have gaps. */
2482 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2483 - TREE_INT_CST_LOW (prev_init)) / type_size;
2484 if (diff != 1)
2486 /* FORNOW: SLP of accesses with gaps is not supported. */
2487 slp_impossible = true;
2488 if (DR_IS_WRITE (data_ref))
2490 if (dump_enabled_p ())
2491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2492 "interleaved store with gaps\n");
2493 return false;
2496 gaps += diff - 1;
2499 last_accessed_element += diff;
2501 /* Store the gap from the previous member of the group. If there is no
2502 gap in the access, DR_GROUP_GAP is always 1. */
2503 DR_GROUP_GAP (next) = diff;
2505 prev_init = DR_INIT (data_ref);
2506 next = DR_GROUP_NEXT_ELEMENT (next);
2507 /* Count the number of data-refs in the chain. */
2508 count++;
2511 if (groupsize == 0)
2512 groupsize = count + gaps;
2514 /* This could be UINT_MAX but as we are generating code in a very
2515 inefficient way we have to cap earlier. See PR78699 for example. */
2516 if (groupsize > 4096)
2518 if (dump_enabled_p ())
2519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2520 "group is too large\n");
2521 return false;
2524 /* Check that the size of the interleaving is equal to count for stores,
2525 i.e., that there are no gaps. */
2526 if (groupsize != count
2527 && !DR_IS_READ (dr))
2529 groupsize = count;
2530 STMT_VINFO_STRIDED_P (stmt_info) = true;
2533 /* If there is a gap after the last load in the group it is the
2534 difference between the groupsize and the last accessed
2535 element.
2536 When there is no gap, this difference should be 0. */
2537 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2539 DR_GROUP_SIZE (stmt_info) = groupsize;
2540 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_NOTE, vect_location,
2543 "Detected interleaving ");
2544 if (DR_IS_READ (dr))
2545 dump_printf (MSG_NOTE, "load ");
2546 else if (STMT_VINFO_STRIDED_P (stmt_info))
2547 dump_printf (MSG_NOTE, "strided store ");
2548 else
2549 dump_printf (MSG_NOTE, "store ");
2550 dump_printf (MSG_NOTE, "of size %u\n",
2551 (unsigned)groupsize);
2552 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2553 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2554 while (next)
2556 if (DR_GROUP_GAP (next) != 1)
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "\t<gap of %d elements>\n",
2559 DR_GROUP_GAP (next) - 1);
2560 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2561 next = DR_GROUP_NEXT_ELEMENT (next);
2563 if (DR_GROUP_GAP (stmt_info) != 0)
2564 dump_printf_loc (MSG_NOTE, vect_location,
2565 "\t<gap of %d elements>\n",
2566 DR_GROUP_GAP (stmt_info));
2569 /* SLP: create an SLP data structure for every interleaving group of
2570 stores for further analysis in vect_analyze_slp. */
2571 if (DR_IS_WRITE (dr) && !slp_impossible)
2573 if (loop_vinfo)
2574 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2575 if (bb_vinfo)
2576 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2580 return true;
2583 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2584 accesses of legal size, step, etc. Detect gaps, single element
2585 interleaving, and other special cases. Set grouped access info.
2586 Collect groups of strided stores for further use in SLP analysis. */
2588 static bool
2589 vect_analyze_group_access (dr_vec_info *dr_info)
2591 if (!vect_analyze_group_access_1 (dr_info))
2593 /* Dissolve the group if present. */
2594 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2595 while (stmt_info)
2597 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2598 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2599 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2600 stmt_info = next;
2602 return false;
2604 return true;
2607 /* Analyze the access pattern of the data-reference DR_INFO.
2608 In case of non-consecutive accesses call vect_analyze_group_access() to
2609 analyze groups of accesses. */
2611 static bool
2612 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2614 data_reference *dr = dr_info->dr;
2615 tree step = DR_STEP (dr);
2616 tree scalar_type = TREE_TYPE (DR_REF (dr));
2617 stmt_vec_info stmt_info = dr_info->stmt;
2618 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2619 struct loop *loop = NULL;
2621 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2622 return true;
2624 if (loop_vinfo)
2625 loop = LOOP_VINFO_LOOP (loop_vinfo);
2627 if (loop_vinfo && !step)
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "bad data-ref access in loop\n");
2632 return false;
2635 /* Allow loads with zero step in inner-loop vectorization. */
2636 if (loop_vinfo && integer_zerop (step))
2638 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2639 if (!nested_in_vect_loop_p (loop, stmt_info))
2640 return DR_IS_READ (dr);
2641 /* Allow references with zero step for outer loops marked
2642 with pragma omp simd only - it guarantees absence of
2643 loop-carried dependencies between inner loop iterations. */
2644 if (loop->safelen < 2)
2646 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "zero step in inner loop of nest\n");
2649 return false;
2653 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2655 /* Interleaved accesses are not yet supported within outer-loop
2656 vectorization for references in the inner-loop. */
2657 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2659 /* For the rest of the analysis we use the outer-loop step. */
2660 step = STMT_VINFO_DR_STEP (stmt_info);
2661 if (integer_zerop (step))
2663 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_NOTE, vect_location,
2665 "zero step in outer loop.\n");
2666 return DR_IS_READ (dr);
2670 /* Consecutive? */
2671 if (TREE_CODE (step) == INTEGER_CST)
2673 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2674 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2675 || (dr_step < 0
2676 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2678 /* Mark that it is not interleaving. */
2679 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2680 return true;
2684 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2686 if (dump_enabled_p ())
2687 dump_printf_loc (MSG_NOTE, vect_location,
2688 "grouped access in outer loop.\n");
2689 return false;
2693 /* Assume this is a DR handled by the non-constant strided load case. */
2694 if (TREE_CODE (step) != INTEGER_CST)
2695 return (STMT_VINFO_STRIDED_P (stmt_info)
2696 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2697 || vect_analyze_group_access (dr_info)));
2699 /* Non-consecutive access - check if it is part of an interleaving group. */
2700 return vect_analyze_group_access (dr_info);
2703 /* Compare two data-references DRA and DRB in order to sort them into
2704 chunks suitable for grouping. */
2706 static int
2707 dr_group_sort_cmp (const void *dra_, const void *drb_)
2709 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2710 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2711 int cmp;
2713 /* Stabilize sort. */
2714 if (dra == drb)
2715 return 0;
2717 /* DRs in different loops never belong to the same group. */
2718 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2719 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2720 if (loopa != loopb)
2721 return loopa->num < loopb->num ? -1 : 1;
2723 /* Ordering of DRs according to base. */
2724 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2725 DR_BASE_ADDRESS (drb));
2726 if (cmp != 0)
2727 return cmp;
2729 /* And according to DR_OFFSET. */
2730 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2731 if (cmp != 0)
2732 return cmp;
2734 /* Put reads before writes. */
2735 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2736 return DR_IS_READ (dra) ? -1 : 1;
2738 /* Then sort by access size. */
2739 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2740 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2741 if (cmp != 0)
2742 return cmp;
2744 /* And by step. */
2745 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2746 if (cmp != 0)
2747 return cmp;
2749 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2750 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2751 if (cmp == 0)
2752 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2753 return cmp;
2756 /* If OP is the result of a conversion, return the unconverted value,
2757 otherwise return null. */
2759 static tree
2760 strip_conversion (tree op)
2762 if (TREE_CODE (op) != SSA_NAME)
2763 return NULL_TREE;
2764 gimple *stmt = SSA_NAME_DEF_STMT (op);
2765 if (!is_gimple_assign (stmt)
2766 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2767 return NULL_TREE;
2768 return gimple_assign_rhs1 (stmt);
2771 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2772 and STMT2_INFO being in a single group. */
2774 static bool
2775 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2777 if (gimple_assign_single_p (stmt1_info->stmt))
2778 return gimple_assign_single_p (stmt2_info->stmt);
2780 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2781 if (call1 && gimple_call_internal_p (call1))
2783 /* Check for two masked loads or two masked stores. */
2784 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2785 if (!call2 || !gimple_call_internal_p (call2))
2786 return false;
2787 internal_fn ifn = gimple_call_internal_fn (call1);
2788 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2789 return false;
2790 if (ifn != gimple_call_internal_fn (call2))
2791 return false;
2793 /* Check that the masks are the same. Cope with casts of masks,
2794 like those created by build_mask_conversion. */
2795 tree mask1 = gimple_call_arg (call1, 2);
2796 tree mask2 = gimple_call_arg (call2, 2);
2797 if (!operand_equal_p (mask1, mask2, 0))
2799 mask1 = strip_conversion (mask1);
2800 if (!mask1)
2801 return false;
2802 mask2 = strip_conversion (mask2);
2803 if (!mask2)
2804 return false;
2805 if (!operand_equal_p (mask1, mask2, 0))
2806 return false;
2808 return true;
2811 return false;
2814 /* Function vect_analyze_data_ref_accesses.
2816 Analyze the access pattern of all the data references in the loop.
2818 FORNOW: the only access pattern that is considered vectorizable is a
2819 simple step 1 (consecutive) access.
2821 FORNOW: handle only arrays and pointer accesses. */
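/* For instance, with 4-byte elements the loads a[4 * i] and
   a[4 * i + 1] (DR_STEP 16, DR_INITs 0 and 4) are linked into one
   interleaving chain below, whereas a[4 * i] and b[4 * i] are not,
   because their DR_BASE_ADDRESSes differ.  */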
2823 opt_result
2824 vect_analyze_data_ref_accesses (vec_info *vinfo)
2826 unsigned int i;
2827 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2828 struct data_reference *dr;
2830 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2832 if (datarefs.is_empty ())
2833 return opt_result::success ();
2835 /* Sort the array of datarefs to make building the interleaving chains
2836 linear. Don't modify the original vector's order; it is needed for
2837 determining what dependencies are reversed. */
2838 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2839 datarefs_copy.qsort (dr_group_sort_cmp);
2840 hash_set<stmt_vec_info> to_fixup;
2842 /* Build the interleaving chains. */
2843 for (i = 0; i < datarefs_copy.length () - 1;)
2845 data_reference_p dra = datarefs_copy[i];
2846 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2847 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2848 stmt_vec_info lastinfo = NULL;
2849 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2850 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2852 ++i;
2853 continue;
2855 for (i = i + 1; i < datarefs_copy.length (); ++i)
2857 data_reference_p drb = datarefs_copy[i];
2858 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2859 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2860 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2861 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2862 break;
2864 /* ??? Imperfect sorting (non-compatible types, non-modulo
2865 accesses, same accesses) can lead to a group being artificially
2866 split here as we don't just skip over those. If it really
2867 matters we can push those to a worklist and re-iterate
2868 over them. Then we can just skip ahead to the next DR here. */
2870 /* DRs in different loops should not be put into the same
2871 interleaving group. */
2872 if (gimple_bb (DR_STMT (dra))->loop_father
2873 != gimple_bb (DR_STMT (drb))->loop_father)
2874 break;
2876 /* Check that the data-refs have the same first location (except init)
2877 and they are both either store or load (not load and store,
2878 not masked loads or stores). */
2879 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2880 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2881 DR_BASE_ADDRESS (drb)) != 0
2882 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2883 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2884 break;
2886 /* Check that the data-refs have the same constant size. */
2887 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2888 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2889 if (!tree_fits_uhwi_p (sza)
2890 || !tree_fits_uhwi_p (szb)
2891 || !tree_int_cst_equal (sza, szb))
2892 break;
2894 /* Check that the data-refs have the same step. */
2895 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2896 break;
2898 /* Check the types are compatible.
2899 ??? We don't distinguish this during sorting. */
2900 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2901 TREE_TYPE (DR_REF (drb))))
2902 break;
2904 /* Check that the DR_INITs are compile-time constants. */
2905 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2906 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2907 break;
2909 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2910 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2911 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2912 HOST_WIDE_INT init_prev
2913 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2914 gcc_assert (init_a <= init_b
2915 && init_a <= init_prev
2916 && init_prev <= init_b);
2918 /* Do not place the same access in the interleaving chain twice. */
2919 if (init_b == init_prev)
2921 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2922 < gimple_uid (DR_STMT (drb)));
2923 /* Simply link in duplicates and fix up the chain below. */
2925 else
2927 /* If init_b == init_a + the size of the type * k, we have an
2928 interleaving, and DRA is accessed before DRB. */
2929 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2930 if (type_size_a == 0
2931 || (init_b - init_a) % type_size_a != 0)
2932 break;
2934 /* If we have a store, the accesses are adjacent. This splits
2935 groups into chunks we support (we don't support vectorization
2936 of stores with gaps). */
2937 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2938 break;
2940 /* If the step (unless zero or non-constant) is greater than the
2941 difference between the data-refs' inits, this splits groups into
2942 suitable sizes. */
2943 if (tree_fits_shwi_p (DR_STEP (dra)))
2945 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2946 if (step != 0 && step <= (init_b - init_a))
2947 break;
2951 if (dump_enabled_p ())
2952 dump_printf_loc (MSG_NOTE, vect_location,
2953 DR_IS_READ (dra)
2954 ? "Detected interleaving load %T and %T\n"
2955 : "Detected interleaving store %T and %T\n",
2956 DR_REF (dra), DR_REF (drb));
2958 /* Link the found element into the group list. */
2959 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2961 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2962 lastinfo = stmtinfo_a;
2964 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2965 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2966 lastinfo = stmtinfo_b;
2968 if (init_b == init_prev
2969 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2970 && dump_enabled_p ())
2971 dump_printf_loc (MSG_NOTE, vect_location,
2972 "Queuing group with duplicate access for fixup\n");
2976 /* Fixup groups with duplicate entries by splitting them. */
2977 while (1)
2979 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
2980 if (!(it != to_fixup.end ()))
2981 break;
2982 stmt_vec_info grp = *it;
2983 to_fixup.remove (grp);
2985 /* Find the earliest duplicate group member. */
2986 unsigned first_duplicate = -1u;
2987 stmt_vec_info next, g = grp;
2988 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
2990 if ((DR_INIT (STMT_VINFO_DR_INFO (next)->dr)
2991 == DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
2992 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
2993 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
2994 g = next;
2996 if (first_duplicate == -1U)
2997 continue;
2999 /* Then move all stmts after the first duplicate to a new group.
3000 Note this is a heuristic but one with the property that *it
3001 is fixed up completely. */
3002 g = grp;
3003 stmt_vec_info newgroup = NULL, ng = grp;
3004 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3006 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3008 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3009 if (!newgroup)
3010 newgroup = next;
3011 else
3012 DR_GROUP_NEXT_ELEMENT (ng) = next;
3013 ng = next;
3014 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3016 else
3017 g = DR_GROUP_NEXT_ELEMENT (g);
3019 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3021 /* Fixup the new group which still may contain duplicates. */
3022 to_fixup.add (newgroup);
3025 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3027 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3028 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3029 && !vect_analyze_data_ref_access (dr_info))
3031 if (dump_enabled_p ())
3032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3033 "not vectorized: complicated access pattern.\n");
3035 if (is_a <bb_vec_info> (vinfo))
3037 /* Mark the statement as not vectorizable. */
3038 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3039 continue;
3041 else
3043 datarefs_copy.release ();
3044 return opt_result::failure_at (dr_info->stmt->stmt,
3045 "not vectorized:"
3046 " complicated access pattern.\n");
3051 datarefs_copy.release ();
3052 return opt_result::success ();
3055 /* Function vect_vfa_segment_size.
3057 Input:
3058 DR_INFO: The data reference.
3059 LENGTH_FACTOR: segment length to consider.
3061 Return a value suitable for the dr_with_seg_len::seg_len field.
3062 This is the "distance travelled" by the pointer from the first
3063 iteration in the segment to the last. Note that it does not include
3064 the size of the access; in effect it only describes the first byte. */
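/* E.g. for a data reference with DR_STEP 4 and LENGTH_FACTOR 8 the
   segment length is 4 * (8 - 1) = 28 bytes: the pointer ends up seven
   4-byte steps beyond its first value, with the access size itself
   accounted for separately by the caller.  */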
3066 static tree
3067 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3069 length_factor = size_binop (MINUS_EXPR,
3070 fold_convert (sizetype, length_factor),
3071 size_one_node);
3072 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3073 length_factor);
3076 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3077 gives the worst-case number of bytes covered by the segment. */
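/* For instance, four contiguous 4-byte accesses in a group with
   DR_GROUP_GAP 1 give 4 * (4 - 1) = 12 bytes; for an access using
   dr_explicit_realign_optimized the difference between the vector size
   and the scalar reference size is added on top, since a full vector
   may be read.  */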
3079 static unsigned HOST_WIDE_INT
3080 vect_vfa_access_size (dr_vec_info *dr_info)
3082 stmt_vec_info stmt_vinfo = dr_info->stmt;
3083 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3084 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3085 unsigned HOST_WIDE_INT access_size = ref_size;
3086 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3088 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3089 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3091 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3092 && (vect_supportable_dr_alignment (dr_info, false)
3093 == dr_explicit_realign_optimized))
3095 /* We might access a full vector's worth. */
3096 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3097 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3099 return access_size;
3102 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3103 describes. */
3105 static unsigned int
3106 vect_vfa_align (dr_vec_info *dr_info)
3108 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3111 /* Function vect_compile_time_alias.
3113 Given data references A and B with equal base and offset, see whether
3114 the alias relation can be decided at compilation time. Return 1 if
3115 it can and the references alias, 0 if it can and the references do
3116 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3117 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3118 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3120 static int
3121 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3122 tree segment_length_a, tree segment_length_b,
3123 unsigned HOST_WIDE_INT access_size_a,
3124 unsigned HOST_WIDE_INT access_size_b)
3126 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3127 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3128 poly_uint64 const_length_a;
3129 poly_uint64 const_length_b;
3131 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3132 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3133 [a, a+12) */
3134 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3136 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3137 offset_a = (offset_a + access_size_a) - const_length_a;
3139 else
3140 const_length_a = tree_to_poly_uint64 (segment_length_a);
3141 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3143 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3144 offset_b = (offset_b + access_size_b) - const_length_b;
3146 else
3147 const_length_b = tree_to_poly_uint64 (segment_length_b);
3149 const_length_a += access_size_a;
3150 const_length_b += access_size_b;
3152 if (ranges_known_overlap_p (offset_a, const_length_a,
3153 offset_b, const_length_b))
3154 return 1;
3156 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3157 offset_b, const_length_b))
3158 return 0;
3160 return -1;
3163 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3164 in DDR is >= VF. */
3166 static bool
3167 dependence_distance_ge_vf (data_dependence_relation *ddr,
3168 unsigned int loop_depth, poly_uint64 vf)
3170 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3171 || DDR_NUM_DIST_VECTS (ddr) == 0)
3172 return false;
3174 /* If the dependence is exact, we should have limited the VF instead. */
3175 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3177 unsigned int i;
3178 lambda_vector dist_v;
3179 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3181 HOST_WIDE_INT dist = dist_v[loop_depth];
3182 if (dist != 0
3183 && !(dist > 0 && DDR_REVERSED_P (ddr))
3184 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3185 return false;
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "dependence distance between %T and %T is >= VF\n",
3191 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3193 return true;
3196 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3198 static void
3199 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3201 dump_printf (dump_kind, "%s (%T) >= ",
3202 lower_bound.unsigned_p ? "unsigned" : "abs",
3203 lower_bound.expr);
3204 dump_dec (dump_kind, lower_bound.min_value);
3207 /* Record that the vectorized loop requires the vec_lower_bound described
3208 by EXPR, UNSIGNED_P and MIN_VALUE. */
3210 static void
3211 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3212 poly_uint64 min_value)
3214 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3215 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3216 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3218 unsigned_p &= lower_bounds[i].unsigned_p;
3219 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3220 if (lower_bounds[i].unsigned_p != unsigned_p
3221 || maybe_lt (lower_bounds[i].min_value, min_value))
3223 lower_bounds[i].unsigned_p = unsigned_p;
3224 lower_bounds[i].min_value = min_value;
3225 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_NOTE, vect_location,
3228 "updating run-time check to ");
3229 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3230 dump_printf (MSG_NOTE, "\n");
3233 return;
3236 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3237 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3240 dump_lower_bound (MSG_NOTE, lower_bound);
3241 dump_printf (MSG_NOTE, "\n");
3243 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3246 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3247 will span fewer than GAP bytes. */
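/* E.g. with an estimated VF of 4, a group size of 2 and 4-byte scalars,
   one vector iteration is expected to advance the access by about
   4 * 2 * 4 = 32 bytes, so any GAP of at most 32 bytes counts as
   small.  */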
3249 static bool
3250 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3251 poly_int64 gap)
3253 stmt_vec_info stmt_info = dr_info->stmt;
3254 HOST_WIDE_INT count
3255 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3256 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3257 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3258 return (estimated_poly_value (gap)
3259 <= count * vect_get_scalar_dr_size (dr_info));
3262 /* Return true if we know that there is no alias between DR_INFO_A and
3263 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3264 When returning true, set *LOWER_BOUND_OUT to this N. */
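/* E.g. for accesses to p[i] and p[i + 3] with 4-byte elements, the two
   references together span 3 * 4 + 4 = 16 bytes per scalar iteration,
   so there is no aliasing between iterations whenever
   abs (DR_STEP) >= 16, and *LOWER_BOUND_OUT is set to 16.  */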
3266 static bool
3267 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3268 poly_uint64 *lower_bound_out)
3270 /* Check that there is a constant gap of known sign between DR_A
3271 and DR_B. */
3272 data_reference *dr_a = dr_info_a->dr;
3273 data_reference *dr_b = dr_info_b->dr;
3274 poly_int64 init_a, init_b;
3275 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3276 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3277 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3278 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3279 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3280 || !ordered_p (init_a, init_b))
3281 return false;
3283 /* Sort DR_A and DR_B by the address they access. */
3284 if (maybe_lt (init_b, init_a))
3286 std::swap (init_a, init_b);
3287 std::swap (dr_info_a, dr_info_b);
3288 std::swap (dr_a, dr_b);
3291 /* If the two accesses could be dependent within a scalar iteration,
3292 make sure that we'd retain their order. */
3293 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3294 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3295 return false;
3297 /* There is no alias if abs (DR_STEP) is greater than or equal to
3298 the bytes spanned by the combination of the two accesses. */
3299 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3300 return true;
3303 /* Function vect_prune_runtime_alias_test_list.
3305 Prune a list of ddrs to be tested at run-time by versioning for alias.
3306 Merge several alias checks into one if possible.
3307 Return FALSE if the resulting list of ddrs is longer than allowed by
3308 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
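/* Each pair that survives the pruning below is later turned into a
   run-time test that is roughly of the form

       end_a <= start_b || end_b <= start_a

   where each [START, END) range covers the bytes a data reference
   touches over one chunk of LENGTH_FACTOR iterations (its segment
   length plus its access size).  */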
3310 opt_result
3311 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3313 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3314 hash_set <tree_pair_hash> compared_objects;
3316 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3317 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3318 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3319 vec<vec_object_pair> &check_unequal_addrs
3320 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3321 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3322 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3324 ddr_p ddr;
3325 unsigned int i;
3326 tree length_factor;
3328 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3330 /* Step values are irrelevant for aliasing if the number of vector
3331 iterations is equal to the number of scalar iterations (which can
3332 happen for fully-SLP loops). */
3333 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3335 if (!ignore_step_p)
3337 /* Convert the checks for nonzero steps into bound tests. */
3338 tree value;
3339 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3340 vect_check_lower_bound (loop_vinfo, value, true, 1);
3343 if (may_alias_ddrs.is_empty ())
3344 return opt_result::success ();
3346 comp_alias_ddrs.create (may_alias_ddrs.length ());
3348 unsigned int loop_depth
3349 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3350 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3352 /* First, we collect all data ref pairs for aliasing checks. */
3353 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3355 int comp_res;
3356 poly_uint64 lower_bound;
3357 tree segment_length_a, segment_length_b;
3358 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3359 unsigned int align_a, align_b;
3361 /* Ignore the alias if the VF we chose ended up being no greater
3362 than the dependence distance. */
3363 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3364 continue;
3366 if (DDR_OBJECT_A (ddr))
3368 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3369 if (!compared_objects.add (new_pair))
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_NOTE, vect_location,
3373 "checking that %T and %T"
3374 " have different addresses\n",
3375 new_pair.first, new_pair.second);
3376 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3378 continue;
3381 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3382 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3384 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3385 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3387 /* Skip the pair if inter-iteration dependencies are irrelevant
3388 and intra-iteration dependencies are guaranteed to be honored. */
3389 if (ignore_step_p
3390 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3391 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3392 &lower_bound)))
3394 if (dump_enabled_p ())
3395 dump_printf_loc (MSG_NOTE, vect_location,
3396 "no need for alias check between "
3397 "%T and %T when VF is 1\n",
3398 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3399 continue;
3402 /* See whether we can handle the alias using a bounds check on
3403 the step, and whether that's likely to be the best approach.
3404 (It might not be, for example, if the minimum step is much larger
3405 than the number of bytes handled by one vector iteration.) */
3406 if (!ignore_step_p
3407 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3408 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3409 &lower_bound)
3410 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3411 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3413 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3417 "%T and %T when the step %T is outside ",
3418 DR_REF (dr_info_a->dr),
3419 DR_REF (dr_info_b->dr),
3420 DR_STEP (dr_info_a->dr));
3421 if (unsigned_p)
3422 dump_printf (MSG_NOTE, "[0");
3423 else
3425 dump_printf (MSG_NOTE, "(");
3426 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3428 dump_printf (MSG_NOTE, ", ");
3429 dump_dec (MSG_NOTE, lower_bound);
3430 dump_printf (MSG_NOTE, ")\n");
3432 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3433 unsigned_p, lower_bound);
3434 continue;
3437 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3438 if (dr_group_first_a)
3440 stmt_info_a = dr_group_first_a;
3441 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3444 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3445 if (dr_group_first_b)
3447 stmt_info_b = dr_group_first_b;
3448 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3451 if (ignore_step_p)
3453 segment_length_a = size_zero_node;
3454 segment_length_b = size_zero_node;
3456 else
3458 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3459 DR_STEP (dr_info_b->dr), 0))
3460 length_factor = scalar_loop_iters;
3461 else
3462 length_factor = size_int (vect_factor);
3463 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3464 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3466 access_size_a = vect_vfa_access_size (dr_info_a);
3467 access_size_b = vect_vfa_access_size (dr_info_b);
3468 align_a = vect_vfa_align (dr_info_a);
3469 align_b = vect_vfa_align (dr_info_b);
3471 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3472 DR_BASE_ADDRESS (dr_info_b->dr));
3473 if (comp_res == 0)
3474 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3475 DR_OFFSET (dr_info_b->dr));
3477 /* See whether the alias is known at compilation time. */
3478 if (comp_res == 0
3479 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3480 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3481 && poly_int_tree_p (segment_length_a)
3482 && poly_int_tree_p (segment_length_b))
3484 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3485 segment_length_a,
3486 segment_length_b,
3487 access_size_a,
3488 access_size_b);
3489 if (res >= 0 && dump_enabled_p ())
3491 dump_printf_loc (MSG_NOTE, vect_location,
3492 "can tell at compile time that %T and %T",
3493 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3494 if (res == 0)
3495 dump_printf (MSG_NOTE, " do not alias\n");
3496 else
3497 dump_printf (MSG_NOTE, " alias\n");
3500 if (res == 0)
3501 continue;
3503 if (res == 1)
3504 return opt_result::failure_at (stmt_info_b->stmt,
3505 "not vectorized:"
3506 " compilation time alias: %G%G",
3507 stmt_info_a->stmt,
3508 stmt_info_b->stmt);
3511 dr_with_seg_len_pair_t dr_with_seg_len_pair
3512 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3513 access_size_a, align_a),
3514 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3515 access_size_b, align_b));
3517 /* Canonicalize pairs by sorting the two DR members. */
3518 if (comp_res > 0)
3519 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3521 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3524 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3526 unsigned int count = (comp_alias_ddrs.length ()
3527 + check_unequal_addrs.length ());
3529 dump_printf_loc (MSG_NOTE, vect_location,
3530 "improved number of alias checks from %d to %d\n",
3531 may_alias_ddrs.length (), count);
3532 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3533 return opt_result::failure_at
3534 (vect_location,
3535 "number of versioning for alias "
3536 "run-time tests exceeds %d "
3537 "(--param vect-max-version-for-alias-checks)\n",
3538 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3540 return opt_result::success ();
3543 /* Check whether we can use an internal function for a gather load
3544 or scatter store. READ_P is true for loads and false for stores.
3545 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3546 the type of the memory elements being loaded or stored. OFFSET_BITS
3547 is the number of bits in each scalar offset and OFFSET_SIGN is the
3548 sign of the offset. SCALE is the amount by which the offset should
3549 be multiplied *after* it has been converted to address width.
3551 Return true if the function is supported, storing the function
3552 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
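/* For example (an assumed target capability, purely illustrative): a masked
   gather of V4SI elements from 32-bit integer memory with 32-bit signed
   offsets and SCALE == 4 maps to IFN_MASK_GATHER_LOAD, provided
   internal_gather_scatter_fn_supported_p accepts that combination.  */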
3554 bool
3555 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3556 tree memory_type, unsigned int offset_bits,
3557 signop offset_sign, int scale,
3558 internal_fn *ifn_out, tree *element_type_out)
3560 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3561 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3562 if (offset_bits > element_bits)
3563 /* Internal functions require the offset to be the same width as
3564 the vector elements. We can extend narrower offsets, but it isn't
3565 safe to truncate wider offsets. */
3566 return false;
3568 if (element_bits != memory_bits)
3569 /* For now the vector elements must be the same width as the
3570 memory elements. */
3571 return false;
3573 /* Work out which function we need. */
3574 internal_fn ifn;
3575 if (read_p)
3576 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3577 else
3578 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3580 /* Test whether the target supports this combination. */
3581 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3582 offset_sign, scale))
3583 return false;
3585 *ifn_out = ifn;
3586 *element_type_out = TREE_TYPE (vectype);
3587 return true;
3590 /* STMT_INFO is a call to an internal gather load or scatter store function.
3591 Describe the operation in INFO. */
3593 static void
3594 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3595 gather_scatter_info *info)
3597 gcall *call = as_a <gcall *> (stmt_info->stmt);
3598 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3599 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3601 info->ifn = gimple_call_internal_fn (call);
3602 info->decl = NULL_TREE;
3603 info->base = gimple_call_arg (call, 0);
3604 info->offset = gimple_call_arg (call, 1);
3605 info->offset_dt = vect_unknown_def_type;
3606 info->offset_vectype = NULL_TREE;
3607 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3608 info->element_type = TREE_TYPE (vectype);
3609 info->memory_type = TREE_TYPE (DR_REF (dr));
3612 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3613 gather load or scatter store. Describe the operation in *INFO if so. */
3615 bool
3616 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3617 gather_scatter_info *info)
3619 HOST_WIDE_INT scale = 1;
3620 poly_int64 pbitpos, pbitsize;
3621 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3622 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3623 tree offtype = NULL_TREE;
3624 tree decl = NULL_TREE, base, off;
3625 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3626 tree memory_type = TREE_TYPE (DR_REF (dr));
3627 machine_mode pmode;
3628 int punsignedp, reversep, pvolatilep = 0;
3629 internal_fn ifn;
3630 tree element_type;
3631 bool masked_p = false;
3633 /* See whether this is already a call to a gather/scatter internal function.
3634 If not, see whether it's a masked load or store. */
3635 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3636 if (call && gimple_call_internal_p (call))
3638 ifn = gimple_call_internal_fn (call);
3639 if (internal_gather_scatter_fn_p (ifn))
3641 vect_describe_gather_scatter_call (stmt_info, info);
3642 return true;
3644 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3647 /* True if we should aim to use internal functions rather than
3648 built-in functions. */
3649 bool use_ifn_p = (DR_IS_READ (dr)
3650 ? supports_vec_gather_load_p ()
3651 : supports_vec_scatter_store_p ());
3653 base = DR_REF (dr);
3654 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3655 see if we can use the def stmt of the address. */
3656 if (masked_p
3657 && TREE_CODE (base) == MEM_REF
3658 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3659 && integer_zerop (TREE_OPERAND (base, 1))
3660 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3662 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3663 if (is_gimple_assign (def_stmt)
3664 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3665 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3668 /* The gather and scatter builtins need address of the form
3669 loop_invariant + vector * {1, 2, 4, 8}
3670 or
3671 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3672 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3673 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3674 multiplications and additions in it. To get a vector, we need
3675 a single SSA_NAME that will be defined in the loop and will
3676 contain everything that is not loop invariant and that can be
3677 vectorized. The following code attempts to find such a preexisting
3678 SSA_NAME OFF and put the loop invariants into a tree BASE
3679 that can be gimplified before the loop. */
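/* A concrete illustration with invented names:

       for (i = 0; i < n; i++)
         sum += a[idx[i]];

   where 'a' has 4-byte elements: the code below aims to end up with
   BASE == &a, OFF == the SSA_NAME computing idx[i] and SCALE == 4,
   i.e. the required &a + sign_extend (idx[i]) * 4 form.  */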
3680 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3681 &punsignedp, &reversep, &pvolatilep);
3682 if (reversep)
3683 return false;
3685 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3687 if (TREE_CODE (base) == MEM_REF)
3689 if (!integer_zerop (TREE_OPERAND (base, 1)))
3691 if (off == NULL_TREE)
3692 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3693 else
3694 off = size_binop (PLUS_EXPR, off,
3695 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3697 base = TREE_OPERAND (base, 0);
3699 else
3700 base = build_fold_addr_expr (base);
3702 if (off == NULL_TREE)
3703 off = size_zero_node;
3705 /* If base is not loop invariant, then if off is 0 we start with just
3706 the constant offset in the loop invariant BASE and continue with base
3707 as OFF; otherwise we give up.
3708 We could handle the latter case by gimplifying the addition of base + off
3709 into some SSA_NAME and using that as off, but for now punt. */
3710 if (!expr_invariant_in_loop_p (loop, base))
3712 if (!integer_zerop (off))
3713 return false;
3714 off = base;
3715 base = size_int (pbytepos);
3717 /* Otherwise put base + constant offset into the loop invariant BASE
3718 and continue with OFF. */
3719 else
3721 base = fold_convert (sizetype, base);
3722 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3725 /* OFF at this point may be either a SSA_NAME or some tree expression
3726 from get_inner_reference. Try to peel off loop invariants from it
3727 into BASE as long as possible. */
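/* For instance (hypothetical SSA names): if OFF is currently _7, defined by
   _7 = _5 + 32 with 32 loop invariant, the PLUS_EXPR case below folds
   32 * SCALE into BASE and continues peeling with OFF == _5.  */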
3728 STRIP_NOPS (off);
3729 while (offtype == NULL_TREE)
3731 enum tree_code code;
3732 tree op0, op1, add = NULL_TREE;
3734 if (TREE_CODE (off) == SSA_NAME)
3736 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3738 if (expr_invariant_in_loop_p (loop, off))
3739 return false;
3741 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3742 break;
3744 op0 = gimple_assign_rhs1 (def_stmt);
3745 code = gimple_assign_rhs_code (def_stmt);
3746 op1 = gimple_assign_rhs2 (def_stmt);
3748 else
3750 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3751 return false;
3752 code = TREE_CODE (off);
3753 extract_ops_from_tree (off, &code, &op0, &op1);
3755 switch (code)
3757 case POINTER_PLUS_EXPR:
3758 case PLUS_EXPR:
3759 if (expr_invariant_in_loop_p (loop, op0))
3761 add = op0;
3762 off = op1;
3763 do_add:
3764 add = fold_convert (sizetype, add);
3765 if (scale != 1)
3766 add = size_binop (MULT_EXPR, add, size_int (scale));
3767 base = size_binop (PLUS_EXPR, base, add);
3768 continue;
3770 if (expr_invariant_in_loop_p (loop, op1))
3772 add = op1;
3773 off = op0;
3774 goto do_add;
3776 break;
3777 case MINUS_EXPR:
3778 if (expr_invariant_in_loop_p (loop, op1))
3780 add = fold_convert (sizetype, op1);
3781 add = size_binop (MINUS_EXPR, size_zero_node, add);
3782 off = op0;
3783 goto do_add;
3785 break;
3786 case MULT_EXPR:
3787 if (scale == 1 && tree_fits_shwi_p (op1))
3789 int new_scale = tree_to_shwi (op1);
3790 /* Only treat this as a scaling operation if the target
3791 supports it. */
3792 if (use_ifn_p
3793 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3794 vectype, memory_type, 1,
3795 TYPE_SIGN (TREE_TYPE (op0)),
3796 new_scale, &ifn,
3797 &element_type))
3798 break;
3799 scale = new_scale;
3800 off = op0;
3801 continue;
3803 break;
3804 case SSA_NAME:
3805 off = op0;
3806 continue;
3807 CASE_CONVERT:
3808 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3809 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3810 break;
3811 if (TYPE_PRECISION (TREE_TYPE (op0))
3812 == TYPE_PRECISION (TREE_TYPE (off)))
3814 off = op0;
3815 continue;
3818 /* The internal functions need the offset to be the same width
3819 as the elements of VECTYPE. Don't include operations that
3820 cast the offset from that width to a different width. */
3821 if (use_ifn_p
3822 && (int_size_in_bytes (TREE_TYPE (vectype))
3823 == int_size_in_bytes (TREE_TYPE (off))))
3824 break;
3826 if (TYPE_PRECISION (TREE_TYPE (op0))
3827 < TYPE_PRECISION (TREE_TYPE (off)))
3829 off = op0;
3830 offtype = TREE_TYPE (off);
3831 STRIP_NOPS (off);
3832 continue;
3834 break;
3835 default:
3836 break;
3838 break;
3841 /* If at the end OFF still isn't a SSA_NAME or isn't
3842 defined in the loop, punt. */
3843 if (TREE_CODE (off) != SSA_NAME
3844 || expr_invariant_in_loop_p (loop, off))
3845 return false;
3847 if (offtype == NULL_TREE)
3848 offtype = TREE_TYPE (off);
3850 if (use_ifn_p)
3852 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3853 memory_type, TYPE_PRECISION (offtype),
3854 TYPE_SIGN (offtype), scale, &ifn,
3855 &element_type))
3856 return false;
3858 else
3860 if (DR_IS_READ (dr))
3862 if (targetm.vectorize.builtin_gather)
3863 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3865 else
3867 if (targetm.vectorize.builtin_scatter)
3868 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3871 if (!decl)
3872 return false;
3874 ifn = IFN_LAST;
3875 element_type = TREE_TYPE (vectype);
3878 info->ifn = ifn;
3879 info->decl = decl;
3880 info->base = base;
3881 info->offset = off;
3882 info->offset_dt = vect_unknown_def_type;
3883 info->offset_vectype = NULL_TREE;
3884 info->scale = scale;
3885 info->element_type = element_type;
3886 info->memory_type = memory_type;
3887 return true;
3890 /* Find the data references in STMT, analyze them with respect to LOOP and
3891 append them to DATAREFS. Return false if datarefs in this stmt cannot
3892 be handled. */
3894 opt_result
3895 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3896 vec<data_reference_p> *datarefs)
3898 /* We can ignore clobbers for dataref analysis - they are removed during
3899 loop vectorization and BB vectorization checks dependences with a
3900 stmt walk. */
3901 if (gimple_clobber_p (stmt))
3902 return opt_result::success ();
3904 if (gimple_has_volatile_ops (stmt))
3905 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3906 stmt);
3908 if (stmt_can_throw_internal (cfun, stmt))
3909 return opt_result::failure_at (stmt,
3910 "not vectorized:"
3911 " statement can throw an exception: %G",
3912 stmt);
3914 auto_vec<data_reference_p, 2> refs;
3915 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3916 if (!res)
3917 return res;
3919 if (refs.is_empty ())
3920 return opt_result::success ();
3922 if (refs.length () > 1)
3923 return opt_result::failure_at (stmt,
3924 "not vectorized:"
3925 " more than one data ref in stmt: %G", stmt);
3927 if (gcall *call = dyn_cast <gcall *> (stmt))
3928 if (!gimple_call_internal_p (call)
3929 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3930 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3931 return opt_result::failure_at (stmt,
3932 "not vectorized: dr in a call %G", stmt);
3934 data_reference_p dr = refs.pop ();
3935 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3936 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3937 return opt_result::failure_at (stmt,
3938 "not vectorized:"
3939 " statement is bitfield access %G", stmt);
3941 if (DR_BASE_ADDRESS (dr)
3942 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3943 return opt_result::failure_at (stmt,
3944 "not vectorized:"
3945 " base addr of dr is a constant\n");
3947 /* Check whether this may be a SIMD lane access and adjust the
3948 DR to make it easier for us to handle it. */
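/* Illustrative GIMPLE pattern (invented names): a reference such as
   simdarr[_l] with _l = GOMP_SIMD_LANE (simduid), whose analyzed offset is
   _l * sizeof (element); the code below rewrites such a DR to a zero
   offset and a step of one element size.  */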
3949 if (loop
3950 && loop->simduid
3951 && (!DR_BASE_ADDRESS (dr)
3952 || !DR_OFFSET (dr)
3953 || !DR_INIT (dr)
3954 || !DR_STEP (dr)))
3956 struct data_reference *newdr
3957 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3958 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3959 if (DR_BASE_ADDRESS (newdr)
3960 && DR_OFFSET (newdr)
3961 && DR_INIT (newdr)
3962 && DR_STEP (newdr)
3963 && integer_zerop (DR_STEP (newdr)))
3965 tree off = DR_OFFSET (newdr);
3966 STRIP_NOPS (off);
3967 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3968 && TREE_CODE (off) == MULT_EXPR
3969 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3971 tree step = TREE_OPERAND (off, 1);
3972 off = TREE_OPERAND (off, 0);
3973 STRIP_NOPS (off);
3974 if (CONVERT_EXPR_P (off)
3975 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
3976 < TYPE_PRECISION (TREE_TYPE (off))))
3977 off = TREE_OPERAND (off, 0);
3978 if (TREE_CODE (off) == SSA_NAME)
3980 gimple *def = SSA_NAME_DEF_STMT (off);
3981 tree reft = TREE_TYPE (DR_REF (newdr));
3982 if (is_gimple_call (def)
3983 && gimple_call_internal_p (def)
3984 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
3986 tree arg = gimple_call_arg (def, 0);
3987 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3988 arg = SSA_NAME_VAR (arg);
3989 if (arg == loop->simduid
3990 /* For now. */
3991 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
3993 DR_OFFSET (newdr) = ssize_int (0);
3994 DR_STEP (newdr) = step;
3995 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
3996 DR_STEP_ALIGNMENT (newdr)
3997 = highest_pow2_factor (step);
3998 /* Mark as simd-lane access. */
3999 newdr->aux = (void *)-1;
4000 free_data_ref (dr);
4001 datarefs->safe_push (newdr);
4002 return opt_result::success ();
4008 free_data_ref (newdr);
4011 datarefs->safe_push (dr);
4012 return opt_result::success ();
4015 /* Function vect_analyze_data_refs.
4017 Find all the data references in the loop or basic block.
4019 The general structure of the analysis of data refs in the vectorizer is as
4020 follows:
4021 1- vect_analyze_data_refs(loop/bb): call
4022 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4023 in the loop/bb and their dependences.
4024 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4025 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4026 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4030 opt_result
4031 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4033 struct loop *loop = NULL;
4034 unsigned int i;
4035 struct data_reference *dr;
4036 tree scalar_type;
4038 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4040 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4041 loop = LOOP_VINFO_LOOP (loop_vinfo);
4043 /* Go through the data-refs, check that the analysis succeeded. Update
4044 pointer from stmt_vec_info struct to DR and vectype. */
4046 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4047 FOR_EACH_VEC_ELT (datarefs, i, dr)
4049 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4050 poly_uint64 vf;
4052 gcc_assert (DR_REF (dr));
4053 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4054 gcc_assert (!stmt_info->dr_aux.dr);
4055 stmt_info->dr_aux.dr = dr;
4056 stmt_info->dr_aux.stmt = stmt_info;
4058 /* Check that analysis of the data-ref succeeded. */
4059 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4060 || !DR_STEP (dr))
4062 bool maybe_gather
4063 = DR_IS_READ (dr)
4064 && !TREE_THIS_VOLATILE (DR_REF (dr))
4065 && (targetm.vectorize.builtin_gather != NULL
4066 || supports_vec_gather_load_p ());
4067 bool maybe_scatter
4068 = DR_IS_WRITE (dr)
4069 && !TREE_THIS_VOLATILE (DR_REF (dr))
4070 && (targetm.vectorize.builtin_scatter != NULL
4071 || supports_vec_scatter_store_p ());
4073 /* If the target supports vector gather loads or scatter stores,
4074 see if they can be used. */
4075 if (is_a <loop_vec_info> (vinfo)
4076 && !nested_in_vect_loop_p (loop, stmt_info))
4078 if (maybe_gather || maybe_scatter)
4080 if (maybe_gather)
4081 gatherscatter = GATHER;
4082 else
4083 gatherscatter = SCATTER;
4087 if (gatherscatter == SG_NONE)
4089 if (dump_enabled_p ())
4090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4091 "not vectorized: data ref analysis "
4092 "failed %G", stmt_info->stmt);
4093 if (is_a <bb_vec_info> (vinfo))
4095 /* In BB vectorization the ref can still participate
4096 in dependence analysis, we just can't vectorize it. */
4097 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4098 continue;
4100 return opt_result::failure_at (stmt_info->stmt,
4101 "not vectorized:"
4102 " data ref analysis failed: %G",
4103 stmt_info->stmt);
4107 /* See if this was detected as SIMD lane access. */
4108 if (dr->aux == (void *)-1)
4110 if (nested_in_vect_loop_p (loop, stmt_info))
4111 return opt_result::failure_at (stmt_info->stmt,
4112 "not vectorized:"
4113 " data ref analysis failed: %G",
4114 stmt_info->stmt);
4115 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4118 tree base = get_base_address (DR_REF (dr));
4119 if (base && VAR_P (base) && DECL_NONALIASED (base))
4121 if (dump_enabled_p ())
4122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4123 "not vectorized: base object not addressable "
4124 "for stmt: %G", stmt_info->stmt);
4125 if (is_a <bb_vec_info> (vinfo))
4127 /* In BB vectorization the ref can still participate
4128 in dependence analysis, we just can't vectorize it. */
4129 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4130 continue;
4132 return opt_result::failure_at (stmt_info->stmt,
4133 "not vectorized: base object not"
4134 " addressable for stmt: %G",
4135 stmt_info->stmt);
4138 if (is_a <loop_vec_info> (vinfo)
4139 && DR_STEP (dr)
4140 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4142 if (nested_in_vect_loop_p (loop, stmt_info))
4143 return opt_result::failure_at (stmt_info->stmt,
4144 "not vectorized:"
4145 " not suitable for strided load %G",
4146 stmt_info->stmt);
4147 STMT_VINFO_STRIDED_P (stmt_info) = true;
4150 /* Update DR field in stmt_vec_info struct. */
4152 /* If the dataref is in an inner-loop of the loop that is considered for
4153 vectorization, we also want to analyze the access relative to
4154 the outer-loop (DR contains information only relative to the
4155 inner-most enclosing loop). We do that by building a reference to the
4156 first location accessed by the inner-loop, and analyze it relative to
4157 the outer-loop. */
4158 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4160 /* Build a reference to the first location accessed by the
4161 inner loop: *(BASE + INIT + OFFSET). By construction,
4162 this address must be invariant in the inner loop, so we
4163 can consider it as being used in the outer loop. */
4164 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4165 tree offset = unshare_expr (DR_OFFSET (dr));
4166 tree init = unshare_expr (DR_INIT (dr));
4167 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4168 init, offset);
4169 tree init_addr = fold_build_pointer_plus (base, init_offset);
4170 tree init_ref = build_fold_indirect_ref (init_addr);
4172 if (dump_enabled_p ())
4173 dump_printf_loc (MSG_NOTE, vect_location,
4174 "analyze in outer loop: %T\n", init_ref);
4176 opt_result res
4177 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4178 init_ref, loop, stmt_info->stmt);
4179 if (!res)
4180 /* dr_analyze_innermost already explained the failure. */
4181 return res;
4183 if (dump_enabled_p ())
4184 dump_printf_loc (MSG_NOTE, vect_location,
4185 "\touter base_address: %T\n"
4186 "\touter offset from base address: %T\n"
4187 "\touter constant offset from base address: %T\n"
4188 "\touter step: %T\n"
4189 "\touter base alignment: %d\n\n"
4190 "\touter base misalignment: %d\n"
4191 "\touter offset alignment: %d\n"
4192 "\touter step alignment: %d\n",
4193 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4194 STMT_VINFO_DR_OFFSET (stmt_info),
4195 STMT_VINFO_DR_INIT (stmt_info),
4196 STMT_VINFO_DR_STEP (stmt_info),
4197 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4198 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4199 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4200 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4203 /* Set vectype for STMT. */
4204 scalar_type = TREE_TYPE (DR_REF (dr));
4205 STMT_VINFO_VECTYPE (stmt_info)
4206 = get_vectype_for_scalar_type (scalar_type);
4207 if (!STMT_VINFO_VECTYPE (stmt_info))
4209 if (dump_enabled_p ())
4211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4212 "not vectorized: no vectype for stmt: %G",
4213 stmt_info->stmt);
4214 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4215 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4216 scalar_type);
4217 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4220 if (is_a <bb_vec_info> (vinfo))
4222 /* No vector type is fine, the ref can still participate
4223 in dependence analysis, we just can't vectorize it. */
4224 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4225 continue;
4227 return opt_result::failure_at (stmt_info->stmt,
4228 "not vectorized:"
4229 " no vectype for stmt: %G"
4230 " scalar_type: %T\n",
4231 stmt_info->stmt, scalar_type);
4233 else
4235 if (dump_enabled_p ())
4236 dump_printf_loc (MSG_NOTE, vect_location,
4237 "got vectype for stmt: %G%T\n",
4238 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4241 /* Adjust the minimal vectorization factor according to the
4242 vector type. */
4243 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4244 *min_vf = upper_bound (*min_vf, vf);
4246 if (gatherscatter != SG_NONE)
4248 gather_scatter_info gs_info;
4249 if (!vect_check_gather_scatter (stmt_info,
4250 as_a <loop_vec_info> (vinfo),
4251 &gs_info)
4252 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4253 return opt_result::failure_at
4254 (stmt_info->stmt,
4255 (gatherscatter == GATHER) ?
4256 "not vectorized: not suitable for gather load %G" :
4257 "not vectorized: not suitable for scatter store %G",
4258 stmt_info->stmt);
4259 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4263 /* We used to stop processing and prune the list here. Verify we no
4264 longer need to. */
4265 gcc_assert (i == datarefs.length ());
4267 return opt_result::success ();
4271 /* Function vect_get_new_vect_var.
4273 Returns a name for a new variable. The current naming scheme prepends
4274 a prefix such as "vect_" or "vectp_" (depending on the value of VAR_KIND)
4275 to NAME when it is provided, and otherwise uses the prefix alone for
4276 vectorizer generated variables. */
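/* E.g. (illustrative): VAR_KIND == vect_pointer_var with NAME == "a" yields
   a variable named vectp_a (dumps may add a uniquifying suffix).  */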
4278 tree
4279 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4281 const char *prefix;
4282 tree new_vect_var;
4284 switch (var_kind)
4286 case vect_simple_var:
4287 prefix = "vect";
4288 break;
4289 case vect_scalar_var:
4290 prefix = "stmp";
4291 break;
4292 case vect_mask_var:
4293 prefix = "mask";
4294 break;
4295 case vect_pointer_var:
4296 prefix = "vectp";
4297 break;
4298 default:
4299 gcc_unreachable ();
4302 if (name)
4304 char* tmp = concat (prefix, "_", name, NULL);
4305 new_vect_var = create_tmp_reg (type, tmp);
4306 free (tmp);
4308 else
4309 new_vect_var = create_tmp_reg (type, prefix);
4311 return new_vect_var;
4314 /* Like vect_get_new_vect_var but return an SSA name. */
4316 tree
4317 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4319 const char *prefix;
4320 tree new_vect_var;
4322 switch (var_kind)
4324 case vect_simple_var:
4325 prefix = "vect";
4326 break;
4327 case vect_scalar_var:
4328 prefix = "stmp";
4329 break;
4330 case vect_pointer_var:
4331 prefix = "vectp";
4332 break;
4333 default:
4334 gcc_unreachable ();
4337 if (name)
4339 char* tmp = concat (prefix, "_", name, NULL);
4340 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4341 free (tmp);
4343 else
4344 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4346 return new_vect_var;
4349 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4351 static void
4352 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4354 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4355 int misalign = DR_MISALIGNMENT (dr_info);
4356 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4357 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4358 else
4359 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4360 DR_TARGET_ALIGNMENT (dr_info), misalign);
4363 /* Function vect_create_addr_base_for_vector_ref.
4365 Create an expression that computes the address of the first memory location
4366 that will be accessed for a data reference.
4368 Input:
4369 STMT_INFO: The statement containing the data reference.
4370 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4371 OFFSET: Optional. If supplied, it is added to the initial address.
4372 LOOP: Specify relative to which loop-nest should the address be computed.
4373 For example, when the dataref is in an inner-loop nested in an
4374 outer-loop that is now being vectorized, LOOP can be either the
4375 outer-loop, or the inner-loop. The first memory location accessed
4376 by the following dataref ('in' points to short):
4378 for (i=0; i<N; i++)
4379 for (j=0; j<M; j++)
4380 s += in[i+j]
4382 is as follows:
4383 if LOOP=i_loop: &in (relative to i_loop)
4384 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4385 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4386 initial address. Unlike OFFSET, which is the number of elements to
4387 be added, BYTE_OFFSET is measured in bytes.
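(For illustration with assumed values: with 2-byte elements, OFFSET == 3
advances the address by 6 bytes, whereas BYTE_OFFSET == 3 advances it
by 3 bytes.)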
4389 Output:
4390 1. Return an SSA_NAME whose value is the address of the memory location of
4391 the first vector of the data reference.
4392 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4393 these statement(s) which define the returned SSA_NAME.
4395 FORNOW: We are only handling array accesses with step 1. */
4397 tree
4398 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4399 gimple_seq *new_stmt_list,
4400 tree offset,
4401 tree byte_offset)
4403 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4404 struct data_reference *dr = dr_info->dr;
4405 const char *base_name;
4406 tree addr_base;
4407 tree dest;
4408 gimple_seq seq = NULL;
4409 tree vect_ptr_type;
4410 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4411 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4412 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4414 tree data_ref_base = unshare_expr (drb->base_address);
4415 tree base_offset = unshare_expr (drb->offset);
4416 tree init = unshare_expr (drb->init);
4418 if (loop_vinfo)
4419 base_name = get_name (data_ref_base);
4420 else
4422 base_offset = ssize_int (0);
4423 init = ssize_int (0);
4424 base_name = get_name (DR_REF (dr));
4427 /* Create base_offset */
4428 base_offset = size_binop (PLUS_EXPR,
4429 fold_convert (sizetype, base_offset),
4430 fold_convert (sizetype, init));
4432 if (offset)
4434 offset = fold_build2 (MULT_EXPR, sizetype,
4435 fold_convert (sizetype, offset), step);
4436 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4437 base_offset, offset);
4439 if (byte_offset)
4441 byte_offset = fold_convert (sizetype, byte_offset);
4442 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4443 base_offset, byte_offset);
4446 /* base + base_offset */
4447 if (loop_vinfo)
4448 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4449 else
4451 addr_base = build1 (ADDR_EXPR,
4452 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4453 unshare_expr (DR_REF (dr)));
4456 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4457 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4458 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4459 gimple_seq_add_seq (new_stmt_list, seq);
4461 if (DR_PTR_INFO (dr)
4462 && TREE_CODE (addr_base) == SSA_NAME
4463 && !SSA_NAME_PTR_INFO (addr_base))
4465 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4466 if (offset || byte_offset)
4467 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4473 return addr_base;
4477 /* Function vect_create_data_ref_ptr.
4479 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4480 location accessed in the loop by STMT_INFO, along with the def-use update
4481 chain to appropriately advance the pointer through the loop iterations.
4482 Also set aliasing information for the pointer. This pointer is used by
4483 the callers to this function to create a memory reference expression for
4484 vector load/store access.
4486 Input:
4487 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4488 GIMPLE_ASSIGN <name, data-ref> or
4489 GIMPLE_ASSIGN <data-ref, name>.
4490 2. AGGR_TYPE: the type of the reference, which should be either a vector
4491 or an array.
4492 3. AT_LOOP: the loop where the vector memref is to be created.
4493 4. OFFSET (optional): an offset to be added to the initial address accessed
4494 by the data-ref in STMT_INFO.
4495 5. BSI: location where the new stmts are to be placed if there is no loop
4496 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4497 pointing to the initial address.
4498 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4499 to the initial address accessed by the data-ref in STMT_INFO. This is
4500 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4501 in bytes.
4502 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4503 to the IV during each iteration of the loop. NULL says to move
4504 by one copy of AGGR_TYPE up or down, depending on the step of the
4505 data reference.
4507 Output:
4508 1. Declare a new ptr to vector_type, and have it point to the base of the
4509 data reference (initial address accessed by the data reference).
4510 For example, for vector of type V8HI, the following code is generated:
4512 v8hi *ap;
4513 ap = (v8hi *)initial_address;
4515 if OFFSET is not supplied:
4516 initial_address = &a[init];
4517 if OFFSET is supplied:
4518 initial_address = &a[init + OFFSET];
4519 if BYTE_OFFSET is supplied:
4520 initial_address = &a[init] + BYTE_OFFSET;
4522 Return the initial_address in INITIAL_ADDRESS.
4524 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4525 update the pointer in each iteration of the loop.
4527 Return the increment stmt that updates the pointer in PTR_INCR.
4529 3. Return the pointer. */
4531 tree
4532 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4533 struct loop *at_loop, tree offset,
4534 tree *initial_address, gimple_stmt_iterator *gsi,
4535 gimple **ptr_incr, bool only_init,
4536 tree byte_offset, tree iv_step)
4538 const char *base_name;
4539 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4540 struct loop *loop = NULL;
4541 bool nested_in_vect_loop = false;
4542 struct loop *containing_loop = NULL;
4543 tree aggr_ptr_type;
4544 tree aggr_ptr;
4545 tree new_temp;
4546 gimple_seq new_stmt_list = NULL;
4547 edge pe = NULL;
4548 basic_block new_bb;
4549 tree aggr_ptr_init;
4550 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4551 struct data_reference *dr = dr_info->dr;
4552 tree aptr;
4553 gimple_stmt_iterator incr_gsi;
4554 bool insert_after;
4555 tree indx_before_incr, indx_after_incr;
4556 gimple *incr;
4557 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4559 gcc_assert (iv_step != NULL_TREE
4560 || TREE_CODE (aggr_type) == ARRAY_TYPE
4561 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4563 if (loop_vinfo)
4565 loop = LOOP_VINFO_LOOP (loop_vinfo);
4566 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4567 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4568 pe = loop_preheader_edge (loop);
4570 else
4572 gcc_assert (bb_vinfo);
4573 only_init = true;
4574 *ptr_incr = NULL;
4577 /* Create an expression for the first address accessed by this load
4578 in LOOP. */
4579 base_name = get_name (DR_BASE_ADDRESS (dr));
4581 if (dump_enabled_p ())
4583 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4584 dump_printf_loc (MSG_NOTE, vect_location,
4585 "create %s-pointer variable to type: %T",
4586 get_tree_code_name (TREE_CODE (aggr_type)),
4587 aggr_type);
4588 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4589 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4590 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4591 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4592 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4593 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4594 else
4595 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4596 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4599 /* (1) Create the new aggregate-pointer variable.
4600 Vector and array types inherit the alias set of their component
4601 type by default so we need to use a ref-all pointer if the data
4602 reference does not conflict with the created aggregated data
4603 reference because it is not addressable. */
4604 bool need_ref_all = false;
4605 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4606 get_alias_set (DR_REF (dr))))
4607 need_ref_all = true;
4608 /* Likewise for any of the data references in the stmt group. */
4609 else if (DR_GROUP_SIZE (stmt_info) > 1)
4611 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4614 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4615 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4616 get_alias_set (DR_REF (sdr))))
4618 need_ref_all = true;
4619 break;
4621 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4623 while (sinfo);
4625 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4626 need_ref_all);
4627 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4630 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4631 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4632 def-use update cycles for the pointer: one relative to the outer-loop
4633 (LOOP), which is what steps (3) and (4) below do. The other is relative
4634 to the inner-loop (which is the inner-most loop containing the dataref),
4635 and this is done by step (5) below.
4637 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4638 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4639 redundant. Steps (3),(4) create the following:
4641 vp0 = &base_addr;
4642 LOOP: vp1 = phi(vp0,vp2)
4645 vp2 = vp1 + step
4646 goto LOOP
4648 If there is an inner-loop nested in loop, then step (5) will also be
4649 applied, and an additional update in the inner-loop will be created:
4651 vp0 = &base_addr;
4652 LOOP: vp1 = phi(vp0,vp2)
4654 inner: vp3 = phi(vp1,vp4)
4655 vp4 = vp3 + inner_step
4656 if () goto inner
4658 vp2 = vp1 + step
4659 if () goto LOOP */
4661 /* (2) Calculate the initial address of the aggregate-pointer, and set
4662 the aggregate-pointer to point to it before the loop. */
4664 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4666 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4667 offset, byte_offset);
4668 if (new_stmt_list)
4670 if (pe)
4672 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4673 gcc_assert (!new_bb);
4675 else
4676 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4679 *initial_address = new_temp;
4680 aggr_ptr_init = new_temp;
4682 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4683 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4684 inner-loop nested in LOOP (during outer-loop vectorization). */
4686 /* No update in loop is required. */
4687 if (only_init && (!loop_vinfo || at_loop == loop))
4688 aptr = aggr_ptr_init;
4689 else
4691 /* Accesses to invariant addresses should be handled specially
4692 by the caller. */
4693 tree step = vect_dr_behavior (dr_info)->step;
4694 gcc_assert (!integer_zerop (step));
4696 if (iv_step == NULL_TREE)
4698 /* The step of the aggregate pointer is the type size,
4699 negated for downward accesses. */
4700 iv_step = TYPE_SIZE_UNIT (aggr_type);
4701 if (tree_int_cst_sgn (step) == -1)
4702 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4705 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4707 create_iv (aggr_ptr_init,
4708 fold_convert (aggr_ptr_type, iv_step),
4709 aggr_ptr, loop, &incr_gsi, insert_after,
4710 &indx_before_incr, &indx_after_incr);
4711 incr = gsi_stmt (incr_gsi);
4712 loop_vinfo->add_stmt (incr);
4714 /* Copy the points-to information if it exists. */
4715 if (DR_PTR_INFO (dr))
4717 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4718 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4720 if (ptr_incr)
4721 *ptr_incr = incr;
4723 aptr = indx_before_incr;
4726 if (!nested_in_vect_loop || only_init)
4727 return aptr;
4730 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4731 nested in LOOP, if it exists. */
4733 gcc_assert (nested_in_vect_loop);
4734 if (!only_init)
4736 standard_iv_increment_position (containing_loop, &incr_gsi,
4737 &insert_after);
4738 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4739 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4740 &indx_after_incr);
4741 incr = gsi_stmt (incr_gsi);
4742 loop_vinfo->add_stmt (incr);
4744 /* Copy the points-to information if it exists. */
4745 if (DR_PTR_INFO (dr))
4747 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4748 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4750 if (ptr_incr)
4751 *ptr_incr = incr;
4753 return indx_before_incr;
4755 else
4756 gcc_unreachable ();
4760 /* Function bump_vector_ptr
4762 Increment a pointer (to a vector type) by vector-size. If requested,
4763 i.e. if PTR_INCR is given, then also connect the new increment stmt
4764 to the existing def-use update-chain of the pointer, by modifying
4765 the PTR_INCR as illustrated below:
4767 The pointer def-use update-chain before this function:
4768 DATAREF_PTR = phi (p_0, p_2)
4769 ....
4770 PTR_INCR: p_2 = DATAREF_PTR + step
4772 The pointer def-use update-chain after this function:
4773 DATAREF_PTR = phi (p_0, p_2)
4774 ....
4775 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4776 ....
4777 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4779 Input:
4780 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4781 in the loop.
4782 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4783 the loop. The increment amount across iterations is expected
4784 to be vector_size.
4785 BSI - location where the new update stmt is to be placed.
4786 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4787 BUMP - optional. The offset by which to bump the pointer. If not given,
4788 the offset is assumed to be vector_size.
4790 Output: Return NEW_DATAREF_PTR as illustrated above.
4794 tree
4795 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4796 stmt_vec_info stmt_info, tree bump)
4798 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4799 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4800 tree update = TYPE_SIZE_UNIT (vectype);
4801 gassign *incr_stmt;
4802 ssa_op_iter iter;
4803 use_operand_p use_p;
4804 tree new_dataref_ptr;
4806 if (bump)
4807 update = bump;
4809 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4810 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4811 else
4812 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4813 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4814 dataref_ptr, update);
4815 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4817 /* Copy the points-to information if it exists. */
4818 if (DR_PTR_INFO (dr))
4820 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4821 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4824 if (!ptr_incr)
4825 return new_dataref_ptr;
4827 /* Update the vector-pointer's cross-iteration increment. */
4828 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4830 tree use = USE_FROM_PTR (use_p);
4832 if (use == dataref_ptr)
4833 SET_USE (use_p, new_dataref_ptr);
4834 else
4835 gcc_assert (operand_equal_p (use, update, 0));
4838 return new_dataref_ptr;
4842 /* Copy memory reference info such as base/clique from the SRC reference
4843 to the DEST MEM_REF. */
4845 void
4846 vect_copy_ref_info (tree dest, tree src)
4848 if (TREE_CODE (dest) != MEM_REF)
4849 return;
4851 tree src_base = src;
4852 while (handled_component_p (src_base))
4853 src_base = TREE_OPERAND (src_base, 0);
4854 if (TREE_CODE (src_base) != MEM_REF
4855 && TREE_CODE (src_base) != TARGET_MEM_REF)
4856 return;
4858 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4859 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4863 /* Function vect_create_destination_var.
4865 Create a new temporary of type VECTYPE. */
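/* For example (hypothetical SSA name): a scalar destination sum_23 with a
   non-boolean VECTYPE yields a vect_simple_var temporary named vect_sum_23.  */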
4867 tree
4868 vect_create_destination_var (tree scalar_dest, tree vectype)
4870 tree vec_dest;
4871 const char *name;
4872 char *new_name;
4873 tree type;
4874 enum vect_var_kind kind;
4876 kind = vectype
4877 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4878 ? vect_mask_var
4879 : vect_simple_var
4880 : vect_scalar_var;
4881 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4883 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4885 name = get_name (scalar_dest);
4886 if (name)
4887 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4888 else
4889 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4890 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4891 free (new_name);
4893 return vec_dest;
4896 /* Function vect_grouped_store_supported.
4898 Returns TRUE if interleave high and interleave low permutations
4899 are supported, and FALSE otherwise. */
4901 bool
4902 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4904 machine_mode mode = TYPE_MODE (vectype);
4906 /* vect_permute_store_chain requires the group size to be equal to 3 or
4907 be a power of two. */
4908 if (count != 3 && exact_log2 (count) == -1)
4910 if (dump_enabled_p ())
4911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4912 "the size of the group of accesses"
4913 " is not a power of 2 or not equal to 3\n");
4914 return false;
4917 /* Check that the permutation is supported. */
4918 if (VECTOR_MODE_P (mode))
4920 unsigned int i;
4921 if (count == 3)
4923 unsigned int j0 = 0, j1 = 0, j2 = 0;
4924 unsigned int i, j;
4926 unsigned int nelt;
4927 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4929 if (dump_enabled_p ())
4930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4931 "cannot handle groups of 3 stores for"
4932 " variable-length vectors\n");
4933 return false;
4936 vec_perm_builder sel (nelt, nelt, 1);
4937 sel.quick_grow (nelt);
4938 vec_perm_indices indices;
4939 for (j = 0; j < 3; j++)
4941 int nelt0 = ((3 - j) * nelt) % 3;
4942 int nelt1 = ((3 - j) * nelt + 1) % 3;
4943 int nelt2 = ((3 - j) * nelt + 2) % 3;
4944 for (i = 0; i < nelt; i++)
4946 if (3 * i + nelt0 < nelt)
4947 sel[3 * i + nelt0] = j0++;
4948 if (3 * i + nelt1 < nelt)
4949 sel[3 * i + nelt1] = nelt + j1++;
4950 if (3 * i + nelt2 < nelt)
4951 sel[3 * i + nelt2] = 0;
4953 indices.new_vector (sel, 2, nelt);
4954 if (!can_vec_perm_const_p (mode, indices))
4956 if (dump_enabled_p ())
4957 dump_printf (MSG_MISSED_OPTIMIZATION,
4958 "permutation op not supported by target.\n");
4959 return false;
4962 for (i = 0; i < nelt; i++)
4964 if (3 * i + nelt0 < nelt)
4965 sel[3 * i + nelt0] = 3 * i + nelt0;
4966 if (3 * i + nelt1 < nelt)
4967 sel[3 * i + nelt1] = 3 * i + nelt1;
4968 if (3 * i + nelt2 < nelt)
4969 sel[3 * i + nelt2] = nelt + j2++;
4971 indices.new_vector (sel, 2, nelt);
4972 if (!can_vec_perm_const_p (mode, indices))
4974 if (dump_enabled_p ())
4975 dump_printf (MSG_MISSED_OPTIMIZATION,
4976 "permutation op not supported by target.\n");
4977 return false;
4980 return true;
4982 else
4984 /* If length is not equal to 3 then only power of 2 is supported. */
4985 gcc_assert (pow2p_hwi (count));
4986 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4988 /* The encoding has 2 interleaved stepped patterns. */
4989 vec_perm_builder sel (nelt, 2, 3);
4990 sel.quick_grow (6);
4991 for (i = 0; i < 3; i++)
4993 sel[i * 2] = i;
4994 sel[i * 2 + 1] = i + nelt;
4996 vec_perm_indices indices (sel, 2, nelt);
4997 if (can_vec_perm_const_p (mode, indices))
4999 for (i = 0; i < 6; i++)
5000 sel[i] += exact_div (nelt, 2);
5001 indices.new_vector (sel, 2, nelt);
5002 if (can_vec_perm_const_p (mode, indices))
5003 return true;
5008 if (dump_enabled_p ())
5009 dump_printf (MSG_MISSED_OPTIMIZATION,
5010 "permutation op not supported by target.\n");
5011 return false;
5015 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5016 type VECTYPE. MASKED_P says whether the masked form is needed. */
5018 bool
5019 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5020 bool masked_p)
5022 if (masked_p)
5023 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5024 vec_mask_store_lanes_optab,
5025 vectype, count);
5026 else
5027 return vect_lanes_optab_supported_p ("vec_store_lanes",
5028 vec_store_lanes_optab,
5029 vectype, count);
5033 /* Function vect_permute_store_chain.
5035 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5036 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5037 the data correctly for the stores. Return the final references for stores
5038 in RESULT_CHAIN.
5040 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5041 The input is 4 vectors each containing 8 elements. We assign a number to
5042 each element, the input sequence is:
5044 1st vec: 0 1 2 3 4 5 6 7
5045 2nd vec: 8 9 10 11 12 13 14 15
5046 3rd vec: 16 17 18 19 20 21 22 23
5047 4th vec: 24 25 26 27 28 29 30 31
5049 The output sequence should be:
5051 1st vec: 0 8 16 24 1 9 17 25
5052 2nd vec: 2 10 18 26 3 11 19 27
5053 3rd vec: 4 12 20 28 5 13 21 29
5054 4th vec: 6 14 22 30 7 15 23 31
5056 i.e., we interleave the contents of the four vectors in their order.
5058 We use interleave_high/low instructions to create such output. The input of
5059 each interleave_high/low operation is two vectors:
5060 1st vec 2nd vec
5061 0 1 2 3 4 5 6 7
5062 the even elements of the result vector are obtained left-to-right from the
5063 high/low elements of the first vector. The odd elements of the result are
5064 obtained left-to-right from the high/low elements of the second vector.
5065 The output of interleave_high will be: 0 4 1 5
5066 and of interleave_low: 2 6 3 7
5069 The permutation is done in log LENGTH stages. In each stage interleave_high
5070 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5071 where the first argument is taken from the first half of DR_CHAIN and the
5072 second argument from its second half.
5073 In our example,
5075 I1: interleave_high (1st vec, 3rd vec)
5076 I2: interleave_low (1st vec, 3rd vec)
5077 I3: interleave_high (2nd vec, 4th vec)
5078 I4: interleave_low (2nd vec, 4th vec)
5080 The output for the first stage is:
5082 I1: 0 16 1 17 2 18 3 19
5083 I2: 4 20 5 21 6 22 7 23
5084 I3: 8 24 9 25 10 26 11 27
5085 I4: 12 28 13 29 14 30 15 31
5087 The output of the second stage, i.e. the final result is:
5089 I1: 0 8 16 24 1 9 17 25
5090 I2: 2 10 18 26 3 11 19 27
5091 I3: 4 12 20 28 5 13 21 29
5092 I4: 6 14 22 30 7 15 23 31. */
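/* As a concrete sketch of the selectors used below for the power-of-two
   case, assuming V8HI (nelt == 8): the interleave-high mask is
   { 0, 8, 1, 9, 2, 10, 3, 11 } and the interleave-low mask is
   { 4, 12, 5, 13, 6, 14, 7, 15 }.  */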
5094 void
5095 vect_permute_store_chain (vec<tree> dr_chain,
5096 unsigned int length,
5097 stmt_vec_info stmt_info,
5098 gimple_stmt_iterator *gsi,
5099 vec<tree> *result_chain)
5101 tree vect1, vect2, high, low;
5102 gimple *perm_stmt;
5103 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5104 tree perm_mask_low, perm_mask_high;
5105 tree data_ref;
5106 tree perm3_mask_low, perm3_mask_high;
5107 unsigned int i, j, n, log_length = exact_log2 (length);
5109 result_chain->quick_grow (length);
5110 memcpy (result_chain->address (), dr_chain.address (),
5111 length * sizeof (tree));
5113 if (length == 3)
5115 /* vect_grouped_store_supported ensures that this is constant. */
5116 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5117 unsigned int j0 = 0, j1 = 0, j2 = 0;
5119 vec_perm_builder sel (nelt, nelt, 1);
5120 sel.quick_grow (nelt);
5121 vec_perm_indices indices;
5122 for (j = 0; j < 3; j++)
5124 int nelt0 = ((3 - j) * nelt) % 3;
5125 int nelt1 = ((3 - j) * nelt + 1) % 3;
5126 int nelt2 = ((3 - j) * nelt + 2) % 3;
5128 for (i = 0; i < nelt; i++)
5130 if (3 * i + nelt0 < nelt)
5131 sel[3 * i + nelt0] = j0++;
5132 if (3 * i + nelt1 < nelt)
5133 sel[3 * i + nelt1] = nelt + j1++;
5134 if (3 * i + nelt2 < nelt)
5135 sel[3 * i + nelt2] = 0;
5137 indices.new_vector (sel, 2, nelt);
5138 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5140 for (i = 0; i < nelt; i++)
5142 if (3 * i + nelt0 < nelt)
5143 sel[3 * i + nelt0] = 3 * i + nelt0;
5144 if (3 * i + nelt1 < nelt)
5145 sel[3 * i + nelt1] = 3 * i + nelt1;
5146 if (3 * i + nelt2 < nelt)
5147 sel[3 * i + nelt2] = nelt + j2++;
5149 indices.new_vector (sel, 2, nelt);
5150 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5152 vect1 = dr_chain[0];
5153 vect2 = dr_chain[1];
5155 /* Create interleaving stmt:
5156 low = VEC_PERM_EXPR <vect1, vect2,
5157 {j, nelt, *, j + 1, nelt + j + 1, *,
5158 j + 2, nelt + j + 2, *, ...}> */
5159 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5160 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5161 vect2, perm3_mask_low);
5162 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5164 vect1 = data_ref;
5165 vect2 = dr_chain[2];
5166 /* Create interleaving stmt:
5167 low = VEC_PERM_EXPR <vect1, vect2,
5168 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5169 6, 7, nelt + j + 2, ...}> */
5170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5172 vect2, perm3_mask_high);
5173 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5174 (*result_chain)[j] = data_ref;
5177 else
5179 /* If length is not equal to 3 then only power of 2 is supported. */
5180 gcc_assert (pow2p_hwi (length));
5182 /* The encoding has 2 interleaved stepped patterns. */
5183 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5184 vec_perm_builder sel (nelt, 2, 3);
5185 sel.quick_grow (6);
5186 for (i = 0; i < 3; i++)
5188 sel[i * 2] = i;
5189 sel[i * 2 + 1] = i + nelt;
5191 vec_perm_indices indices (sel, 2, nelt);
5192 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5194 for (i = 0; i < 6; i++)
5195 sel[i] += exact_div (nelt, 2);
5196 indices.new_vector (sel, 2, nelt);
5197 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5199 for (i = 0, n = log_length; i < n; i++)
5201 for (j = 0; j < length/2; j++)
5203 vect1 = dr_chain[j];
5204 vect2 = dr_chain[j+length/2];
5206 /* Create interleaving stmt:
5207 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5208 ...}> */
5209 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5210 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5211 vect2, perm_mask_high);
5212 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5213 (*result_chain)[2*j] = high;
5215 /* Create interleaving stmt:
5216 low = VEC_PERM_EXPR <vect1, vect2,
5217 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5218 ...}> */
5219 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5220 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5221 vect2, perm_mask_low);
5222 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5223 (*result_chain)[2*j+1] = low;
5225 memcpy (dr_chain.address (), result_chain->address (),
5226 length * sizeof (tree));
5231 /* Function vect_setup_realignment
5233 This function is called when vectorizing an unaligned load using
5234 the dr_explicit_realign[_optimized] scheme.
5235 This function generates the following code at the loop prolog:
5237 p = initial_addr;
5238 x msq_init = *(floor(p)); # prolog load
5239 realignment_token = call target_builtin;
5240 loop:
5241 x msq = phi (msq_init, ---)
5243 The stmts marked with x are generated only for the case of
5244 dr_explicit_realign_optimized.
5246 The code above sets up a new (vector) pointer, pointing to the first
5247 location accessed by STMT_INFO, and a "floor-aligned" load using that
5248 pointer. It also generates code to compute the "realignment-token"
5249 (if the relevant target hook was defined), and creates a phi-node at the
5250 loop-header bb whose arguments are the result of the prolog-load (created
5251 by this function) and the result of a load that takes place in the loop
5252 (to be created by the caller to this function).
5254 For the case of dr_explicit_realign_optimized:
5255 The caller to this function uses the phi-result (msq) to create the
5256 realignment code inside the loop, and sets up the missing phi argument,
5257 as follows:
5258 loop:
5259 msq = phi (msq_init, lsq)
5260 lsq = *(floor(p')); # load in loop
5261 result = realign_load (msq, lsq, realignment_token);
5263 For the case of dr_explicit_realign:
5264 loop:
5265 msq = *(floor(p)); # load in loop
5266 p' = p + (VS-1);
5267 lsq = *(floor(p')); # load in loop
5268 result = realign_load (msq, lsq, realignment_token);
5270 Input:
5271 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5272 a memory location that may be unaligned.
5273 BSI - place where new code is to be inserted.
5274 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5275 is used.
5277 Output:
5278 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5279 target hook, if defined.
5280 Return value - the result of the loop-header phi node. */
5282 tree
5283 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5284 tree *realignment_token,
5285 enum dr_alignment_support alignment_support_scheme,
5286 tree init_addr,
5287 struct loop **at_loop)
5289 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5291 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5292 struct data_reference *dr = dr_info->dr;
5293 struct loop *loop = NULL;
5294 edge pe = NULL;
5295 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5296 tree vec_dest;
5297 gimple *inc;
5298 tree ptr;
5299 tree data_ref;
5300 basic_block new_bb;
5301 tree msq_init = NULL_TREE;
5302 tree new_temp;
5303 gphi *phi_stmt;
5304 tree msq = NULL_TREE;
5305 gimple_seq stmts = NULL;
5306 bool compute_in_loop = false;
5307 bool nested_in_vect_loop = false;
5308 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5309 struct loop *loop_for_initial_load = NULL;
5311 if (loop_vinfo)
5313 loop = LOOP_VINFO_LOOP (loop_vinfo);
5314 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5317 gcc_assert (alignment_support_scheme == dr_explicit_realign
5318 || alignment_support_scheme == dr_explicit_realign_optimized);
5320 /* We need to generate three things:
5321 1. the misalignment computation
5322 2. the extra vector load (for the optimized realignment scheme).
5323 3. the phi node for the two vectors from which the realignment is
5324 done (for the optimized realignment scheme). */
5326 /* 1. Determine where to generate the misalignment computation.
5328 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5329 calculation will be generated by this function, outside the loop (in the
5330 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5331 caller, inside the loop.
5333 Background: If the misalignment remains fixed throughout the iterations of
5334 the loop, then both realignment schemes are applicable, and also the
5335 misalignment computation can be done outside LOOP. This is because we are
5336 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5337 are a multiple of VS (the Vector Size), and therefore the misalignment in
5338 different vectorized LOOP iterations is always the same.
5339 The problem arises only if the memory access is in an inner-loop nested
5340 inside LOOP, which is now being vectorized using outer-loop vectorization.
5341 This is the only case when the misalignment of the memory access may not
5342 remain fixed throughout the iterations of the inner-loop (as explained in
5343 detail in vect_supportable_dr_alignment). In this case, not only is the
5344 optimized realignment scheme not applicable, but also the misalignment
5345 computation (and generation of the realignment token that is passed to
5346 REALIGN_LOAD) has to be done inside the loop.
5348 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5349 or not, which in turn determines if the misalignment is computed inside
5350 the inner-loop, or outside LOOP. */
5352 if (init_addr != NULL_TREE || !loop_vinfo)
5354 compute_in_loop = true;
5355 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5359 /* 2. Determine where to generate the extra vector load.
5361 For the optimized realignment scheme, instead of generating two vector
5362 loads in each iteration, we generate a single extra vector load in the
5363 preheader of the loop, and in each iteration reuse the result of the
5364 vector load from the previous iteration. In case the memory access is in
5365 an inner-loop nested inside LOOP, which is now being vectorized using
5366 outer-loop vectorization, we need to determine whether this initial vector
5367 load should be generated at the preheader of the inner-loop, or can be
5368 generated at the preheader of LOOP. If the memory access has no evolution
5369 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5370 to be generated inside LOOP (in the preheader of the inner-loop). */
5372 if (nested_in_vect_loop)
5374 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5375 bool invariant_in_outerloop =
5376 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5377 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5379 else
5380 loop_for_initial_load = loop;
5381 if (at_loop)
5382 *at_loop = loop_for_initial_load;
5384 if (loop_for_initial_load)
5385 pe = loop_preheader_edge (loop_for_initial_load);
5387 /* 3. For the case of the optimized realignment, create the first vector
5388 load at the loop preheader. */
5390 if (alignment_support_scheme == dr_explicit_realign_optimized)
5392 /* Create msq_init = *(floor(p1)) in the loop preheader */
5393 gassign *new_stmt;
5395 gcc_assert (!compute_in_loop);
5396 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5397 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5398 loop_for_initial_load, NULL_TREE,
5399 &init_addr, NULL, &inc, true);
5400 if (TREE_CODE (ptr) == SSA_NAME)
5401 new_temp = copy_ssa_name (ptr);
5402 else
5403 new_temp = make_ssa_name (TREE_TYPE (ptr));
5404 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5405 new_stmt = gimple_build_assign
5406 (new_temp, BIT_AND_EXPR, ptr,
5407 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5408 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5409 gcc_assert (!new_bb);
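/* The BIT_AND_EXPR emitted above computes floor(p): with a target
   alignment of, say, 16 bytes it is equivalent to the C expression
   (uintptr_t) ptr & -(uintptr_t) 16, which clears the four low address
   bits.  (Illustrative sketch only; the actual alignment comes from
   DR_TARGET_ALIGNMENT.)  */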
5410 data_ref
5411 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5412 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5413 vect_copy_ref_info (data_ref, DR_REF (dr));
5414 new_stmt = gimple_build_assign (vec_dest, data_ref);
5415 new_temp = make_ssa_name (vec_dest, new_stmt);
5416 gimple_assign_set_lhs (new_stmt, new_temp);
5417 if (pe)
5419 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5420 gcc_assert (!new_bb);
5422 else
5423 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5425 msq_init = gimple_assign_lhs (new_stmt);
5428 /* 4. Create realignment token using a target builtin, if available.
5429 It is done either inside the containing loop, or before LOOP (as
5430 determined above). */
5432 if (targetm.vectorize.builtin_mask_for_load)
5434 gcall *new_stmt;
5435 tree builtin_decl;
5437 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5438 if (!init_addr)
5440 /* Generate the INIT_ADDR computation outside LOOP. */
5441 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5442 NULL_TREE);
5443 if (loop)
5445 pe = loop_preheader_edge (loop);
5446 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5447 gcc_assert (!new_bb);
5449 else
5450 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5453 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5454 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5455 vec_dest =
5456 vect_create_destination_var (scalar_dest,
5457 gimple_call_return_type (new_stmt));
5458 new_temp = make_ssa_name (vec_dest, new_stmt);
5459 gimple_call_set_lhs (new_stmt, new_temp);
5461 if (compute_in_loop)
5462 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5463 else
5465 /* Generate the misalignment computation outside LOOP. */
5466 pe = loop_preheader_edge (loop);
5467 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5468 gcc_assert (!new_bb);
5471 *realignment_token = gimple_call_lhs (new_stmt);
5473 /* The result of the CALL_EXPR to this builtin is determined from
5474 the value of the parameter, and no global variables are touched,
5475 which makes the builtin a "const" function. Requiring the
5476 builtin to have the "const" attribute makes it unnecessary
5477 to call mark_call_clobbered. */
5478 gcc_assert (TREE_READONLY (builtin_decl));
5481 if (alignment_support_scheme == dr_explicit_realign)
5482 return msq;
5484 gcc_assert (!compute_in_loop);
5485 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5488 /* 5. Create msq = phi <msq_init, lsq> in loop */
5490 pe = loop_preheader_edge (containing_loop);
5491 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5492 msq = make_ssa_name (vec_dest);
5493 phi_stmt = create_phi_node (msq, containing_loop->header);
5494 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5496 return msq;
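/* Illustrative sketch of how the caller consumes the PHI result: the
   REALIGN_LOAD emitted by the caller behaves, conceptually, like

     for (i = 0; i < n; i++)
       result[i] = (i + ofs < n) ? msq[i + ofs] : lsq[i + ofs - n];

   where OFS is the element misalignment encoded by REALIGNMENT_TOKEN and
   N the number of vector elements.  The exact semantics are target
   defined (see the vec_realign_load optab).  */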
5500 /* Function vect_grouped_load_supported.
5502 COUNT is the size of the load group (the number of statements plus the
5503 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5504 only one statement, with a gap of COUNT - 1.
5506 Returns true if a suitable permute exists. */
5508 bool
5509 vect_grouped_load_supported (tree vectype, bool single_element_p,
5510 unsigned HOST_WIDE_INT count)
5512 machine_mode mode = TYPE_MODE (vectype);
5514 /* If this is single-element interleaving with an element distance
5515 that leaves unused vector loads around, punt - we would at least create
5516 very sub-optimal code in that case (and blow up memory,
5517 see PR65518). */
5518 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "single-element interleaving not supported "
5523 "for not adjacent vector loads\n");
5524 return false;
5527 /* vect_permute_load_chain requires the group size to be equal to 3 or
5528 be a power of two. */
5529 if (count != 3 && exact_log2 (count) == -1)
5531 if (dump_enabled_p ())
5532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5533 "the size of the group of accesses"
5534 " is not a power of 2 or not equal to 3\n");
5535 return false;
5538 /* Check that the permutation is supported. */
5539 if (VECTOR_MODE_P (mode))
5541 unsigned int i, j;
5542 if (count == 3)
5544 unsigned int nelt;
5545 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5547 if (dump_enabled_p ())
5548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5549 "cannot handle groups of 3 loads for"
5550 " variable-length vectors\n");
5551 return false;
5554 vec_perm_builder sel (nelt, nelt, 1);
5555 sel.quick_grow (nelt);
5556 vec_perm_indices indices;
5557 unsigned int k;
5558 for (k = 0; k < 3; k++)
5560 for (i = 0; i < nelt; i++)
5561 if (3 * i + k < 2 * nelt)
5562 sel[i] = 3 * i + k;
5563 else
5564 sel[i] = 0;
5565 indices.new_vector (sel, 2, nelt);
5566 if (!can_vec_perm_const_p (mode, indices))
5568 if (dump_enabled_p ())
5569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5570 "shuffle of 3 loads is not supported by"
5571 " target\n");
5572 return false;
5574 for (i = 0, j = 0; i < nelt; i++)
5575 if (3 * i + k < 2 * nelt)
5576 sel[i] = i;
5577 else
5578 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5579 indices.new_vector (sel, 2, nelt);
5580 if (!can_vec_perm_const_p (mode, indices))
5582 if (dump_enabled_p ())
5583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5584 "shuffle of 3 loads is not supported by"
5585 " target\n");
5586 return false;
5589 return true;
5591 else
5593 /* If length is not equal to 3 then only a power of 2 is supported. */
5594 gcc_assert (pow2p_hwi (count));
5595 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5597 /* The encoding has a single stepped pattern. */
5598 vec_perm_builder sel (nelt, 1, 3);
5599 sel.quick_grow (3);
5600 for (i = 0; i < 3; i++)
5601 sel[i] = i * 2;
5602 vec_perm_indices indices (sel, 2, nelt);
5603 if (can_vec_perm_const_p (mode, indices))
5605 for (i = 0; i < 3; i++)
5606 sel[i] = i * 2 + 1;
5607 indices.new_vector (sel, 2, nelt);
5608 if (can_vec_perm_const_p (mode, indices))
5609 return true;
5614 if (dump_enabled_p ())
5615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5616 "extract even/odd not supported by target\n");
5617 return false;
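/* Worked example (illustration only): for nelt == 8 and k == 0 the two
   masks checked above for the 3-load case are

     {0, 3, 6, 9, 12, 15, 0, 0}   and   {0, 1, 2, 3, 4, 5, 10, 13}

   Applying the first to {v0, v1} and then the second to that result and
   v2 yields scalar elements {0, 3, 6, 9, 12, 15, 18, 21}, i.e. every
   third element starting at 0.  */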
5620 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5621 type VECTYPE. MASKED_P says whether the masked form is needed. */
5623 bool
5624 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5625 bool masked_p)
5627 if (masked_p)
5628 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5629 vec_mask_load_lanes_optab,
5630 vectype, count);
5631 else
5632 return vect_lanes_optab_supported_p ("vec_load_lanes",
5633 vec_load_lanes_optab,
5634 vectype, count);
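/* For illustration: a load-lanes instruction (for example AArch64's LD3
   when COUNT == 3) reads COUNT consecutive vectors of interleaved data
   and de-interleaves them in a single operation, so none of the
   permutations generated by vect_permute_load_chain below are needed.  */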
5637 /* Function vect_permute_load_chain.
5639 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5640 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5641 the input data correctly. Return the final references for loads in
5642 RESULT_CHAIN.
5644 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5645 The input is 4 vectors each containing 8 elements. We assign a number to each
5646 element; the input sequence is:
5648 1st vec: 0 1 2 3 4 5 6 7
5649 2nd vec: 8 9 10 11 12 13 14 15
5650 3rd vec: 16 17 18 19 20 21 22 23
5651 4th vec: 24 25 26 27 28 29 30 31
5653 The output sequence should be:
5655 1st vec: 0 4 8 12 16 20 24 28
5656 2nd vec: 1 5 9 13 17 21 25 29
5657 3rd vec: 2 6 10 14 18 22 26 30
5658 4th vec: 3 7 11 15 19 23 27 31
5660 i.e., the first output vector should contain the first elements of each
5661 interleaving group, etc.
5663 We use extract_even/odd instructions to create such output. The input of
5664 each extract_even/odd operation is two vectors
5665 1st vec 2nd vec
5666 0 1 2 3 4 5 6 7
5668 and the output is the vector of extracted even/odd elements. The output of
5669 extract_even will be: 0 2 4 6
5670 and of extract_odd: 1 3 5 7
5673 The permutation is done in log LENGTH stages. In each stage extract_even
5674 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5675 their order. In our example,
5677 E1: extract_even (1st vec, 2nd vec)
5678 E2: extract_odd (1st vec, 2nd vec)
5679 E3: extract_even (3rd vec, 4th vec)
5680 E4: extract_odd (3rd vec, 4th vec)
5682 The output for the first stage will be:
5684 E1: 0 2 4 6 8 10 12 14
5685 E2: 1 3 5 7 9 11 13 15
5686 E3: 16 18 20 22 24 26 28 30
5687 E4: 17 19 21 23 25 27 29 31
5689 In order to proceed and create the correct sequence for the next stage (or
5690 for the correct output, if the second stage is the last one, as in our
5691 example), we first put the output of extract_even operation and then the
5692 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5693 The input for the second stage is:
5695 1st vec (E1): 0 2 4 6 8 10 12 14
5696 2nd vec (E3): 16 18 20 22 24 26 28 30
5697 3rd vec (E2): 1 3 5 7 9 11 13 15
5698 4th vec (E4): 17 19 21 23 25 27 29 31
5700 The output of the second stage:
5702 E1: 0 4 8 12 16 20 24 28
5703 E2: 2 6 10 14 18 22 26 30
5704 E3: 1 5 9 13 17 21 25 29
5705 E4: 3 7 11 15 19 23 27 31
5707 And RESULT_CHAIN after reordering:
5709 1st vec (E1): 0 4 8 12 16 20 24 28
5710 2nd vec (E3): 1 5 9 13 17 21 25 29
5711 3rd vec (E2): 2 6 10 14 18 22 26 30
5712 4th vec (E4): 3 7 11 15 19 23 27 31. */
5714 static void
5715 vect_permute_load_chain (vec<tree> dr_chain,
5716 unsigned int length,
5717 stmt_vec_info stmt_info,
5718 gimple_stmt_iterator *gsi,
5719 vec<tree> *result_chain)
5721 tree data_ref, first_vect, second_vect;
5722 tree perm_mask_even, perm_mask_odd;
5723 tree perm3_mask_low, perm3_mask_high;
5724 gimple *perm_stmt;
5725 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5726 unsigned int i, j, log_length = exact_log2 (length);
5728 result_chain->quick_grow (length);
5729 memcpy (result_chain->address (), dr_chain.address (),
5730 length * sizeof (tree));
5732 if (length == 3)
5734 /* vect_grouped_load_supported ensures that this is constant. */
5735 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5736 unsigned int k;
5738 vec_perm_builder sel (nelt, nelt, 1);
5739 sel.quick_grow (nelt);
5740 vec_perm_indices indices;
5741 for (k = 0; k < 3; k++)
5743 for (i = 0; i < nelt; i++)
5744 if (3 * i + k < 2 * nelt)
5745 sel[i] = 3 * i + k;
5746 else
5747 sel[i] = 0;
5748 indices.new_vector (sel, 2, nelt);
5749 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5751 for (i = 0, j = 0; i < nelt; i++)
5752 if (3 * i + k < 2 * nelt)
5753 sel[i] = i;
5754 else
5755 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5756 indices.new_vector (sel, 2, nelt);
5757 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5759 first_vect = dr_chain[0];
5760 second_vect = dr_chain[1];
5762 /* Create interleaving stmt (low part of):
5763 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5764 ...}> */
5765 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5766 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5767 second_vect, perm3_mask_low);
5768 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5770 /* Create interleaving stmt (high part of):
5771 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5772 nelt + ((nelt + k) % 3), ...}> */
5773 first_vect = data_ref;
5774 second_vect = dr_chain[2];
5775 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5776 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5777 second_vect, perm3_mask_high);
5778 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5779 (*result_chain)[k] = data_ref;
5782 else
5784 /* If length is not equal to 3 then only a power of 2 is supported. */
5785 gcc_assert (pow2p_hwi (length));
5787 /* The encoding has a single stepped pattern. */
5788 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5789 vec_perm_builder sel (nelt, 1, 3);
5790 sel.quick_grow (3);
5791 for (i = 0; i < 3; ++i)
5792 sel[i] = i * 2;
5793 vec_perm_indices indices (sel, 2, nelt);
5794 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5796 for (i = 0; i < 3; ++i)
5797 sel[i] = i * 2 + 1;
5798 indices.new_vector (sel, 2, nelt);
5799 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
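/* As a concrete illustration, for nelt == 8 the masks just built are
   perm_mask_even = {0, 2, 4, 6, 8, 10, 12, 14} and
   perm_mask_odd  = {1, 3, 5, 7, 9, 11, 13, 15}, matching the E1/E2
   extract_even/extract_odd steps in the example before this function.  */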
5801 for (i = 0; i < log_length; i++)
5803 for (j = 0; j < length; j += 2)
5805 first_vect = dr_chain[j];
5806 second_vect = dr_chain[j+1];
5808 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5809 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5810 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5811 first_vect, second_vect,
5812 perm_mask_even);
5813 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5814 (*result_chain)[j/2] = data_ref;
5816 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5817 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5818 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5819 first_vect, second_vect,
5820 perm_mask_odd);
5821 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5822 (*result_chain)[j/2+length/2] = data_ref;
5824 memcpy (dr_chain.address (), result_chain->address (),
5825 length * sizeof (tree));
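/* Cost note (illustrative): for LENGTH == 4 the loop above runs
   log2 (4) == 2 stages and emits LENGTH permutes per stage, i.e. 8
   VEC_PERM_EXPRs in total, matching the E1..E4 two-stage example in the
   comment before this function.  */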
5830 /* Function vect_shift_permute_load_chain.
5832 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5833 sequence of stmts to reorder the input data accordingly.
5834 Return the final references for loads in RESULT_CHAIN.
5835 Return true if successful, false otherwise.
5837 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5838 The input is 3 vectors each containing 8 elements. We assign a
5839 number to each element; the input sequence is:
5841 1st vec: 0 1 2 3 4 5 6 7
5842 2nd vec: 8 9 10 11 12 13 14 15
5843 3rd vec: 16 17 18 19 20 21 22 23
5845 The output sequence should be:
5847 1st vec: 0 3 6 9 12 15 18 21
5848 2nd vec: 1 4 7 10 13 16 19 22
5849 3rd vec: 2 5 8 11 14 17 20 23
5851 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5853 First we shuffle all 3 vectors to get correct elements order:
5855 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5856 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5857 3rd vec: (16 19 22) (17 20 23) (18 21)
5859 Next we unite and shift vector 3 times:
5861 1st step:
5862 shift right by 6 the concatenation of:
5863 "1st vec" and "2nd vec"
5864 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5865 "2nd vec" and "3rd vec"
5866 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5867 "3rd vec" and "1st vec"
5868 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5869 | New vectors |
5871 So that now new vectors are:
5873 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5874 2nd vec: (10 13) (16 19 22) (17 20 23)
5875 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5877 2nd step:
5878 shift right by 5 the concatenation of:
5879 "1st vec" and "3rd vec"
5880 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5881 "2nd vec" and "1st vec"
5882 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5883 "3rd vec" and "2nd vec"
5884 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5885 | New vectors |
5887 So that now new vectors are:
5889 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5890 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5891 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5893 3rd step:
5894 shift right by 5 the concatenation of:
5895 "1st vec" and "1st vec"
5896 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5897 shift right by 3 the concatenation of:
5898 "2nd vec" and "2nd vec"
5899 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5900 | New vectors |
5902 So that now all vectors are READY:
5903 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5904 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5905 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5907 This algorithm is faster than the one in vect_permute_load_chain if:
5908 1. "shift of a concatenation" is faster than general permutation.
5909 This is usually so.
5910 2. The TARGET machine can't execute vector instructions in parallel.
5911 This is because each step of the algorithm depends on the previous one.
5912 The algorithm in vect_permute_load_chain is much more parallel.
5914 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5917 static bool
5918 vect_shift_permute_load_chain (vec<tree> dr_chain,
5919 unsigned int length,
5920 stmt_vec_info stmt_info,
5921 gimple_stmt_iterator *gsi,
5922 vec<tree> *result_chain)
5924 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5925 tree perm2_mask1, perm2_mask2, perm3_mask;
5926 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5927 gimple *perm_stmt;
5929 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5930 unsigned int i;
5931 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5933 unsigned HOST_WIDE_INT nelt, vf;
5934 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5935 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5936 /* Not supported for variable-length vectors. */
5937 return false;
5939 vec_perm_builder sel (nelt, nelt, 1);
5940 sel.quick_grow (nelt);
5942 result_chain->quick_grow (length);
5943 memcpy (result_chain->address (), dr_chain.address (),
5944 length * sizeof (tree));
5946 if (pow2p_hwi (length) && vf > 4)
5948 unsigned int j, log_length = exact_log2 (length);
5949 for (i = 0; i < nelt / 2; ++i)
5950 sel[i] = i * 2;
5951 for (i = 0; i < nelt / 2; ++i)
5952 sel[nelt / 2 + i] = i * 2 + 1;
5953 vec_perm_indices indices (sel, 2, nelt);
5954 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5956 if (dump_enabled_p ())
5957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5958 "shuffle of 2 fields structure is not \
5959 supported by target\n");
5960 return false;
5962 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5964 for (i = 0; i < nelt / 2; ++i)
5965 sel[i] = i * 2 + 1;
5966 for (i = 0; i < nelt / 2; ++i)
5967 sel[nelt / 2 + i] = i * 2;
5968 indices.new_vector (sel, 2, nelt);
5969 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5971 if (dump_enabled_p ())
5972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5973 "shuffle of 2 fields structure is not \
5974 supported by target\n");
5975 return false;
5977 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5979 /* Generating permutation constant to shift all elements.
5980 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5981 for (i = 0; i < nelt; i++)
5982 sel[i] = nelt / 2 + i;
5983 indices.new_vector (sel, 2, nelt);
5984 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5988 "shift permutation is not supported by target\n");
5989 return false;
5991 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5993 /* Generating permutation constant to select vector from 2.
5994 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5995 for (i = 0; i < nelt / 2; i++)
5996 sel[i] = i;
5997 for (i = nelt / 2; i < nelt; i++)
5998 sel[i] = nelt + i;
5999 indices.new_vector (sel, 2, nelt);
6000 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6002 if (dump_enabled_p ())
6003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6004 "select is not supported by target\n");
6005 return false;
6007 select_mask = vect_gen_perm_mask_checked (vectype, indices);
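/* Illustration for nelt == 8: perm2_mask1 = {0, 2, 4, 6, 1, 3, 5, 7} and
   perm2_mask2 = {1, 3, 5, 7, 0, 2, 4, 6}.  In the loop below each input
   vector is first reordered into (evens | odds) resp. (odds | evens);
   SHIFT1_MASK then gathers the odd elements of the pair and SELECT_MASK
   the even elements.  */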
6009 for (i = 0; i < log_length; i++)
6011 for (j = 0; j < length; j += 2)
6013 first_vect = dr_chain[j];
6014 second_vect = dr_chain[j + 1];
6016 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6017 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6018 first_vect, first_vect,
6019 perm2_mask1);
6020 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6021 vect[0] = data_ref;
6023 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6024 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6025 second_vect, second_vect,
6026 perm2_mask2);
6027 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6028 vect[1] = data_ref;
6030 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6031 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6032 vect[0], vect[1], shift1_mask);
6033 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6034 (*result_chain)[j/2 + length/2] = data_ref;
6036 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6037 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6038 vect[0], vect[1], select_mask);
6039 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6040 (*result_chain)[j/2] = data_ref;
6042 memcpy (dr_chain.address (), result_chain->address (),
6043 length * sizeof (tree));
6045 return true;
6047 if (length == 3 && vf > 2)
6049 unsigned int k = 0, l = 0;
6051 /* Generating permutation constant to get all elements in the right order.
6052 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6053 for (i = 0; i < nelt; i++)
6055 if (3 * k + (l % 3) >= nelt)
6057 k = 0;
6058 l += (3 - (nelt % 3));
6060 sel[i] = 3 * k + (l % 3);
6061 k++;
6063 vec_perm_indices indices (sel, 2, nelt);
6064 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6066 if (dump_enabled_p ())
6067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6068 "shuffle of 3 fields structure is not \
6069 supported by target\n");
6070 return false;
6072 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6074 /* Generating permutation constant to shift all elements.
6075 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6076 for (i = 0; i < nelt; i++)
6077 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6078 indices.new_vector (sel, 2, nelt);
6079 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6081 if (dump_enabled_p ())
6082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6083 "shift permutation is not supported by target\n");
6084 return false;
6086 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6088 /* Generating permutation constant to shift all elements.
6089 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6090 for (i = 0; i < nelt; i++)
6091 sel[i] = 2 * (nelt / 3) + 1 + i;
6092 indices.new_vector (sel, 2, nelt);
6093 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6095 if (dump_enabled_p ())
6096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6097 "shift permutation is not supported by target\n");
6098 return false;
6100 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6102 /* Generating permutation constant to shift all elements.
6103 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6104 for (i = 0; i < nelt; i++)
6105 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6106 indices.new_vector (sel, 2, nelt);
6107 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6109 if (dump_enabled_p ())
6110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6111 "shift permutation is not supported by target\n");
6112 return false;
6114 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6116 /* Generating permutation constant to shift all elements.
6117 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6118 for (i = 0; i < nelt; i++)
6119 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6120 indices.new_vector (sel, 2, nelt);
6121 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6123 if (dump_enabled_p ())
6124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6125 "shift permutation is not supported by target\n");
6126 return false;
6128 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6130 for (k = 0; k < 3; k++)
6132 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6133 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6134 dr_chain[k], dr_chain[k],
6135 perm3_mask);
6136 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6137 vect[k] = data_ref;
6140 for (k = 0; k < 3; k++)
6142 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6143 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6144 vect[k % 3], vect[(k + 1) % 3],
6145 shift1_mask);
6146 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6147 vect_shift[k] = data_ref;
6150 for (k = 0; k < 3; k++)
6152 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6153 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6154 vect_shift[(4 - k) % 3],
6155 vect_shift[(3 - k) % 3],
6156 shift2_mask);
6157 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6158 vect[k] = data_ref;
6161 (*result_chain)[3 - (nelt % 3)] = vect[2];
6163 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6164 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6165 vect[0], shift3_mask);
6166 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6167 (*result_chain)[nelt % 3] = data_ref;
6169 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6170 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6171 vect[1], shift4_mask);
6172 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6173 (*result_chain)[0] = data_ref;
6174 return true;
6176 return false;
6179 /* Function vect_transform_grouped_load.
6181 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6182 to perform their permutation and ascribe the resulting vectorized statements to
6183 the scalar statements.
6186 void
6187 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6188 int size, gimple_stmt_iterator *gsi)
6190 machine_mode mode;
6191 vec<tree> result_chain = vNULL;
6193 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6194 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6195 vectors, that are ready for vector computation. */
6196 result_chain.create (size);
6198 /* If the reassociation width for the vector type is 2 or greater, the target
6199 machine can execute 2 or more vector instructions in parallel. Otherwise try
6200 to get the chain for the load group using vect_shift_permute_load_chain. */
6201 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6202 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6203 || pow2p_hwi (size)
6204 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6205 gsi, &result_chain))
6206 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6207 vect_record_grouped_load_vectors (stmt_info, result_chain);
6208 result_chain.release ();
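/* Illustration of the heuristic above: with SIZE == 3 on a target whose
   reassociation width for VEC_PERM_EXPR is 1, vect_shift_permute_load_chain
   is tried first and vect_permute_load_chain is used only as the fallback
   when the shift-based scheme is not supported.  For power-of-two group
   sizes the generic permute chain is used directly.  */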
6211 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6212 generated as part of the vectorization of STMT_INFO. Assign the statement
6213 for each vector to the associated scalar statement. */
6215 void
6216 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6217 vec<tree> result_chain)
6219 vec_info *vinfo = stmt_info->vinfo;
6220 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6221 unsigned int i, gap_count;
6222 tree tmp_data_ref;
6224 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6225 Since we scan the chain starting from its first node, their order
6226 corresponds to the order of data-refs in RESULT_CHAIN. */
6227 stmt_vec_info next_stmt_info = first_stmt_info;
6228 gap_count = 1;
6229 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6231 if (!next_stmt_info)
6232 break;
6234 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6235 code elimination pass later. No need to check for the first stmt in
6236 the group, since it always exists.
6237 DR_GROUP_GAP is the number of steps in elements from the previous
6238 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6239 correspond to the gaps. */
6240 if (next_stmt_info != first_stmt_info
6241 && gap_count < DR_GROUP_GAP (next_stmt_info))
6243 gap_count++;
6244 continue;
6247 while (next_stmt_info)
6249 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6250 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6251 copies, and we put the new vector statement in the first available
6252 RELATED_STMT. */
6253 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6254 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6255 else
6257 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6259 stmt_vec_info prev_stmt_info
6260 = STMT_VINFO_VEC_STMT (next_stmt_info);
6261 stmt_vec_info rel_stmt_info
6262 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6263 while (rel_stmt_info)
6265 prev_stmt_info = rel_stmt_info;
6266 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6269 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6273 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6274 gap_count = 1;
6275 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6276 put the same TMP_DATA_REF as its vectorized statement; otherwise
6277 get the next data-ref from RESULT_CHAIN. */
6278 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6279 break;
6284 /* Function vect_can_force_dr_alignment_p.
6286 Returns whether the alignment of a DECL can be forced to be aligned
6287 on ALIGNMENT bit boundary. */
6289 bool
6290 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6292 if (!VAR_P (decl))
6293 return false;
6295 if (decl_in_symtab_p (decl)
6296 && !symtab_node::get (decl)->can_increase_alignment_p ())
6297 return false;
6299 if (TREE_STATIC (decl))
6300 return (alignment <= MAX_OFILE_ALIGNMENT);
6301 else
6302 return (alignment <= MAX_STACK_ALIGNMENT);
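/* For example (illustrative only), for a file-scope

     static float a[256];

   the vectorizer may raise DECL_ALIGN up to the vector alignment, provided
   the symbol table allows increasing it and it does not exceed
   MAX_OFILE_ALIGNMENT, whereas a local (stack) variable is limited by
   MAX_STACK_ALIGNMENT instead.  */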
6306 /* Return whether the data reference DR_INFO is supported with respect to its
6307 alignment.
6308 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6309 it is aligned, i.e., check if it is possible to vectorize it with a different
6310 alignment. */
6312 enum dr_alignment_support
6313 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6314 bool check_aligned_accesses)
6316 data_reference *dr = dr_info->dr;
6317 stmt_vec_info stmt_info = dr_info->stmt;
6318 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6319 machine_mode mode = TYPE_MODE (vectype);
6320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6321 struct loop *vect_loop = NULL;
6322 bool nested_in_vect_loop = false;
6324 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6325 return dr_aligned;
6327 /* For now assume all conditional loads/stores support unaligned
6328 access without any special code. */
6329 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6330 if (gimple_call_internal_p (stmt)
6331 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6332 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6333 return dr_unaligned_supported;
6335 if (loop_vinfo)
6337 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6338 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6341 /* Possibly unaligned access. */
6343 /* We can choose between using the implicit realignment scheme (generating
6344 a misaligned_move stmt) and the explicit realignment scheme (generating
6345 aligned loads with a REALIGN_LOAD). There are two variants to the
6346 explicit realignment scheme: optimized, and unoptimized.
6347 We can optimize the realignment only if the step between consecutive
6348 vector loads is equal to the vector size. Since the vector memory
6349 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6350 is guaranteed that the misalignment amount remains the same throughout the
6351 execution of the vectorized loop. Therefore, we can create the
6352 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6353 at the loop preheader.
6355 However, in the case of outer-loop vectorization, when vectorizing a
6356 memory access in the inner-loop nested within the LOOP that is now being
6357 vectorized, while it is guaranteed that the misalignment of the
6358 vectorized memory access will remain the same in different outer-loop
6359 iterations, it is *not* guaranteed that it will remain the same throughout
6360 the execution of the inner-loop. This is because the inner-loop advances
6361 with the original scalar step (and not in steps of VS). If the inner-loop
6362 step happens to be a multiple of VS, then the misalignment remains fixed
6363 and we can use the optimized realignment scheme. For example:
6365 for (i=0; i<N; i++)
6366 for (j=0; j<M; j++)
6367 s += a[i+j];
6369 When vectorizing the i-loop in the above example, the step between
6370 consecutive vector loads is 1, and so the misalignment does not remain
6371 fixed across the execution of the inner-loop, and the realignment cannot
6372 be optimized (as illustrated in the following pseudo vectorized loop):
6374 for (i=0; i<N; i+=4)
6375 for (j=0; j<M; j++){
6376 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6377 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6378 // (assuming that we start from an aligned address).
6381 We therefore have to use the unoptimized realignment scheme:
6383 for (i=0; i<N; i+=4)
6384 for (j=k; j<M; j+=4)
6385 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6386 // that the misalignment of the initial address is
6387 // 0).
6389 The loop can then be vectorized as follows:
6391 for (k=0; k<4; k++){
6392 rt = get_realignment_token (&vp[k]);
6393 for (i=0; i<N; i+=4){
6394 v1 = vp[i+k];
6395 for (j=k; j<M; j+=4){
6396 v2 = vp[i+j+VS-1];
6397 va = REALIGN_LOAD <v1,v2,rt>;
6398 vs += va;
6399 v1 = v2;
6402 } */
6404 if (DR_IS_READ (dr))
6406 bool is_packed = false;
6407 tree type = (TREE_TYPE (DR_REF (dr)));
6409 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6410 && (!targetm.vectorize.builtin_mask_for_load
6411 || targetm.vectorize.builtin_mask_for_load ()))
6413 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6415 /* If we are doing SLP then the accesses need not have the
6416 same alignment; instead it depends on the SLP group size. */
6417 if (loop_vinfo
6418 && STMT_SLP_TYPE (stmt_info)
6419 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6420 * (DR_GROUP_SIZE
6421 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6422 TYPE_VECTOR_SUBPARTS (vectype)))
6424 else if (!loop_vinfo
6425 || (nested_in_vect_loop
6426 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6427 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6428 return dr_explicit_realign;
6429 else
6430 return dr_explicit_realign_optimized;
6432 if (!known_alignment_for_access_p (dr_info))
6433 is_packed = not_size_aligned (DR_REF (dr));
6435 if (targetm.vectorize.support_vector_misalignment
6436 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6437 /* Can't software pipeline the loads, but can at least do them. */
6438 return dr_unaligned_supported;
6440 else
6442 bool is_packed = false;
6443 tree type = (TREE_TYPE (DR_REF (dr)));
6445 if (!known_alignment_for_access_p (dr_info))
6446 is_packed = not_size_aligned (DR_REF (dr));
6448 if (targetm.vectorize.support_vector_misalignment
6449 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6450 return dr_unaligned_supported;
6453 /* Unsupported. */
6454 return dr_unaligned_unsupported;