gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
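/* A hypothetical example (not taken from the sources): for COUNT == 3
   vectors of a 128-bit type such as V4SI, the array mode requested below
   must hold 3 * 128 = 384 bits; the function then checks that the
   load-/store-lanes conversion optab is actually implemented for that
   array mode before reporting support.  */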
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
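/* Hypothetical example for illustration: in a loop such as

     short *a;
     int *b;
     for (i = 0; i < n; i++)
       b[i] = (int) a[i];

   every lhs has type 'int', but the smallest scalar type is 'short',
   which appears only as the rhs of the promotion; it is this 2-byte
   type that determines the vectorization factor.  */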
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
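/* Illustrative example (assumed user input, not from the sources): with

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i + k] = a[i] + 1.0;

   the user asserts that up to 8 consecutive iterations may be executed
   concurrently, so a conservatively computed distance of at most 8 is
   treated as independence below, with *MAX_VF capped at the safelen.  */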
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB. Return FALSE if versioning for alias
277 will instead check the dependence at run-time. Adjust *MAX_VF according
278 to the data dependence. */
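/* A small worked example (illustrative only): in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the read of a[i] and the write of a[i + 3] form a dependence with
   distance 3.  The loop is still vectorizable, but only with a
   vectorization factor of at most 3, which is how *MAX_VF gets
   adjusted below.  */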
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group, unless the stride could be smaller than the
309 group size. */
310 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
311 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
312 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
313 return false;
315 /* Even if we have an anti-dependence then, as the vectorized loop covers at
316 least two scalar iterations, there is always also a true dependence.
317 As the vectorizer does not re-order loads and stores we can ignore
318 the anti-dependence if TBAA can disambiguate both DRs similar to the
319 case with known negative distance anti-dependences (positive
320 distance anti-dependences would violate TBAA constraints). */
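/* For instance (hypothetical): if one access only reads through an
   'int *' and the other only writes through a 'float *', their alias
   sets do not conflict under the usual strict-aliasing rules, so the
   apparent anti-dependence can be ignored here.  */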
321 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
322 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
323 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
324 get_alias_set (DR_REF (drb))))
325 return false;
327 /* Unknown data dependence. */
328 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
330 /* If user asserted safelen consecutive iterations can be
331 executed concurrently, assume independence. */
332 if (loop->safelen >= 2)
334 if ((unsigned int) loop->safelen < *max_vf)
335 *max_vf = loop->safelen;
336 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
337 return false;
340 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
341 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
343 if (dump_enabled_p ())
345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
346 "versioning for alias not supported for: "
347 "can't determine dependence between ");
348 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
349 DR_REF (dra));
350 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
351 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
352 DR_REF (drb));
353 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
355 return true;
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
361 "versioning for alias required: "
362 "can't determine dependence between ");
363 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
364 DR_REF (dra));
365 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
366 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
367 DR_REF (drb));
368 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
371 /* Add to list of ddrs that need to be tested at run-time. */
372 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
375 /* Known data dependence. */
376 if (DDR_NUM_DIST_VECTS (ddr) == 0)
378 /* If user asserted safelen consecutive iterations can be
379 executed concurrently, assume independence. */
380 if (loop->safelen >= 2)
382 if ((unsigned int) loop->safelen < *max_vf)
383 *max_vf = loop->safelen;
384 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
385 return false;
388 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
389 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "versioning for alias not supported for: "
395 "bad dist vector for ");
396 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
397 DR_REF (dra));
398 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
399 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
400 DR_REF (drb));
401 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
403 return true;
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409 "versioning for alias required: "
410 "bad dist vector for ");
411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
412 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
413 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
416 /* Add to list of ddrs that need to be tested at run-time. */
417 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
420 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
422 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
423 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
424 loop_depth, max_vf))
425 return false;
427 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
429 int dist = dist_v[loop_depth];
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_NOTE, vect_location,
433 "dependence distance = %d.\n", dist);
435 if (dist == 0)
437 if (dump_enabled_p ())
439 dump_printf_loc (MSG_NOTE, vect_location,
440 "dependence distance == 0 between ");
441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
442 dump_printf (MSG_NOTE, " and ");
443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
444 dump_printf (MSG_NOTE, "\n");
447 /* When we perform grouped accesses and perform implicit CSE
448 by detecting equal accesses and doing disambiguation with
449 runtime alias tests like for
450 .. = a[i];
451 .. = a[i+1];
452 a[i] = ..;
453 a[i+1] = ..;
454 *p = ..;
455 .. = a[i];
456 .. = a[i+1];
457 where we will end up loading { a[i], a[i+1] } once, make
458 sure that inserting group loads before the first load and
459 stores after the last store will do the right thing.
460 Similar for groups like
461 a[i] = ...;
462 ... = a[i];
463 a[i+1] = ...;
464 where loads from the group interleave with the store. */
465 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
467 if (dump_enabled_p ())
468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
469 "READ_WRITE dependence in interleaving.\n");
470 return true;
473 if (loop->safelen < 2)
475 tree indicator = dr_zero_step_indicator (dra);
476 if (TREE_CODE (indicator) != INTEGER_CST)
477 vect_check_nonzero_value (loop_vinfo, indicator);
478 else if (integer_zerop (indicator))
480 if (dump_enabled_p ())
481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
482 "access also has a zero step\n");
483 return true;
486 continue;
489 if (dist > 0 && DDR_REVERSED_P (ddr))
491 /* If DDR_REVERSED_P the order of the data-refs in DDR was
492 reversed (to make distance vector positive), and the actual
493 distance is negative. */
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
496 "dependence distance negative.\n");
497 /* Record a negative dependence distance to later limit the
498 amount of stmt copying / unrolling we can perform.
499 Only need to handle read-after-write dependence. */
500 if (DR_IS_READ (drb)
501 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
502 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
503 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
504 continue;
507 unsigned int abs_dist = abs (dist);
508 if (abs_dist >= 2 && abs_dist < *max_vf)
510 /* The dependence distance requires reduction of the maximal
511 vectorization factor. */
512 *max_vf = abs (dist);
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "adjusting maximal vectorization factor to %i\n",
516 *max_vf);
519 if (abs_dist >= *max_vf)
521 /* Dependence distance does not create dependence, as far as
522 vectorization is concerned, in this case. */
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "dependence distance >= VF.\n");
526 continue;
529 if (dump_enabled_p ())
531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
532 "not vectorized, possible dependence "
533 "between data-refs ");
534 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
540 return true;
543 return false;
546 /* Function vect_analyze_data_ref_dependences.
548 Examine all the data references in the loop, and make sure there do not
549 exist any data dependences between them. Set *MAX_VF according to
550 the maximum vectorization factor the data dependences allow. */
552 bool
553 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
554 unsigned int *max_vf)
556 unsigned int i;
557 struct data_dependence_relation *ddr;
559 if (dump_enabled_p ())
560 dump_printf_loc (MSG_NOTE, vect_location,
561 "=== vect_analyze_data_ref_dependences ===\n");
563 LOOP_VINFO_DDRS (loop_vinfo)
564 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
565 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
566 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
567 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
568 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
569 &LOOP_VINFO_DDRS (loop_vinfo),
570 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
571 return false;
573 /* For epilogues we either have no aliases or alias versioning
574 was applied to original loop. Therefore we may just get max_vf
575 using VF of original loop. */
576 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
577 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
578 else
579 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
580 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
581 return false;
583 return true;
587 /* Function vect_slp_analyze_data_ref_dependence.
589 Return TRUE if there (might) exist a dependence between the memory
590 references DRA and DRB of DDR, and FALSE if they are known to be
591 independent. Unlike the loop variant above, no run-time alias check
592 is set up here; an unknown dependence simply prevents vectorization. */
594 static bool
595 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
597 struct data_reference *dra = DDR_A (ddr);
598 struct data_reference *drb = DDR_B (ddr);
600 /* We need to check dependences of statements marked as unvectorizable
601 as well, they still can prohibit vectorization. */
603 /* Independent data accesses. */
604 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
605 return false;
607 if (dra == drb)
608 return false;
610 /* Read-read is OK. */
611 if (DR_IS_READ (dra) && DR_IS_READ (drb))
612 return false;
614 /* If dra and drb are part of the same interleaving chain consider
615 them independent. */
616 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
617 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
618 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
619 return false;
621 /* Unknown data dependence. */
622 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "can't determine dependence between ");
628 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
629 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
630 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
631 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
634 else if (dump_enabled_p ())
636 dump_printf_loc (MSG_NOTE, vect_location,
637 "determined dependence between ");
638 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
639 dump_printf (MSG_NOTE, " and ");
640 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
641 dump_printf (MSG_NOTE, "\n");
644 return true;
648 /* Analyze dependences involved in the transform of SLP NODE. STORES
649 contain the vector of scalar stores of this instance if we are
650 disambiguating the loads. */
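/* For example (hypothetical input): given the scalar sequence

     a[0] = x;
     *p = z;
     a[1] = y;

   the vectorized store for the group { a[0], a[1] } is emitted at the
   position of the last scalar store, so a[0] = x is effectively sunk
   past *p = z; this is only valid if *p can be shown not to alias
   a[0].  */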
652 static bool
653 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
654 vec<gimple *> stores, gimple *last_store)
656 /* This walks over all stmts involved in the SLP load/store done
657 in NODE verifying we can sink them up to the last stmt in the
658 group. */
659 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
660 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
662 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
663 if (access == last_access)
664 continue;
665 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
666 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
667 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
669 gimple *stmt = gsi_stmt (gsi);
670 if (! gimple_vuse (stmt)
671 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
672 continue;
674 /* If we couldn't record a (single) data reference for this
675 stmt we have to give up. */
676 /* ??? Here and below if dependence analysis fails we can resort
677 to the alias oracle which can handle more kinds of stmts. */
678 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
679 if (!dr_b)
680 return false;
682 bool dependent = false;
683 /* If we run into a store of this same instance (we've just
684 marked those) then delay dependence checking until we run
685 into the last store because this is where it will have
686 been sunk to (and we verify if we can do that as well). */
687 if (gimple_visited_p (stmt))
689 if (stmt != last_store)
690 continue;
691 unsigned i;
692 gimple *store;
693 FOR_EACH_VEC_ELT (stores, i, store)
695 data_reference *store_dr
696 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
697 ddr_p ddr = initialize_data_dependence_relation
698 (dr_a, store_dr, vNULL);
699 dependent = vect_slp_analyze_data_ref_dependence (ddr);
700 free_dependence_relation (ddr);
701 if (dependent)
702 break;
705 else
707 ddr_p ddr = initialize_data_dependence_relation (dr_a,
708 dr_b, vNULL);
709 dependent = vect_slp_analyze_data_ref_dependence (ddr);
710 free_dependence_relation (ddr);
712 if (dependent)
713 return false;
716 return true;
720 /* Function vect_slp_analyze_instance_dependence.
722 Examine all the data references in the SLP instance's basic-block and
723 make sure there do not exist any data dependences between them that
724 would prevent the instance from being vectorized. */
726 bool
727 vect_slp_analyze_instance_dependence (slp_instance instance)
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_NOTE, vect_location,
731 "=== vect_slp_analyze_instance_dependence ===\n");
733 /* The stores of this instance are at the root of the SLP tree. */
734 slp_tree store = SLP_INSTANCE_TREE (instance);
735 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
736 store = NULL;
738 /* Verify we can sink stores to the vectorized stmt insert location. */
739 gimple *last_store = NULL;
740 if (store)
742 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
743 return false;
745 /* Mark stores in this instance and remember the last one. */
746 last_store = vect_find_last_scalar_stmt_in_slp (store);
747 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
748 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
751 bool res = true;
753 /* Verify we can sink loads to the vectorized stmt insert location,
754 special-casing stores of this instance. */
755 slp_tree load;
756 unsigned int i;
757 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
758 if (! vect_slp_analyze_node_dependences (instance, load,
759 store
760 ? SLP_TREE_SCALAR_STMTS (store)
761 : vNULL, last_store))
763 res = false;
764 break;
767 /* Unset the visited flag. */
768 if (store)
769 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
770 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
772 return res;
775 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
776 the statement that contains DRB, which is useful for recording in the
777 dump file. */
779 static void
780 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
781 innermost_loop_behavior *drb)
783 bool existed;
784 innermost_loop_behavior *&entry
785 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
786 if (!existed || entry->base_alignment < drb->base_alignment)
788 entry = drb;
789 if (dump_enabled_p ())
791 dump_printf_loc (MSG_NOTE, vect_location,
792 "recording new base alignment for ");
793 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
794 dump_printf (MSG_NOTE, "\n");
795 dump_printf_loc (MSG_NOTE, vect_location,
796 " alignment: %d\n", drb->base_alignment);
797 dump_printf_loc (MSG_NOTE, vect_location,
798 " misalignment: %d\n", drb->base_misalignment);
799 dump_printf_loc (MSG_NOTE, vect_location,
800 " based on: ");
801 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
806 /* If the region we're going to vectorize is reached, all unconditional
807 data references occur at least once. We can therefore pool the base
808 alignment guarantees from each unconditional reference. Do this by
809 going through all the data references in VINFO and checking whether
810 the containing statement makes the reference unconditionally. If so,
811 record the alignment of the base address in VINFO so that it can be
812 used for all other references with the same base. */
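/* Sketch of the idea (hypothetical code, for illustration only):

     x = q[i];          <-- unconditional, always executed once the
                            region is entered
     if (c)
       y = q[i + 1];    <-- conditional, same base address Q

   Reaching the region implies that the first access executed, so the
   base alignment derived from it may also be used for the second,
   conditional reference.  */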
814 void
815 vect_record_base_alignments (vec_info *vinfo)
817 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
818 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
819 data_reference *dr;
820 unsigned int i;
821 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
822 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
824 gimple *stmt = DR_STMT (dr);
825 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
827 /* If DR is nested in the loop that is being vectorized, we can also
828 record the alignment of the base wrt the outer loop. */
829 if (loop && nested_in_vect_loop_p (loop, stmt))
831 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
832 vect_record_base_alignment
833 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
838 /* Return the target alignment for the vectorized form of DR. */
840 static unsigned int
841 vect_calculate_target_alignment (struct data_reference *dr)
843 gimple *stmt = DR_STMT (dr);
844 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
845 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
846 return targetm.vectorize.preferred_vector_alignment (vectype);
849 /* Function vect_compute_data_ref_alignment
851 Compute the misalignment of the data reference DR.
853 Output:
854 1. If during the misalignment computation it is found that the data reference
855 cannot be vectorized then false is returned.
856 2. DR_MISALIGNMENT (DR) is defined.
858 FOR NOW: No analysis is actually performed. Misalignment is calculated
859 only for trivial cases. TODO. */
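/* Worked example (illustrative numbers): with a 16-byte target alignment,
   a reference whose base is known to be 16-byte aligned and whose constant
   offset from that base (DR_INIT) is 20 bytes gets
   DR_MISALIGNMENT == 20 % 16 == 4.  If the offset or the step is not a
   compile-time constant, the misalignment is left as
   DR_MISALIGNMENT_UNKNOWN instead.  */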
861 bool
862 vect_compute_data_ref_alignment (struct data_reference *dr)
864 gimple *stmt = DR_STMT (dr);
865 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
866 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
867 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
868 struct loop *loop = NULL;
869 tree ref = DR_REF (dr);
870 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "vect_compute_data_ref_alignment:\n");
876 if (loop_vinfo)
877 loop = LOOP_VINFO_LOOP (loop_vinfo);
879 /* Initialize misalignment to unknown. */
880 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
882 innermost_loop_behavior *drb = vect_dr_behavior (dr);
883 bool step_preserves_misalignment_p;
885 unsigned HOST_WIDE_INT vector_alignment
886 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
887 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
889 /* No step for BB vectorization. */
890 if (!loop)
892 gcc_assert (integer_zerop (drb->step));
893 step_preserves_misalignment_p = true;
896 /* In case the dataref is in an inner-loop of the loop that is being
897 vectorized (LOOP), we use the base and misalignment information
898 relative to the outer-loop (LOOP). This is ok only if the misalignment
899 stays the same throughout the execution of the inner-loop, which is why
900 we have to check that the stride of the dataref in the inner-loop is a
901 multiple of the vector alignment. */
902 else if (nested_in_vect_loop_p (loop, stmt))
904 step_preserves_misalignment_p
905 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
907 if (dump_enabled_p ())
909 if (step_preserves_misalignment_p)
910 dump_printf_loc (MSG_NOTE, vect_location,
911 "inner step divides the vector alignment.\n");
912 else
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "inner step doesn't divide the vector"
915 " alignment.\n");
919 /* Similarly we can only use base and misalignment information relative to
920 an innermost loop if the misalignment stays the same throughout the
921 execution of the loop. As above, this is the case if the stride of
922 the dataref is a multiple of the alignment. */
923 else
925 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
926 step_preserves_misalignment_p
927 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
929 if (!step_preserves_misalignment_p && dump_enabled_p ())
930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
931 "step doesn't divide the vector alignment.\n");
934 unsigned int base_alignment = drb->base_alignment;
935 unsigned int base_misalignment = drb->base_misalignment;
937 /* Calculate the maximum of the pooled base address alignment and the
938 alignment that we can compute for DR itself. */
939 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
940 if (entry && base_alignment < (*entry)->base_alignment)
942 base_alignment = (*entry)->base_alignment;
943 base_misalignment = (*entry)->base_misalignment;
946 if (drb->offset_alignment < vector_alignment
947 || !step_preserves_misalignment_p
948 /* We need to know whether the step wrt the vectorized loop is
949 negative when computing the starting misalignment below. */
950 || TREE_CODE (drb->step) != INTEGER_CST)
952 if (dump_enabled_p ())
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "Unknown alignment for access: ");
956 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
957 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
959 return true;
962 if (base_alignment < vector_alignment)
964 unsigned int max_alignment;
965 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
966 if (max_alignment < vector_alignment
967 || !vect_can_force_dr_alignment_p (base,
968 vector_alignment * BITS_PER_UNIT))
970 if (dump_enabled_p ())
972 dump_printf_loc (MSG_NOTE, vect_location,
973 "can't force alignment of ref: ");
974 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
975 dump_printf (MSG_NOTE, "\n");
977 return true;
980 /* Force the alignment of the decl.
981 NOTE: This is the only change to the code we make during
982 the analysis phase, before deciding to vectorize the loop. */
983 if (dump_enabled_p ())
985 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
986 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
987 dump_printf (MSG_NOTE, "\n");
990 DR_VECT_AUX (dr)->base_decl = base;
991 DR_VECT_AUX (dr)->base_misaligned = true;
992 base_misalignment = 0;
994 poly_int64 misalignment
995 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
997 /* If this is a backward running DR then first access in the larger
998 vectype actually is N-1 elements before the address in the DR.
999 Adjust misalign accordingly. */
1000 if (tree_int_cst_sgn (drb->step) < 0)
1001 /* PLUS because STEP is negative. */
1002 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1003 * TREE_INT_CST_LOW (drb->step));
1005 unsigned int const_misalignment;
1006 if (!known_misalignment (misalignment, vector_alignment,
1007 &const_misalignment))
1009 if (dump_enabled_p ())
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 "Non-constant misalignment for access: ");
1013 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1014 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1016 return true;
1019 SET_DR_MISALIGNMENT (dr, const_misalignment);
1021 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1025 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1026 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1029 return true;
1032 /* Function vect_update_misalignment_for_peel.
1033 Sets DR's misalignment
1034 - to 0 if it has the same alignment as DR_PEEL,
1035 - to the misalignment computed using NPEEL if DR's misalignment is known,
1036 - to -1 (unknown) otherwise.
1038 DR - the data reference whose misalignment is to be adjusted.
1039 DR_PEEL - the data reference whose misalignment is being made
1040 zero in the vector loop by the peel.
1041 NPEEL - the number of iterations in the peel loop if the misalignment
1042 of DR_PEEL is known at compile time. */
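/* Worked example (illustrative numbers): assume a 16-byte target alignment
   and a 4-byte data reference currently misaligned by 8 bytes.  Peeling
   NPEEL == 2 iterations with a positive step advances the access by
   2 * 4 == 8 bytes, so the new misalignment is (8 + 8) & 15 == 0.  */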
1044 static void
1045 vect_update_misalignment_for_peel (struct data_reference *dr,
1046 struct data_reference *dr_peel, int npeel)
1048 unsigned int i;
1049 vec<dr_p> same_aligned_drs;
1050 struct data_reference *current_dr;
1051 int dr_size = vect_get_scalar_dr_size (dr);
1052 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1053 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1054 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1056 /* For interleaved data accesses the step in the loop must be multiplied by
1057 the size of the interleaving group. */
1058 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1059 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1060 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1061 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1063 /* It can be assumed that the data refs with the same alignment as dr_peel
1064 are aligned in the vector loop. */
1065 same_aligned_drs
1066 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1067 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1069 if (current_dr != dr)
1070 continue;
1071 gcc_assert (!known_alignment_for_access_p (dr)
1072 || !known_alignment_for_access_p (dr_peel)
1073 || (DR_MISALIGNMENT (dr) / dr_size
1074 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1075 SET_DR_MISALIGNMENT (dr, 0);
1076 return;
1079 if (known_alignment_for_access_p (dr)
1080 && known_alignment_for_access_p (dr_peel))
1082 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1083 int misal = DR_MISALIGNMENT (dr);
1084 misal += negative ? -npeel * dr_size : npeel * dr_size;
1085 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1086 SET_DR_MISALIGNMENT (dr, misal);
1087 return;
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1092 "to unknown (-1).\n");
1093 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1097 /* Function verify_data_ref_alignment
1099 Return TRUE if DR can be handled with respect to alignment. */
1101 static bool
1102 verify_data_ref_alignment (data_reference_p dr)
1104 enum dr_alignment_support supportable_dr_alignment
1105 = vect_supportable_dr_alignment (dr, false);
1106 if (!supportable_dr_alignment)
1108 if (dump_enabled_p ())
1110 if (DR_IS_READ (dr))
1111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112 "not vectorized: unsupported unaligned load.");
1113 else
1114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1115 "not vectorized: unsupported unaligned "
1116 "store.");
1118 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1119 DR_REF (dr));
1120 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1122 return false;
1125 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE, vect_location,
1127 "Vectorizing an unaligned access.\n");
1129 return true;
1132 /* Function vect_verify_datarefs_alignment
1134 Return TRUE if all data references in the loop can be
1135 handled with respect to alignment. */
1137 bool
1138 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1140 vec<data_reference_p> datarefs = vinfo->datarefs;
1141 struct data_reference *dr;
1142 unsigned int i;
1144 FOR_EACH_VEC_ELT (datarefs, i, dr)
1146 gimple *stmt = DR_STMT (dr);
1147 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1150 continue;
1152 /* For interleaving, only the alignment of the first access matters. */
1153 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1154 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1155 continue;
1157 /* Strided accesses perform only component accesses, alignment is
1158 irrelevant for them. */
1159 if (STMT_VINFO_STRIDED_P (stmt_info)
1160 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1161 continue;
1163 if (! verify_data_ref_alignment (dr))
1164 return false;
1167 return true;
1170 /* Given a memory reference EXP, return whether its alignment is less
1171 than its size. */
1173 static bool
1174 not_size_aligned (tree exp)
1176 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1177 return true;
1179 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1180 > get_object_alignment (exp));
1183 /* Function vector_alignment_reachable_p
1185 Return true if vector alignment for DR is reachable by peeling
1186 a few loop iterations. Return false otherwise. */
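/* Illustrative examples (hypothetical numbers): with a 16-byte target
   alignment, a 4-byte access misaligned by 8 bytes becomes aligned after
   peeling (16 - 8) / 4 == 2 iterations.  A 4-byte access misaligned by
   6 bytes can never be aligned this way, since 6 plus a multiple of 4 is
   never a multiple of 16; that is the "data size does not divide the
   misalignment" case rejected below.  */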
1188 static bool
1189 vector_alignment_reachable_p (struct data_reference *dr)
1191 gimple *stmt = DR_STMT (dr);
1192 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1193 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1195 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1197 /* For interleaved access we peel only if number of iterations in
1198 the prolog loop ({VF - misalignment}), is a multiple of the
1199 number of the interleaved accesses. */
1200 int elem_size, mis_in_elements;
1202 /* FORNOW: handle only known alignment. */
1203 if (!known_alignment_for_access_p (dr))
1204 return false;
1206 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1207 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1208 elem_size = vector_element_size (vector_size, nelements);
1209 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1211 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1212 return false;
1215 /* If misalignment is known at compile time then allow peeling
1216 only if natural alignment is reachable through peeling. */
1217 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1219 HOST_WIDE_INT elmsize =
1220 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1221 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_NOTE, vect_location,
1224 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1225 dump_printf (MSG_NOTE,
1226 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1228 if (DR_MISALIGNMENT (dr) % elmsize)
1230 if (dump_enabled_p ())
1231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232 "data size does not divide the misalignment.\n");
1233 return false;
1237 if (!known_alignment_for_access_p (dr))
1239 tree type = TREE_TYPE (DR_REF (dr));
1240 bool is_packed = not_size_aligned (DR_REF (dr));
1241 if (dump_enabled_p ())
1242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243 "Unknown misalignment, %snaturally aligned\n",
1244 is_packed ? "not " : "");
1245 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1248 return true;
1252 /* Calculate the cost of the memory access represented by DR. */
1254 static void
1255 vect_get_data_access_cost (struct data_reference *dr,
1256 unsigned int *inside_cost,
1257 unsigned int *outside_cost,
1258 stmt_vector_for_cost *body_cost_vec)
1260 gimple *stmt = DR_STMT (dr);
1261 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1262 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1263 int ncopies;
1265 if (PURE_SLP_STMT (stmt_info))
1266 ncopies = 1;
1267 else
1268 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1270 if (DR_IS_READ (dr))
1271 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1272 NULL, body_cost_vec, false);
1273 else
1274 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1276 if (dump_enabled_p ())
1277 dump_printf_loc (MSG_NOTE, vect_location,
1278 "vect_get_data_access_cost: inside_cost = %d, "
1279 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1283 typedef struct _vect_peel_info
1285 struct data_reference *dr;
1286 int npeel;
1287 unsigned int count;
1288 } *vect_peel_info;
1290 typedef struct _vect_peel_extended_info
1292 struct _vect_peel_info peel_info;
1293 unsigned int inside_cost;
1294 unsigned int outside_cost;
1295 } *vect_peel_extended_info;
1298 /* Peeling hashtable helpers. */
1300 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1302 static inline hashval_t hash (const _vect_peel_info *);
1303 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1306 inline hashval_t
1307 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1309 return (hashval_t) peel_info->npeel;
1312 inline bool
1313 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1315 return (a->npeel == b->npeel);
1319 /* Insert DR into peeling hash table with NPEEL as key. */
1321 static void
1322 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1323 loop_vec_info loop_vinfo, struct data_reference *dr,
1324 int npeel)
1326 struct _vect_peel_info elem, *slot;
1327 _vect_peel_info **new_slot;
1328 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1330 elem.npeel = npeel;
1331 slot = peeling_htab->find (&elem);
1332 if (slot)
1333 slot->count++;
1334 else
1336 slot = XNEW (struct _vect_peel_info);
1337 slot->npeel = npeel;
1338 slot->dr = dr;
1339 slot->count = 1;
1340 new_slot = peeling_htab->find_slot (slot, INSERT);
1341 *new_slot = slot;
1344 if (!supportable_dr_alignment
1345 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1346 slot->count += VECT_MAX_COST;
1350 /* Traverse peeling hash table to find peeling option that aligns maximum
1351 number of data accesses. */
1354 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1355 _vect_peel_extended_info *max)
1357 vect_peel_info elem = *slot;
1359 if (elem->count > max->peel_info.count
1360 || (elem->count == max->peel_info.count
1361 && max->peel_info.npeel > elem->npeel))
1363 max->peel_info.npeel = elem->npeel;
1364 max->peel_info.count = elem->count;
1365 max->peel_info.dr = elem->dr;
1368 return 1;
1371 /* Get the costs of peeling NPEEL iterations checking data access costs
1372 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1373 misalignment will be zero after peeling. */
1375 static void
1376 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1377 struct data_reference *dr0,
1378 unsigned int *inside_cost,
1379 unsigned int *outside_cost,
1380 stmt_vector_for_cost *body_cost_vec,
1381 unsigned int npeel,
1382 bool unknown_misalignment)
1384 unsigned i;
1385 data_reference *dr;
1387 FOR_EACH_VEC_ELT (datarefs, i, dr)
1389 gimple *stmt = DR_STMT (dr);
1390 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1391 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1392 continue;
1394 /* For interleaving, only the alignment of the first access
1395 matters. */
1396 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1397 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1398 continue;
1400 /* Strided accesses perform only component accesses, alignment is
1401 irrelevant for them. */
1402 if (STMT_VINFO_STRIDED_P (stmt_info)
1403 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1404 continue;
1406 int save_misalignment;
1407 save_misalignment = DR_MISALIGNMENT (dr);
1408 if (npeel == 0)
1410 else if (unknown_misalignment && dr == dr0)
1411 SET_DR_MISALIGNMENT (dr, 0);
1412 else
1413 vect_update_misalignment_for_peel (dr, dr0, npeel);
1414 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1415 body_cost_vec);
1416 SET_DR_MISALIGNMENT (dr, save_misalignment);
1420 /* Traverse peeling hash table and calculate cost for each peeling option.
1421 Find the one with the lowest cost. */
1424 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1425 _vect_peel_extended_info *min)
1427 vect_peel_info elem = *slot;
1428 int dummy;
1429 unsigned int inside_cost = 0, outside_cost = 0;
1430 gimple *stmt = DR_STMT (elem->dr);
1431 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1432 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1433 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1434 epilogue_cost_vec;
1436 prologue_cost_vec.create (2);
1437 body_cost_vec.create (2);
1438 epilogue_cost_vec.create (2);
1440 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1441 elem->dr, &inside_cost, &outside_cost,
1442 &body_cost_vec, elem->npeel, false);
1444 body_cost_vec.release ();
1446 outside_cost += vect_get_known_peeling_cost
1447 (loop_vinfo, elem->npeel, &dummy,
1448 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1449 &prologue_cost_vec, &epilogue_cost_vec);
1451 /* Prologue and epilogue costs are added to the target model later.
1452 These costs depend only on the scalar iteration cost, the
1453 number of peeling iterations finally chosen, and the number of
1454 misaligned statements. So discard the information found here. */
1455 prologue_cost_vec.release ();
1456 epilogue_cost_vec.release ();
1458 if (inside_cost < min->inside_cost
1459 || (inside_cost == min->inside_cost
1460 && outside_cost < min->outside_cost))
1462 min->inside_cost = inside_cost;
1463 min->outside_cost = outside_cost;
1464 min->peel_info.dr = elem->dr;
1465 min->peel_info.npeel = elem->npeel;
1466 min->peel_info.count = elem->count;
1469 return 1;
1473 /* Choose best peeling option by traversing peeling hash table and either
1474 choosing an option with the lowest cost (if cost model is enabled) or the
1475 option that aligns as many accesses as possible. */
1477 static struct _vect_peel_extended_info
1478 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1479 loop_vec_info loop_vinfo)
1481 struct _vect_peel_extended_info res;
1483 res.peel_info.dr = NULL;
1485 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1487 res.inside_cost = INT_MAX;
1488 res.outside_cost = INT_MAX;
1489 peeling_htab->traverse <_vect_peel_extended_info *,
1490 vect_peeling_hash_get_lowest_cost> (&res);
1492 else
1494 res.peel_info.count = 0;
1495 peeling_htab->traverse <_vect_peel_extended_info *,
1496 vect_peeling_hash_get_most_frequent> (&res);
1497 res.inside_cost = 0;
1498 res.outside_cost = 0;
1501 return res;
1504 /* Return true if the new peeling NPEEL is supported. */
1506 static bool
1507 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1508 unsigned npeel)
1510 unsigned i;
1511 struct data_reference *dr = NULL;
1512 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1513 gimple *stmt;
1514 stmt_vec_info stmt_info;
1515 enum dr_alignment_support supportable_dr_alignment;
1517 /* Ensure that all data refs can be vectorized after the peel. */
1518 FOR_EACH_VEC_ELT (datarefs, i, dr)
1520 int save_misalignment;
1522 if (dr == dr0)
1523 continue;
1525 stmt = DR_STMT (dr);
1526 stmt_info = vinfo_for_stmt (stmt);
1527 /* For interleaving, only the alignment of the first access
1528 matters. */
1529 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1530 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1531 continue;
1533 /* Strided accesses perform only component accesses, alignment is
1534 irrelevant for them. */
1535 if (STMT_VINFO_STRIDED_P (stmt_info)
1536 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1537 continue;
1539 save_misalignment = DR_MISALIGNMENT (dr);
1540 vect_update_misalignment_for_peel (dr, dr0, npeel);
1541 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1542 SET_DR_MISALIGNMENT (dr, save_misalignment);
1544 if (!supportable_dr_alignment)
1545 return false;
1548 return true;
1551 /* Function vect_enhance_data_refs_alignment
1553 This pass will use loop versioning and loop peeling in order to enhance
1554 the alignment of data references in the loop.
1556 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1557 original loop is to be vectorized. Any other loops that are created by
1558 the transformations performed in this pass - are not supposed to be
1559 vectorized. This restriction will be relaxed.
1561 This pass will require a cost model to guide it whether to apply peeling
1562 or versioning or a combination of the two. For example, the scheme that
1563 intel uses when given a loop with several memory accesses, is as follows:
1564 choose one memory access ('p') which alignment you want to force by doing
1565 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1566 other accesses are not necessarily aligned, or (2) use loop versioning to
1567 generate one loop in which all accesses are aligned, and another loop in
1568 which only 'p' is necessarily aligned.
1570 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1571 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1572 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1574 Devising a cost model is the most critical aspect of this work. It will
1575 guide us on which access to peel for, whether to use loop versioning, how
1576 many versions to create, etc. The cost model will probably consist of
1577 generic considerations as well as target specific considerations (on
1578 powerpc for example, misaligned stores are more painful than misaligned
1579 loads).
1581 Here are the general steps involved in alignment enhancements:
1583 -- original loop, before alignment analysis:
1584 for (i=0; i<N; i++){
1585 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1586 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1589 -- After vect_compute_data_refs_alignment:
1590 for (i=0; i<N; i++){
1591 x = q[i]; # DR_MISALIGNMENT(q) = 3
1592 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1595 -- Possibility 1: we do loop versioning:
1596 if (p is aligned) {
1597 for (i=0; i<N; i++){ # loop 1A
1598 x = q[i]; # DR_MISALIGNMENT(q) = 3
1599 p[i] = y; # DR_MISALIGNMENT(p) = 0
1602 else {
1603 for (i=0; i<N; i++){ # loop 1B
1604 x = q[i]; # DR_MISALIGNMENT(q) = 3
1605 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1609 -- Possibility 2: we do loop peeling:
1610 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1611 x = q[i];
1612 p[i] = y;
1614 for (i = 3; i < N; i++){ # loop 2A
1615 x = q[i]; # DR_MISALIGNMENT(q) = 0
1616 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1619 -- Possibility 3: combination of loop peeling and versioning:
1620 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1621 x = q[i];
1622 p[i] = y;
1624 if (p is aligned) {
1625 for (i = 3; i<N; i++){ # loop 3A
1626 x = q[i]; # DR_MISALIGNMENT(q) = 0
1627 p[i] = y; # DR_MISALIGNMENT(p) = 0
1630 else {
1631 for (i = 3; i<N; i++){ # loop 3B
1632 x = q[i]; # DR_MISALIGNMENT(q) = 0
1633 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1637 These loops are later passed to loop_transform to be vectorized. The
1638 vectorizer will use the alignment information to guide the transformation
1639 (whether to generate regular loads/stores, or with special handling for
1640 misalignment). */
1642 bool
1643 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1645 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1646 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1647 enum dr_alignment_support supportable_dr_alignment;
1648 struct data_reference *dr0 = NULL, *first_store = NULL;
1649 struct data_reference *dr;
1650 unsigned int i, j;
1651 bool do_peeling = false;
1652 bool do_versioning = false;
1653 bool stat;
1654 gimple *stmt;
1655 stmt_vec_info stmt_info;
1656 unsigned int npeel = 0;
1657 bool one_misalignment_known = false;
1658 bool one_misalignment_unknown = false;
1659 bool one_dr_unsupportable = false;
1660 struct data_reference *unsupportable_dr = NULL;
1661 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1662 unsigned possible_npeel_number = 1;
1663 tree vectype;
1664 unsigned int mis, same_align_drs_max = 0;
1665 hash_table<peel_info_hasher> peeling_htab (1);
1667 if (dump_enabled_p ())
1668 dump_printf_loc (MSG_NOTE, vect_location,
1669 "=== vect_enhance_data_refs_alignment ===\n");
1671 /* Reset data so we can safely be called multiple times. */
1672 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1673 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1675 /* While cost model enhancements are expected in the future, the high level
1676 view of the code at this time is as follows:
1678 A) If there is a misaligned access then see if peeling to align
1679 this access can make all data references satisfy
1680 vect_supportable_dr_alignment. If so, update data structures
1681 as needed and return true.
1683 B) If peeling wasn't possible and there is a data reference with an
1684 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1685 then see if loop versioning checks can be used to make all data
1686 references satisfy vect_supportable_dr_alignment. If so, update
1687 data structures as needed and return true.
1689 C) If neither peeling nor versioning were successful then return false if
1690 any data reference does not satisfy vect_supportable_dr_alignment.
1692 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1694 Note, Possibility 3 above (which is peeling and versioning together) is not
1695 being done at this time. */
1697 /* (1) Peeling to force alignment. */
1699 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1700 Considerations:
1701 + How many accesses will become aligned due to the peeling
1702 - How many accesses will become unaligned due to the peeling,
1703 and the cost of misaligned accesses.
1704 - The cost of peeling (the extra runtime checks, the increase
1705 in code size). */
1707 FOR_EACH_VEC_ELT (datarefs, i, dr)
1709 stmt = DR_STMT (dr);
1710 stmt_info = vinfo_for_stmt (stmt);
1712 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1713 continue;
1715 /* For interleaving, only the alignment of the first access
1716 matters. */
1717 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1718 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1719 continue;
1721 /* For invariant accesses there is nothing to enhance. */
1722 if (integer_zerop (DR_STEP (dr)))
1723 continue;
1725 /* Strided accesses perform only component accesses, alignment is
1726 irrelevant for them. */
1727 if (STMT_VINFO_STRIDED_P (stmt_info)
1728 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1729 continue;
1731 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1732 do_peeling = vector_alignment_reachable_p (dr);
1733 if (do_peeling)
1735 if (known_alignment_for_access_p (dr))
1737 unsigned int npeel_tmp = 0;
1738 bool negative = tree_int_cst_compare (DR_STEP (dr),
1739 size_zero_node) < 0;
1741 vectype = STMT_VINFO_VECTYPE (stmt_info);
1742 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1743 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1744 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1745 if (DR_MISALIGNMENT (dr) != 0)
1746 npeel_tmp = (mis & (target_align - 1)) / dr_size;
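/* A worked example with purely illustrative numbers (not taken from
   this file): for a forward, positive-step access with
   TARGET_ALIGN == 16 bytes, DR_SIZE == 4 and DR_MISALIGNMENT == 12,
   MIS is (unsigned) -12, so (MIS & 15) == 4 and
   NPEEL_TMP == 4 / 4 == 1: peeling one scalar iteration moves the
   access from byte offset 12 to 16, which is 0 modulo the target
   alignment.  */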
1748 /* For multiple types, it is possible that the bigger type access
1749 will have more than one peeling option. E.g., a loop with two
1750 types: one of size (vector size / 4), and the other one of
1751 size (vector size / 8). The vectorization factor will be 8. If both
1752 accesses are misaligned by 3, the first one needs one scalar
1753 iteration to be aligned, and the second one needs 5. But the
1754 first one will be aligned also by peeling 5 scalar
1755 iterations, and in that case both accesses will be aligned.
1756 Hence, in addition to the immediate peeling amount, we also want
1757 to try adding the full vector size, as long as we do not exceed
1758 the vectorization factor.
1759 We do this automatically under the cost model, since we calculate
1760 the cost for every peeling option. */
1761 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1763 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1764 ? vf * GROUP_SIZE (stmt_info) : vf);
1765 possible_npeel_number
1766 = vect_get_num_vectors (nscalars, vectype);
1768 /* NPEEL_TMP is 0 when there is no misalignment, but also
1769 allow peeling NELEMENTS. */
1770 if (DR_MISALIGNMENT (dr) == 0)
1771 possible_npeel_number++;
1774 /* Save info about DR in the hash table. Also include peeling
1775 amounts according to the explanation above. */
1776 for (j = 0; j < possible_npeel_number; j++)
1778 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1779 dr, npeel_tmp);
1780 npeel_tmp += target_align / dr_size;
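/* Continuing the illustrative numbers above: with
   target_align / dr_size == 4, the candidate peel amounts recorded
   in the hash table would be 1, 5, 9, ... for up to
   POSSIBLE_NPEEL_NUMBER entries, matching the "1 versus 5
   iterations" example in the comment before this block.  */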
1783 one_misalignment_known = true;
1785 else
1787 /* If we don't know any misalignment values, we prefer
1788 peeling for the data-ref that has the maximum number of data-refs
1789 with the same alignment, unless the target prefers to align
1790 stores over loads. */
1791 unsigned same_align_drs
1792 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1793 if (!dr0
1794 || same_align_drs_max < same_align_drs)
1796 same_align_drs_max = same_align_drs;
1797 dr0 = dr;
1799 /* For data-refs with the same number of related
1800 accesses prefer the one where the misalign
1801 computation will be invariant in the outermost loop. */
1802 else if (same_align_drs_max == same_align_drs)
1804 struct loop *ivloop0, *ivloop;
1805 ivloop0 = outermost_invariant_loop_for_expr
1806 (loop, DR_BASE_ADDRESS (dr0));
1807 ivloop = outermost_invariant_loop_for_expr
1808 (loop, DR_BASE_ADDRESS (dr));
1809 if ((ivloop && !ivloop0)
1810 || (ivloop && ivloop0
1811 && flow_loop_nested_p (ivloop, ivloop0)))
1812 dr0 = dr;
1815 one_misalignment_unknown = true;
1817 /* Check for data refs with unsupportable alignment that
1818 can be peeled. */
1819 if (!supportable_dr_alignment)
1821 one_dr_unsupportable = true;
1822 unsupportable_dr = dr;
1825 if (!first_store && DR_IS_WRITE (dr))
1826 first_store = dr;
1829 else
1831 if (!aligned_access_p (dr))
1833 if (dump_enabled_p ())
1834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1835 "vector alignment may not be reachable\n");
1836 break;
1841 /* Check if we can possibly peel the loop. */
1842 if (!vect_can_advance_ivs_p (loop_vinfo)
1843 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1844 || loop->inner)
1845 do_peeling = false;
1847 struct _vect_peel_extended_info peel_for_known_alignment;
1848 struct _vect_peel_extended_info peel_for_unknown_alignment;
1849 struct _vect_peel_extended_info best_peel;
1851 peel_for_unknown_alignment.inside_cost = INT_MAX;
1852 peel_for_unknown_alignment.outside_cost = INT_MAX;
1853 peel_for_unknown_alignment.peel_info.count = 0;
1855 if (do_peeling
1856 && one_misalignment_unknown)
1858 /* Check whether the target prefers stores over loads, i.e., whether
1859 misaligned stores are more expensive than misaligned loads (taking
1860 drs with the same alignment into account). */
1861 unsigned int load_inside_cost = 0;
1862 unsigned int load_outside_cost = 0;
1863 unsigned int store_inside_cost = 0;
1864 unsigned int store_outside_cost = 0;
1865 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1867 stmt_vector_for_cost dummy;
1868 dummy.create (2);
1869 vect_get_peeling_costs_all_drs (datarefs, dr0,
1870 &load_inside_cost,
1871 &load_outside_cost,
1872 &dummy, estimated_npeels, true);
1873 dummy.release ();
1875 if (first_store)
1877 dummy.create (2);
1878 vect_get_peeling_costs_all_drs (datarefs, first_store,
1879 &store_inside_cost,
1880 &store_outside_cost,
1881 &dummy, estimated_npeels, true);
1882 dummy.release ();
1884 else
1886 store_inside_cost = INT_MAX;
1887 store_outside_cost = INT_MAX;
1890 if (load_inside_cost > store_inside_cost
1891 || (load_inside_cost == store_inside_cost
1892 && load_outside_cost > store_outside_cost))
1894 dr0 = first_store;
1895 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1896 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1898 else
1900 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1901 peel_for_unknown_alignment.outside_cost = load_outside_cost;
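/* With hypothetical costs: if peeling for the first store gives
   inside cost 8 while peeling for the load DR0 gives 10, DR0 is
   switched to the store above; the outside costs only break ties
   between equal inside costs.  */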
1904 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1905 prologue_cost_vec.create (2);
1906 epilogue_cost_vec.create (2);
1908 int dummy2;
1909 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1910 (loop_vinfo, estimated_npeels, &dummy2,
1911 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1912 &prologue_cost_vec, &epilogue_cost_vec);
1914 prologue_cost_vec.release ();
1915 epilogue_cost_vec.release ();
1917 peel_for_unknown_alignment.peel_info.count = 1
1918 + STMT_VINFO_SAME_ALIGN_REFS
1919 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1922 peel_for_unknown_alignment.peel_info.npeel = 0;
1923 peel_for_unknown_alignment.peel_info.dr = dr0;
1925 best_peel = peel_for_unknown_alignment;
1927 peel_for_known_alignment.inside_cost = INT_MAX;
1928 peel_for_known_alignment.outside_cost = INT_MAX;
1929 peel_for_known_alignment.peel_info.count = 0;
1930 peel_for_known_alignment.peel_info.dr = NULL;
1932 if (do_peeling && one_misalignment_known)
1934 /* Peeling is possible, but no data access requires alignment in order
1935 to be supported. So we try to choose the best possible peeling from
1936 the hash table. */
1937 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1938 (&peeling_htab, loop_vinfo);
1941 /* Compare costs of peeling for known and unknown alignment. */
1942 if (peel_for_known_alignment.peel_info.dr != NULL
1943 && peel_for_unknown_alignment.inside_cost
1944 >= peel_for_known_alignment.inside_cost)
1946 best_peel = peel_for_known_alignment;
1948 /* If the best peeling for known alignment has NPEEL == 0, perform no
1949 peeling at all except if there is an unsupportable dr that we can
1950 align. */
1951 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1952 do_peeling = false;
1955 /* If there is an unsupportable data ref, prefer it over all choices so far,
1956 since we would have to discard a chosen peeling anyway unless it happened
1957 to align the unsupportable data ref. */
1958 if (one_dr_unsupportable)
1959 dr0 = unsupportable_dr;
1960 else if (do_peeling)
1962 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1963 TODO: Use nopeel_outside_cost or get rid of it? */
1964 unsigned nopeel_inside_cost = 0;
1965 unsigned nopeel_outside_cost = 0;
1967 stmt_vector_for_cost dummy;
1968 dummy.create (2);
1969 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1970 &nopeel_outside_cost, &dummy, 0, false);
1971 dummy.release ();
1973 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1974 costs will be recorded. */
1975 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1976 prologue_cost_vec.create (2);
1977 epilogue_cost_vec.create (2);
1979 int dummy2;
1980 nopeel_outside_cost += vect_get_known_peeling_cost
1981 (loop_vinfo, 0, &dummy2,
1982 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1983 &prologue_cost_vec, &epilogue_cost_vec);
1985 prologue_cost_vec.release ();
1986 epilogue_cost_vec.release ();
1988 npeel = best_peel.peel_info.npeel;
1989 dr0 = best_peel.peel_info.dr;
1991 /* If leaving the loop unpeeled is no more expensive than the best
1992 peeling found so far, don't perform any peeling. */
1993 if (nopeel_inside_cost <= best_peel.inside_cost)
1994 do_peeling = false;
1997 if (do_peeling)
1999 stmt = DR_STMT (dr0);
2000 stmt_info = vinfo_for_stmt (stmt);
2001 vectype = STMT_VINFO_VECTYPE (stmt_info);
2003 if (known_alignment_for_access_p (dr0))
2005 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2006 size_zero_node) < 0;
2007 if (!npeel)
2009 /* Since it's known at compile time, compute the number of
2010 iterations in the peeled loop (the peeling factor) for use in
2011 updating DR_MISALIGNMENT values. The peeling factor is the
2012 target alignment minus the misalignment, expressed as an
2013 element count. */
2014 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2015 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2016 npeel = ((mis & (target_align - 1))
2017 / vect_get_scalar_dr_size (dr0));
2020 /* For interleaved data access every iteration accesses all the
2021 members of the group, therefore we divide the number of iterations
2022 by the group size. */
2023 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2024 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2025 npeel /= GROUP_SIZE (stmt_info);
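/* Illustrative numbers: with 4-byte elements, a target alignment of
   16 bytes and a misalignment of 8 bytes, the element-count peel is
   (16 - 8) / 4 == 2; for an interleaving group of size 2 that is
   NPEEL == 1 loop iteration, since every iteration accesses both
   group members.  */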
2027 if (dump_enabled_p ())
2028 dump_printf_loc (MSG_NOTE, vect_location,
2029 "Try peeling by %d\n", npeel);
2032 /* Ensure that all datarefs can be vectorized after the peel. */
2033 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2034 do_peeling = false;
2036 /* Check if all datarefs are supportable and log. */
2037 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2039 stat = vect_verify_datarefs_alignment (loop_vinfo);
2040 if (!stat)
2041 do_peeling = false;
2042 else
2043 return stat;
2046 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2047 if (do_peeling)
2049 unsigned max_allowed_peel
2050 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2051 if (max_allowed_peel != (unsigned)-1)
2053 unsigned max_peel = npeel;
2054 if (max_peel == 0)
2056 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2057 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2059 if (max_peel > max_allowed_peel)
2061 do_peeling = false;
2062 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "Disable peeling, max peels reached: %d\n", max_peel);
2069 /* Cost model #2 - if peeling may result in a remaining loop not
2070 iterating enough to be vectorized then do not peel. Since this
2071 is a cost heuristic rather than a correctness decision, use the
2072 most likely runtime value for variable vectorization factors. */
2073 if (do_peeling
2074 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2076 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2077 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2078 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2079 < assumed_vf + max_peel)
2080 do_peeling = false;
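/* Illustrative numbers: with ASSUMED_VF == 4 and NPEEL == 0 we get
   MAX_PEEL == 3, so a loop known to iterate 6 times fails the test
   (6 < 4 + 3) and peeling is abandoned, since the remaining loop
   could not be guaranteed a full vector iteration.  */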
2083 if (do_peeling)
2085 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2086 If the misalignment of DR_i is identical to that of dr0 then set
2087 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2088 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2089 by the peeling factor times the element size of DR_i (MOD the
2090 vectorization factor times the size). Otherwise, the
2091 misalignment of DR_i must be set to unknown. */
2092 FOR_EACH_VEC_ELT (datarefs, i, dr)
2093 if (dr != dr0)
2095 /* Strided accesses perform only component accesses, alignment
2096 is irrelevant for them. */
2097 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2098 if (STMT_VINFO_STRIDED_P (stmt_info)
2099 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2100 continue;
2102 vect_update_misalignment_for_peel (dr, dr0, npeel);
2105 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2106 if (npeel)
2107 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2108 else
2109 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2110 = DR_MISALIGNMENT (dr0);
2111 SET_DR_MISALIGNMENT (dr0, 0);
2112 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Alignment of access forced using peeling.\n");
2116 dump_printf_loc (MSG_NOTE, vect_location,
2117 "Peeling for alignment will be applied.\n");
2120 /* The inside-loop cost will be accounted for in vectorizable_load
2121 and vectorizable_store correctly with adjusted alignments.
2122 Drop the body_cost_vec on the floor here. */
2123 stat = vect_verify_datarefs_alignment (loop_vinfo);
2124 gcc_assert (stat);
2125 return stat;
2129 /* (2) Versioning to force alignment. */
2131 /* Try versioning if:
2132 1) optimize loop for speed
2133 2) there is at least one unsupported misaligned data ref with an unknown
2134 misalignment, and
2135 3) all misaligned data refs with a known misalignment are supported, and
2136 4) the number of runtime alignment checks is within reason. */
2138 do_versioning =
2139 optimize_loop_nest_for_speed_p (loop)
2140 && (!loop->inner); /* FORNOW */
2142 if (do_versioning)
2144 FOR_EACH_VEC_ELT (datarefs, i, dr)
2146 stmt = DR_STMT (dr);
2147 stmt_info = vinfo_for_stmt (stmt);
2149 /* For interleaving, only the alignment of the first access
2150 matters. */
2151 if (aligned_access_p (dr)
2152 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2153 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2154 continue;
2156 if (STMT_VINFO_STRIDED_P (stmt_info))
2158 /* Strided loads perform only component accesses, alignment is
2159 irrelevant for them. */
2160 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2161 continue;
2162 do_versioning = false;
2163 break;
2166 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2168 if (!supportable_dr_alignment)
2170 gimple *stmt;
2171 int mask;
2172 tree vectype;
2174 if (known_alignment_for_access_p (dr)
2175 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2176 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2178 do_versioning = false;
2179 break;
2182 stmt = DR_STMT (dr);
2183 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2184 gcc_assert (vectype);
2186 /* At present we don't support versioning for alignment
2187 with variable VF, since there's no guarantee that the
2188 VF is a power of two. We could relax this if we added
2189 a way of enforcing a power-of-two size. */
2190 unsigned HOST_WIDE_INT size;
2191 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2193 do_versioning = false;
2194 break;
2197 /* The rightmost bits of an aligned address must be zeros.
2198 Construct the mask needed for this test. For example,
2199 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2200 mask must be 15 = 0xf. */
2201 mask = size - 1;
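/* The run-time test that versioning finally emits is morally
   equivalent to the following sketch (simplified to a single
   hypothetical pointer PTR; the real check ORs the relevant
   addresses together before applying the mask):

       if (((uintptr_t) ptr & 15) == 0)
         ... run the vectorized, alignment-assuming loop ...
       else
         ... run the unvectorized fallback ...

   where 15 is the MASK computed above for a 16-byte vector mode.  */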
2203 /* FORNOW: use the same mask to test all potentially unaligned
2204 references in the loop. The vectorizer currently supports
2205 a single vector size, see the reference to
2206 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2207 vectorization factor is computed. */
2208 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2209 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2210 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2211 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2212 DR_STMT (dr));
2216 /* Versioning requires at least one misaligned data reference. */
2217 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2218 do_versioning = false;
2219 else if (!do_versioning)
2220 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2223 if (do_versioning)
2225 vec<gimple *> may_misalign_stmts
2226 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2227 gimple *stmt;
2229 /* It can now be assumed that the data references in the statements
2230 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2231 of the loop being vectorized. */
2232 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2234 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2235 dr = STMT_VINFO_DATA_REF (stmt_info);
2236 SET_DR_MISALIGNMENT (dr, 0);
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "Alignment of access forced using versioning.\n");
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_NOTE, vect_location,
2244 "Versioning for alignment will be applied.\n");
2246 /* Peeling and versioning can't be done together at this time. */
2247 gcc_assert (! (do_peeling && do_versioning));
2249 stat = vect_verify_datarefs_alignment (loop_vinfo);
2250 gcc_assert (stat);
2251 return stat;
2254 /* This point is reached if neither peeling nor versioning is being done. */
2255 gcc_assert (! (do_peeling || do_versioning));
2257 stat = vect_verify_datarefs_alignment (loop_vinfo);
2258 return stat;
2262 /* Function vect_find_same_alignment_drs.
2264 Update group and alignment relations according to the chosen
2265 vectorization factor. */
2267 static void
2268 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2270 struct data_reference *dra = DDR_A (ddr);
2271 struct data_reference *drb = DDR_B (ddr);
2272 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2273 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2275 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2276 return;
2278 if (dra == drb)
2279 return;
2281 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2282 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2283 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2284 return;
2286 /* Two references with distance zero have the same alignment. */
2287 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2288 - wi::to_poly_offset (DR_INIT (drb)));
2289 if (maybe_ne (diff, 0))
2291 /* Get the wider of the two alignments. */
2292 unsigned int align_a = (vect_calculate_target_alignment (dra)
2293 / BITS_PER_UNIT);
2294 unsigned int align_b = (vect_calculate_target_alignment (drb)
2295 / BITS_PER_UNIT);
2296 unsigned int max_align = MAX (align_a, align_b);
2298 /* Require the gap to be a multiple of the larger vector alignment. */
2299 if (!multiple_p (diff, max_align))
2300 return;
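/* Illustrative numbers: two accesses off the same base with DR_INITs
   0 and 64 and a common target alignment of 16 bytes pass this test
   (64 is a multiple of 16) and record each other below; with DR_INITs
   0 and 20 the gap is not a multiple of 16 and nothing is recorded.  */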
2303 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2304 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2305 if (dump_enabled_p ())
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "accesses have the same alignment: ");
2309 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2310 dump_printf (MSG_NOTE, " and ");
2311 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2312 dump_printf (MSG_NOTE, "\n");
2317 /* Function vect_analyze_data_refs_alignment
2319 Analyze the alignment of the data-references in the loop.
2320 Return FALSE if a data reference is found that cannot be vectorized. */
2322 bool
2323 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2325 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_NOTE, vect_location,
2327 "=== vect_analyze_data_refs_alignment ===\n");
2329 /* Mark groups of data references with same alignment using
2330 data dependence information. */
2331 vec<ddr_p> ddrs = vinfo->ddrs;
2332 struct data_dependence_relation *ddr;
2333 unsigned int i;
2335 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2336 vect_find_same_alignment_drs (ddr);
2338 vec<data_reference_p> datarefs = vinfo->datarefs;
2339 struct data_reference *dr;
2341 vect_record_base_alignments (vinfo);
2342 FOR_EACH_VEC_ELT (datarefs, i, dr)
2344 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2345 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2346 && !vect_compute_data_ref_alignment (dr))
2348 /* Strided accesses perform only component accesses, misalignment
2349 information is irrelevant for them. */
2350 if (STMT_VINFO_STRIDED_P (stmt_info)
2351 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2352 continue;
2354 if (dump_enabled_p ())
2355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2356 "not vectorized: can't calculate alignment "
2357 "for data ref.\n");
2359 return false;
2363 return true;
2367 /* Analyze alignment of DRs of stmts in NODE. */
2369 static bool
2370 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2372 /* We vectorize from the first scalar stmt in the node unless
2373 the node is permuted in which case we start from the first
2374 element in the group. */
2375 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2376 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2377 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2378 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2380 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2381 if (! vect_compute_data_ref_alignment (dr)
2382 /* For creating the data-ref pointer we need alignment of the
2383 first element anyway. */
2384 || (dr != first_dr
2385 && ! vect_compute_data_ref_alignment (first_dr))
2386 || ! verify_data_ref_alignment (dr))
2388 if (dump_enabled_p ())
2389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2390 "not vectorized: bad data alignment in basic "
2391 "block.\n");
2392 return false;
2395 return true;
2398 /* Function vect_slp_analyze_and_verify_instance_alignment
2400 Analyze the alignment of the data-references in the SLP instance.
2401 Return FALSE if a data reference is found that cannot be vectorized. */
2403 bool
2404 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_NOTE, vect_location,
2408 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2410 slp_tree node;
2411 unsigned i;
2412 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2413 if (! vect_slp_analyze_and_verify_node_alignment (node))
2414 return false;
2416 node = SLP_INSTANCE_TREE (instance);
2417 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2418 && ! vect_slp_analyze_and_verify_node_alignment
2419 (SLP_INSTANCE_TREE (instance)))
2420 return false;
2422 return true;
2426 /* Analyze groups of accesses: check that DR belongs to a group of
2427 accesses of legal size, step, etc. Detect gaps, single element
2428 interleaving, and other special cases. Set grouped access info.
2429 Collect groups of strided stores for further use in SLP analysis.
2430 Worker for vect_analyze_group_access. */
2432 static bool
2433 vect_analyze_group_access_1 (struct data_reference *dr)
2435 tree step = DR_STEP (dr);
2436 tree scalar_type = TREE_TYPE (DR_REF (dr));
2437 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2438 gimple *stmt = DR_STMT (dr);
2439 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2440 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2441 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2442 HOST_WIDE_INT dr_step = -1;
2443 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2444 bool slp_impossible = false;
2446 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2447 size of the interleaving group (including gaps). */
2448 if (tree_fits_shwi_p (step))
2450 dr_step = tree_to_shwi (step);
2451 /* Check that STEP is a multiple of type size. Otherwise there is
2452 a non-element-sized gap at the end of the group which we
2453 cannot represent in GROUP_GAP or GROUP_SIZE.
2454 ??? As we can handle non-constant step fine here we should
2455 simply remove uses of GROUP_GAP between the last and first
2456 element and instead rely on DR_STEP. GROUP_SIZE then would
2457 simply not include that gap. */
2458 if ((dr_step % type_size) != 0)
2460 if (dump_enabled_p ())
2462 dump_printf_loc (MSG_NOTE, vect_location,
2463 "Step ");
2464 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2465 dump_printf (MSG_NOTE,
2466 " is not a multiple of the element size for ");
2467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2468 dump_printf (MSG_NOTE, "\n");
2470 return false;
2472 groupsize = absu_hwi (dr_step) / type_size;
2474 else
2475 groupsize = 0;
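/* Illustrative numbers: DR_STEP == 32 bytes over 8-byte elements
   gives GROUPSIZE == 4, i.e. each scalar iteration advances over a
   slot of four elements, some of which may be gaps.  */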
2477 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2478 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2480 /* Check whether this DR is part of an interleaving group and is a
2481 single element of the group that is accessed in the loop. */
2483 /* Gaps are supported only for loads. STEP must be a multiple of the type
2484 size. */
2485 if (DR_IS_READ (dr)
2486 && (dr_step % type_size) == 0
2487 && groupsize > 0)
2489 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2490 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2491 GROUP_GAP (stmt_info) = groupsize - 1;
2492 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_NOTE, vect_location,
2495 "Detected single element interleaving ");
2496 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2497 dump_printf (MSG_NOTE, " step ");
2498 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2499 dump_printf (MSG_NOTE, "\n");
2502 return true;
2505 if (dump_enabled_p ())
2507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2508 "not consecutive access ");
2509 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2512 if (bb_vinfo)
2514 /* Mark the statement as unvectorizable. */
2515 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2516 return true;
2519 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2520 STMT_VINFO_STRIDED_P (stmt_info) = true;
2521 return true;
2524 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2526 /* First stmt in the interleaving chain. Check the chain. */
2527 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2528 struct data_reference *data_ref = dr;
2529 unsigned int count = 1;
2530 tree prev_init = DR_INIT (data_ref);
2531 gimple *prev = stmt;
2532 HOST_WIDE_INT diff, gaps = 0;
2534 /* By construction, all group members have INTEGER_CST DR_INITs. */
2535 while (next)
2537 /* Skip same data-refs. In case two or more stmts share a
2538 data-ref (supported only for loads), we vectorize only the first
2539 stmt, and the rest get their vectorized loads from the first
2540 one. */
2541 if (!tree_int_cst_compare (DR_INIT (data_ref),
2542 DR_INIT (STMT_VINFO_DATA_REF (
2543 vinfo_for_stmt (next)))))
2545 if (DR_IS_WRITE (data_ref))
2547 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2549 "Two store stmts share the same dr.\n");
2550 return false;
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "Two or more load stmts share the same dr.\n");
2557 /* For load use the same data-ref load. */
2558 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2560 prev = next;
2561 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2562 continue;
2565 prev = next;
2566 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2568 /* All group members have the same STEP by construction. */
2569 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2571 /* Check that the distance between two accesses is equal to the type
2572 size. Otherwise, we have gaps. */
2573 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2574 - TREE_INT_CST_LOW (prev_init)) / type_size;
2575 if (diff != 1)
2577 /* FORNOW: SLP of accesses with gaps is not supported. */
2578 slp_impossible = true;
2579 if (DR_IS_WRITE (data_ref))
2581 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2583 "interleaved store with gaps\n");
2584 return false;
2587 gaps += diff - 1;
2590 last_accessed_element += diff;
2592 /* Store the gap from the previous member of the group. If there is no
2593 gap in the access, GROUP_GAP is always 1. */
2594 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2596 prev_init = DR_INIT (data_ref);
2597 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2598 /* Count the number of data-refs in the chain. */
2599 count++;
2602 if (groupsize == 0)
2603 groupsize = count + gaps;
2605 /* This could be UINT_MAX but as we are generating code in a very
2606 inefficient way we have to cap earlier. See PR78699 for example. */
2607 if (groupsize > 4096)
2609 if (dump_enabled_p ())
2610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2611 "group is too large\n");
2612 return false;
2615 /* Check that the size of the interleaving is equal to count for stores,
2616 i.e., that there are no gaps. */
2617 if (groupsize != count
2618 && !DR_IS_READ (dr))
2620 if (dump_enabled_p ())
2621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2622 "interleaved store with gaps\n");
2623 return false;
2626 /* If there is a gap after the last load in the group it is the
2627 difference between the groupsize and the last accessed
2628 element.
2629 When there is no gap, this difference should be 0. */
2630 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
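/* Illustrative numbers: 4-byte loads at DR_INITs 0 and 8 with
   DR_STEP == 16 give GROUPSIZE == 4 and COUNT == 2.  The second
   member gets GROUP_GAP == 2 (one element skipped before it), and
   the first member gets GROUP_GAP == 4 - 3 == 1 for the element
   left unread at the end of the group.  */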
2632 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2633 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_NOTE, vect_location,
2636 "Detected interleaving ");
2637 if (DR_IS_READ (dr))
2638 dump_printf (MSG_NOTE, "load ");
2639 else
2640 dump_printf (MSG_NOTE, "store ");
2641 dump_printf (MSG_NOTE, "of size %u starting with ",
2642 (unsigned)groupsize);
2643 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2644 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2645 dump_printf_loc (MSG_NOTE, vect_location,
2646 "There is a gap of %u elements after the group\n",
2647 GROUP_GAP (vinfo_for_stmt (stmt)));
2650 /* SLP: create an SLP data structure for every interleaving group of
2651 stores for further analysis in vect_analyze_slp. */
2652 if (DR_IS_WRITE (dr) && !slp_impossible)
2654 if (loop_vinfo)
2655 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2656 if (bb_vinfo)
2657 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2661 return true;
2664 /* Analyze groups of accesses: check that DR belongs to a group of
2665 accesses of legal size, step, etc. Detect gaps, single element
2666 interleaving, and other special cases. Set grouped access info.
2667 Collect groups of strided stores for further use in SLP analysis. */
2669 static bool
2670 vect_analyze_group_access (struct data_reference *dr)
2672 if (!vect_analyze_group_access_1 (dr))
2674 /* Dissolve the group if present. */
2675 gimple *next;
2676 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2677 while (stmt)
2679 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2680 next = GROUP_NEXT_ELEMENT (vinfo);
2681 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2682 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2683 stmt = next;
2685 return false;
2687 return true;
2690 /* Analyze the access pattern of the data-reference DR.
2691 In case of non-consecutive accesses call vect_analyze_group_access() to
2692 analyze groups of accesses. */
2694 static bool
2695 vect_analyze_data_ref_access (struct data_reference *dr)
2697 tree step = DR_STEP (dr);
2698 tree scalar_type = TREE_TYPE (DR_REF (dr));
2699 gimple *stmt = DR_STMT (dr);
2700 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2702 struct loop *loop = NULL;
2704 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2705 return true;
2707 if (loop_vinfo)
2708 loop = LOOP_VINFO_LOOP (loop_vinfo);
2710 if (loop_vinfo && !step)
2712 if (dump_enabled_p ())
2713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2714 "bad data-ref access in loop\n");
2715 return false;
2718 /* Allow loads with zero step in inner-loop vectorization. */
2719 if (loop_vinfo && integer_zerop (step))
2721 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2722 if (!nested_in_vect_loop_p (loop, stmt))
2723 return DR_IS_READ (dr);
2724 /* Allow references with zero step only for outer loops marked
2725 with pragma omp simd - the pragma guarantees absence of
2726 loop-carried dependencies between inner loop iterations. */
2727 if (loop->safelen < 2)
2729 if (dump_enabled_p ())
2730 dump_printf_loc (MSG_NOTE, vect_location,
2731 "zero step in inner loop of nest\n");
2732 return false;
2736 if (loop && nested_in_vect_loop_p (loop, stmt))
2738 /* Interleaved accesses are not yet supported within outer-loop
2739 vectorization for references in the inner-loop. */
2740 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2742 /* For the rest of the analysis we use the outer-loop step. */
2743 step = STMT_VINFO_DR_STEP (stmt_info);
2744 if (integer_zerop (step))
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_NOTE, vect_location,
2748 "zero step in outer loop.\n");
2749 return DR_IS_READ (dr);
2753 /* Consecutive? */
2754 if (TREE_CODE (step) == INTEGER_CST)
2756 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2757 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2758 || (dr_step < 0
2759 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2761 /* Mark that it is not interleaving. */
2762 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2763 return true;
2767 if (loop && nested_in_vect_loop_p (loop, stmt))
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_NOTE, vect_location,
2771 "grouped access in outer loop.\n");
2772 return false;
2776 /* Assume this is a DR handled by the non-constant strided load case. */
2777 if (TREE_CODE (step) != INTEGER_CST)
2778 return (STMT_VINFO_STRIDED_P (stmt_info)
2779 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2780 || vect_analyze_group_access (dr)));
2782 /* Not a consecutive access - check whether it's part of an interleaving group. */
2783 return vect_analyze_group_access (dr);
2786 /* Compare two data-references DRA and DRB so that they can be sorted
2787 into chunks suitable for grouping. */
2789 static int
2790 dr_group_sort_cmp (const void *dra_, const void *drb_)
2792 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2793 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2794 int cmp;
2796 /* Stabilize sort. */
2797 if (dra == drb)
2798 return 0;
2800 /* DRs in different loops never belong to the same group. */
2801 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2802 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2803 if (loopa != loopb)
2804 return loopa->num < loopb->num ? -1 : 1;
2806 /* Ordering of DRs according to base. */
2807 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2808 DR_BASE_ADDRESS (drb));
2809 if (cmp != 0)
2810 return cmp;
2812 /* And according to DR_OFFSET. */
2813 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2814 if (cmp != 0)
2815 return cmp;
2817 /* Put reads before writes. */
2818 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2819 return DR_IS_READ (dra) ? -1 : 1;
2821 /* Then sort by access size. */
2822 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2823 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2824 if (cmp != 0)
2825 return cmp;
2827 /* And by step. */
2828 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2829 if (cmp != 0)
2830 return cmp;
2832 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2833 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2834 if (cmp == 0)
2835 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2836 return cmp;
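/* As an illustration (hypothetical refs, not from this file): for
   loads a[i], b[i] and a[i+1] in the same loop, the two accesses to
   A share base, offset, size and step and therefore end up adjacent,
   ordered by DR_INIT, while B sorts elsewhere; the final stmt-UID
   tie-break keeps the order stable for identical DRs.  */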
2839 /* If OP is the result of a conversion, return the unconverted value,
2840 otherwise return null. */
2842 static tree
2843 strip_conversion (tree op)
2845 if (TREE_CODE (op) != SSA_NAME)
2846 return NULL_TREE;
2847 gimple *stmt = SSA_NAME_DEF_STMT (op);
2848 if (!is_gimple_assign (stmt)
2849 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2850 return NULL_TREE;
2851 return gimple_assign_rhs1 (stmt);
2854 /* Return true if vectorizable_* routines can handle statements STMT1
2855 and STMT2 being in a single group. */
2857 static bool
2858 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2860 if (gimple_assign_single_p (stmt1))
2861 return gimple_assign_single_p (stmt2);
2863 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2865 /* Check for two masked loads or two masked stores. */
2866 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2867 return false;
2868 internal_fn ifn = gimple_call_internal_fn (stmt1);
2869 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2870 return false;
2871 if (ifn != gimple_call_internal_fn (stmt2))
2872 return false;
2874 /* Check that the masks are the same. Cope with casts of masks,
2875 like those created by build_mask_conversion. */
2876 tree mask1 = gimple_call_arg (stmt1, 2);
2877 tree mask2 = gimple_call_arg (stmt2, 2);
2878 if (!operand_equal_p (mask1, mask2, 0))
2880 mask1 = strip_conversion (mask1);
2881 if (!mask1)
2882 return false;
2883 mask2 = strip_conversion (mask2);
2884 if (!mask2)
2885 return false;
2886 if (!operand_equal_p (mask1, mask2, 0))
2887 return false;
2889 return true;
2892 return false;
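/* As a schematic example (hypothetical statements, not a real dump):

       _1 = .MASK_LOAD (addr_1, align_1, mask_5);
       _2 = .MASK_LOAD (addr_2, align_2, mask_5);

   can be grouped because both are the same internal function and
   their masks compare operand_equal_p, whereas mixing either of them
   with a plain assignment load is rejected above.  */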
2895 /* Function vect_analyze_data_ref_accesses.
2897 Analyze the access pattern of all the data references in the loop.
2899 FORNOW: the only access pattern that is considered vectorizable is a
2900 simple step 1 (consecutive) access.
2902 FORNOW: handle only arrays and pointer accesses. */
2904 bool
2905 vect_analyze_data_ref_accesses (vec_info *vinfo)
2907 unsigned int i;
2908 vec<data_reference_p> datarefs = vinfo->datarefs;
2909 struct data_reference *dr;
2911 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_NOTE, vect_location,
2913 "=== vect_analyze_data_ref_accesses ===\n");
2915 if (datarefs.is_empty ())
2916 return true;
2918 /* Sort the array of datarefs to make building the interleaving chains
2919 linear. Don't modify the original vector's order; it is needed for
2920 determining which dependencies are reversed. */
2921 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2922 datarefs_copy.qsort (dr_group_sort_cmp);
2924 /* Build the interleaving chains. */
2925 for (i = 0; i < datarefs_copy.length () - 1;)
2927 data_reference_p dra = datarefs_copy[i];
2928 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2929 stmt_vec_info lastinfo = NULL;
2930 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2931 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2933 ++i;
2934 continue;
2936 for (i = i + 1; i < datarefs_copy.length (); ++i)
2938 data_reference_p drb = datarefs_copy[i];
2939 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2940 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2941 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2942 break;
2944 /* ??? Imperfect sorting (non-compatible types, non-modulo
2945 accesses, same accesses) can lead to a group being artificially
2946 split here as we don't just skip over those. If it really
2947 matters we can push those to a worklist and re-iterate
2948 over them. Then we can just skip ahead to the next DR here. */
2950 /* DRs in different loops should not be put into the same
2951 interleaving group. */
2952 if (gimple_bb (DR_STMT (dra))->loop_father
2953 != gimple_bb (DR_STMT (drb))->loop_father)
2954 break;
2956 /* Check that the data-refs have the same first location (except init)
2957 and that both are loads or both are stores, grouped compatibly:
2958 masked accesses only group with masked accesses using the same mask. */
2959 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2960 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2961 DR_BASE_ADDRESS (drb)) != 0
2962 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2963 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2964 break;
2966 /* Check that the data-refs have the same constant size. */
2967 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2968 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2969 if (!tree_fits_uhwi_p (sza)
2970 || !tree_fits_uhwi_p (szb)
2971 || !tree_int_cst_equal (sza, szb))
2972 break;
2974 /* Check that the data-refs have the same step. */
2975 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2976 break;
2978 /* Check the types are compatible.
2979 ??? We don't distinguish this during sorting. */
2980 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2981 TREE_TYPE (DR_REF (drb))))
2982 break;
2984 /* Check that the DR_INITs are compile-time constants. */
2985 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2986 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2987 break;
2989 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2990 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2991 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2992 HOST_WIDE_INT init_prev
2993 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2994 gcc_assert (init_a <= init_b
2995 && init_a <= init_prev
2996 && init_prev <= init_b);
2998 /* Do not place the same access in the interleaving chain twice. */
2999 if (init_b == init_prev)
3001 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3002 < gimple_uid (DR_STMT (drb)));
3003 /* ??? For now we simply "drop" the later reference which is
3004 otherwise the same rather than finishing off this group.
3005 In the end we'd want to re-process duplicates forming
3006 multiple groups from the refs, likely by just collecting
3007 all candidates (including duplicates and split points
3008 below) in a vector and then process them together. */
3009 continue;
3012 /* If init_b == init_a + the size of the type * k, we have an
3013 interleaving, and DRA is accessed before DRB. */
3014 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3015 if (type_size_a == 0
3016 || (init_b - init_a) % type_size_a != 0)
3017 break;
3019 /* For stores, require the accesses to be adjacent. This splits
3020 groups into chunks we support (we don't support vectorization
3021 of stores with gaps). */
3022 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3023 break;
3025 /* If the step (when constant and nonzero) is no greater than the
3026 difference between the data-refs' inits, DRB falls into a later
3027 step period; break here so groups are split into suitable sizes. */
3028 if (tree_fits_shwi_p (DR_STEP (dra)))
3030 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3031 if (step != 0 && step <= (init_b - init_a))
3032 break;
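/* Illustrative numbers: 4-byte accesses with DR_STEP == 8 and
   DR_INITs 0 and 4 keep building the group (4 < 8, an {a[2i],
   a[2i+1]}-style pair), whereas a reference with init 8 satisfies
   step <= init_b - init_a and starts a new chain.  */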
3035 if (dump_enabled_p ())
3037 dump_printf_loc (MSG_NOTE, vect_location,
3038 "Detected interleaving ");
3039 if (DR_IS_READ (dra))
3040 dump_printf (MSG_NOTE, "load ");
3041 else
3042 dump_printf (MSG_NOTE, "store ");
3043 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3044 dump_printf (MSG_NOTE, " and ");
3045 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3046 dump_printf (MSG_NOTE, "\n");
3049 /* Link the found element into the group list. */
3050 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3052 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3053 lastinfo = stmtinfo_a;
3055 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3056 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3057 lastinfo = stmtinfo_b;
3061 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3062 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3063 && !vect_analyze_data_ref_access (dr))
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3067 "not vectorized: complicated access pattern.\n");
3069 if (is_a <bb_vec_info> (vinfo))
3071 /* Mark the statement as not vectorizable. */
3072 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3073 continue;
3075 else
3077 datarefs_copy.release ();
3078 return false;
3082 datarefs_copy.release ();
3083 return true;
3086 /* Function vect_vfa_segment_size.
3088 Input:
3089 DR: The data reference.
3090 LENGTH_FACTOR: segment length to consider.
3092 Return a value suitable for the dr_with_seg_len::seg_len field.
3093 This is the "distance travelled" by the pointer from the first
3094 iteration in the segment to the last. Note that it does not include
3095 the size of the access; in effect it only describes the first byte. */
3097 static tree
3098 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3100 length_factor = size_binop (MINUS_EXPR,
3101 fold_convert (sizetype, length_factor),
3102 size_one_node);
3103 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3104 length_factor);
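/* Illustrative numbers: DR_STEP == 4 and LENGTH_FACTOR == 8 (say the
   vectorization factor) give a segment length of (8 - 1) * 4 == 28
   bytes - the distance the pointer travels from the first to the
   last iteration covered, not counting the access size itself.  */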
3107 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3108 gives the worst-case number of bytes covered by the segment. */
3110 static unsigned HOST_WIDE_INT
3111 vect_vfa_access_size (data_reference *dr)
3113 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3114 tree ref_type = TREE_TYPE (DR_REF (dr));
3115 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3116 unsigned HOST_WIDE_INT access_size = ref_size;
3117 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3119 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3120 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3122 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3123 && (vect_supportable_dr_alignment (dr, false)
3124 == dr_explicit_realign_optimized))
3126 /* We might access a full vector's worth. */
3127 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3128 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3130 return access_size;
3133 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3135 static unsigned int
3136 vect_vfa_align (const data_reference *dr)
3138 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3141 /* Function vect_compile_time_alias.
3143 Given data references A and B with equal base and offset, see whether
3144 the alias relation can be decided at compilation time. Return 1 if
3145 it can and the references alias, 0 if it can and the references do
3146 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3147 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3148 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3150 static int
3151 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3152 tree segment_length_a, tree segment_length_b,
3153 unsigned HOST_WIDE_INT access_size_a,
3154 unsigned HOST_WIDE_INT access_size_b)
3156 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3157 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3158 poly_uint64 const_length_a;
3159 poly_uint64 const_length_b;
3161 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3162 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3163 [a, a+12) */
3164 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3166 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3167 offset_a = (offset_a + access_size_a) - const_length_a;
3169 else
3170 const_length_a = tree_to_poly_uint64 (segment_length_a);
3171 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3173 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3174 offset_b = (offset_b + access_size_b) - const_length_b;
3176 else
3177 const_length_b = tree_to_poly_uint64 (segment_length_b);
3179 const_length_a += access_size_a;
3180 const_length_b += access_size_b;
3182 if (ranges_known_overlap_p (offset_a, const_length_a,
3183 offset_b, const_length_b))
3184 return 1;
3186 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3187 offset_b, const_length_b))
3188 return 0;
3190 return -1;
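/* Illustrative numbers: positive-step refs with DR_INITs 0 and 32,
   segment lengths 16 and access sizes 4 give the ranges [0, 20) and
   [32, 52); those can be proven disjoint, so 0 is returned and no
   run-time alias check is needed for the pair.  */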
3193 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3194 in DDR is >= VF. */
3196 static bool
3197 dependence_distance_ge_vf (data_dependence_relation *ddr,
3198 unsigned int loop_depth, poly_uint64 vf)
3200 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3201 || DDR_NUM_DIST_VECTS (ddr) == 0)
3202 return false;
3204 /* If the dependence is exact, we should have limited the VF instead. */
3205 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3207 unsigned int i;
3208 lambda_vector dist_v;
3209 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3211 HOST_WIDE_INT dist = dist_v[loop_depth];
3212 if (dist != 0
3213 && !(dist > 0 && DDR_REVERSED_P (ddr))
3214 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3215 return false;
3218 if (dump_enabled_p ())
3220 dump_printf_loc (MSG_NOTE, vect_location,
3221 "dependence distance between ");
3222 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3223 dump_printf (MSG_NOTE, " and ");
3224 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3225 dump_printf (MSG_NOTE, " is >= VF\n");
3228 return true;
3231 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3233 static void
3234 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3236 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3237 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3238 dump_printf (dump_kind, ") >= ");
3239 dump_dec (dump_kind, lower_bound.min_value);
3242 /* Record that the vectorized loop requires the vec_lower_bound described
3243 by EXPR, UNSIGNED_P and MIN_VALUE. */
3245 static void
3246 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3247 poly_uint64 min_value)
3249 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3250 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3251 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3253 unsigned_p &= lower_bounds[i].unsigned_p;
3254 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3255 if (lower_bounds[i].unsigned_p != unsigned_p
3256 || maybe_lt (lower_bounds[i].min_value, min_value))
3258 lower_bounds[i].unsigned_p = unsigned_p;
3259 lower_bounds[i].min_value = min_value;
3260 if (dump_enabled_p ())
3262 dump_printf_loc (MSG_NOTE, vect_location,
3263 "updating run-time check to ");
3264 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3265 dump_printf (MSG_NOTE, "\n");
3268 return;
3271 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3272 if (dump_enabled_p ())
3274 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3275 dump_lower_bound (MSG_NOTE, lower_bound);
3276 dump_printf (MSG_NOTE, "\n");
3278 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3281 /* Return true if it's unlikely that the step of the vectorized form of DR
3282 will span fewer than GAP bytes. */
3284 static bool
3285 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3287 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3288 HOST_WIDE_INT count
3289 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3290 if (GROUP_FIRST_ELEMENT (stmt_info))
3291 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3292 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3295 /* Return true if we know that there is no alias between DR_A and DR_B
3296 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3297 *LOWER_BOUND_OUT to this N. */
3299 static bool
3300 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3301 poly_uint64 *lower_bound_out)
3303 /* Check that there is a constant gap of known sign between DR_A
3304 and DR_B. */
3305 poly_int64 init_a, init_b;
3306 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3307 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3308 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3309 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3310 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3311 || !ordered_p (init_a, init_b))
3312 return false;
3314 /* Sort DR_A and DR_B by the address they access. */
3315 if (maybe_lt (init_b, init_a))
3317 std::swap (init_a, init_b);
3318 std::swap (dr_a, dr_b);
3321 /* If the two accesses could be dependent within a scalar iteration,
3322 make sure that we'd retain their order. */
3323 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3324 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3325 return false;
3327 /* There is no alias if abs (DR_STEP) is greater than or equal to
3328 the bytes spanned by the combination of the two accesses. */
3329 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3330 return true;
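/* Illustrative numbers: 4-byte refs with the same base, offset and
   step and DR_INITs 0 and 12 give *LOWER_BOUND_OUT == 12 + 4 == 16,
   i.e. the pair cannot alias once abs (DR_STEP) is at least 16
   bytes per scalar iteration.  */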
3333 /* Function vect_prune_runtime_alias_test_list.
3335 Prune a list of ddrs to be tested at run-time by versioning for alias.
3336 Merge several alias checks into one if possible.
3337 Return FALSE if the resulting list of ddrs is longer than allowed by
3338 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3340 bool
3341 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3343 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3344 hash_set <tree_pair_hash> compared_objects;
3346 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3347 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3348 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3349 vec<vec_object_pair> &check_unequal_addrs
3350 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3351 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3352 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3354 ddr_p ddr;
3355 unsigned int i;
3356 tree length_factor;
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "=== vect_prune_runtime_alias_test_list ===\n");
3362 /* Step values are irrelevant for aliasing if the number of vector
3363 iterations is equal to the number of scalar iterations (which can
3364 happen for fully-SLP loops). */
3365 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3367 if (!ignore_step_p)
3369 /* Convert the checks for nonzero steps into bound tests. */
3370 tree value;
3371 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3372 vect_check_lower_bound (loop_vinfo, value, true, 1);
3375 if (may_alias_ddrs.is_empty ())
3376 return true;
3378 comp_alias_ddrs.create (may_alias_ddrs.length ());
3380 unsigned int loop_depth
3381 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3382 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3384 /* First, we collect all data ref pairs for aliasing checks. */
3385 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3387 int comp_res;
3388 poly_uint64 lower_bound;
3389 struct data_reference *dr_a, *dr_b;
3390 gimple *dr_group_first_a, *dr_group_first_b;
3391 tree segment_length_a, segment_length_b;
3392 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3393 unsigned int align_a, align_b;
3394 gimple *stmt_a, *stmt_b;
3396 /* Ignore the alias if the VF we chose ended up being no greater
3397 than the dependence distance. */
3398 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3399 continue;
3401 if (DDR_OBJECT_A (ddr))
3403 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3404 if (!compared_objects.add (new_pair))
3406 if (dump_enabled_p ())
3408 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3409 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3410 dump_printf (MSG_NOTE, " and ");
3411 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3412 dump_printf (MSG_NOTE, " have different addresses\n");
3414 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3416 continue;
3419 dr_a = DDR_A (ddr);
3420 stmt_a = DR_STMT (DDR_A (ddr));
3422 dr_b = DDR_B (ddr);
3423 stmt_b = DR_STMT (DDR_B (ddr));
3425 /* Skip the pair if inter-iteration dependencies are irrelevant
3426 and intra-iteration dependencies are guaranteed to be honored. */
3427 if (ignore_step_p
3428 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3429 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3431 if (dump_enabled_p ())
3433 dump_printf_loc (MSG_NOTE, vect_location,
3434 "no need for alias check between ");
3435 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3436 dump_printf (MSG_NOTE, " and ");
3437 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3438 dump_printf (MSG_NOTE, " when VF is 1\n");
3440 continue;
3443 /* See whether we can handle the alias using a bounds check on
3444 the step, and whether that's likely to be the best approach.
3445 (It might not be, for example, if the minimum step is much larger
3446 than the number of bytes handled by one vector iteration.) */
3447 if (!ignore_step_p
3448 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3449 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3450 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3451 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3453 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3454 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3457 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3458 dump_printf (MSG_NOTE, " and ");
3459 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3460 dump_printf (MSG_NOTE, " when the step ");
3461 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3462 dump_printf (MSG_NOTE, " is outside ");
3463 if (unsigned_p)
3464 dump_printf (MSG_NOTE, "[0");
3465 else
3467 dump_printf (MSG_NOTE, "(");
3468 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3470 dump_printf (MSG_NOTE, ", ");
3471 dump_dec (MSG_NOTE, lower_bound);
3472 dump_printf (MSG_NOTE, ")\n");
3474 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3475 lower_bound);
3476 continue;
3479 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3480 if (dr_group_first_a)
3482 stmt_a = dr_group_first_a;
3483 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3486 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3487 if (dr_group_first_b)
3489 stmt_b = dr_group_first_b;
3490 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3493 if (ignore_step_p)
3495 segment_length_a = size_zero_node;
3496 segment_length_b = size_zero_node;
3498 else
3500 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3501 length_factor = scalar_loop_iters;
3502 else
3503 length_factor = size_int (vect_factor);
3504 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3505 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3507 access_size_a = vect_vfa_access_size (dr_a);
3508 access_size_b = vect_vfa_access_size (dr_b);
3509 align_a = vect_vfa_align (dr_a);
3510 align_b = vect_vfa_align (dr_b);
3512 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3513 DR_BASE_ADDRESS (dr_b));
3514 if (comp_res == 0)
3515 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3516 DR_OFFSET (dr_b));
3518 /* See whether the alias is known at compilation time. */
3519 if (comp_res == 0
3520 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3521 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3522 && poly_int_tree_p (segment_length_a)
3523 && poly_int_tree_p (segment_length_b))
3525 int res = vect_compile_time_alias (dr_a, dr_b,
3526 segment_length_a,
3527 segment_length_b,
3528 access_size_a,
3529 access_size_b);
3530 if (res >= 0 && dump_enabled_p ())
3532 dump_printf_loc (MSG_NOTE, vect_location,
3533 "can tell at compile time that ");
3534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3535 dump_printf (MSG_NOTE, " and ");
3536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3537 if (res == 0)
3538 dump_printf (MSG_NOTE, " do not alias\n");
3539 else
3540 dump_printf (MSG_NOTE, " alias\n");
3543 if (res == 0)
3544 continue;
3546 if (res == 1)
3548 if (dump_enabled_p ())
3549 dump_printf_loc (MSG_NOTE, vect_location,
3550 "not vectorized: compilation time alias.\n");
3551 return false;
3555 dr_with_seg_len_pair_t dr_with_seg_len_pair
3556 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3557 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3559 /* Canonicalize pairs by sorting the two DR members. */
3560 if (comp_res > 0)
3561 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3563 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3566 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3568 unsigned int count = (comp_alias_ddrs.length ()
3569 + check_unequal_addrs.length ());
3571 dump_printf_loc (MSG_NOTE, vect_location,
3572 "improved number of alias checks from %d to %d\n",
3573 may_alias_ddrs.length (), count);
3574 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3576 if (dump_enabled_p ())
3577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3578 "number of versioning for alias "
3579 "run-time tests exceeds %d "
3580 "(--param vect-max-version-for-alias-checks)\n",
3581 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3582 return false;
3585 return true;
3588 /* Check whether we can use an internal function for a gather load
3589 or scatter store. READ_P is true for loads and false for stores.
3590 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3591 the type of the memory elements being loaded or stored. OFFSET_BITS
3592 is the number of bits in each scalar offset and OFFSET_SIGN is the
3593 sign of the offset. SCALE is the amount by which the offset should
3594 be multiplied *after* it has been converted to address width.
3596 Return true if the function is supported, storing the function
3597 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
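/* Illustrative sketch (not part of this file's logic; SSA names are made
   up): with VECTYPE V4SI, MEMORY_TYPE "int", OFFSET_BITS 32, OFFSET_SIGN
   SIGNED and SCALE 4, a successful check would later allow a call like

     vect__1 = IFN_GATHER_LOAD (base_2, offsets_3, 4);

   where element I is loaded from BASE_2 + sign_extend (OFFSETS_3[I]) * 4.
   The argument order (base, offset, scale) is the same one that
   vect_describe_gather_scatter_call below reads back out of such calls.  */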
3599 bool
3600 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3601 tree memory_type, unsigned int offset_bits,
3602 signop offset_sign, int scale,
3603 internal_fn *ifn_out, tree *element_type_out)
3605 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3606 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3607 if (offset_bits > element_bits)
3608 /* Internal functions require the offset to be the same width as
3609 the vector elements. We can extend narrower offsets, but it isn't
3610 safe to truncate wider offsets. */
3611 return false;
3613 if (element_bits != memory_bits)
3614 /* For now the vector elements must be the same width as the
3615 memory elements. */
3616 return false;
3618 /* Work out which function we need. */
3619 internal_fn ifn;
3620 if (read_p)
3621 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3622 else
3623 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3625 /* Test whether the target supports this combination. */
3626 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3627 offset_sign, scale))
3628 return false;
3630 *ifn_out = ifn;
3631 *element_type_out = TREE_TYPE (vectype);
3632 return true;
3635 /* CALL is a call to an internal gather load or scatter store function.
3636 Describe the operation in INFO. */
3638 static void
3639 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3641 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3642 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3643 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3645 info->ifn = gimple_call_internal_fn (call);
3646 info->decl = NULL_TREE;
3647 info->base = gimple_call_arg (call, 0);
3648 info->offset = gimple_call_arg (call, 1);
3649 info->offset_dt = vect_unknown_def_type;
3650 info->offset_vectype = NULL_TREE;
3651 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3652 info->element_type = TREE_TYPE (vectype);
3653 info->memory_type = TREE_TYPE (DR_REF (dr));
3656 /* Return true if a non-affine read or write in STMT is suitable for a
3657 gather load or scatter store. Describe the operation in *INFO if so. */
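/* A typical candidate (illustrative example, not taken from the sources)
   is a read whose index is itself loaded in the loop:

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   The access to data[] has no affine evolution in I, so it cannot be
   handled as a contiguous or strided access, but it can be expressed as
   a gather from &data[0] with the vector of idx values as offsets.  */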
3659 bool
3660 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3661 gather_scatter_info *info)
3663 HOST_WIDE_INT scale = 1;
3664 poly_int64 pbitpos, pbitsize;
3665 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3666 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3667 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3668 tree offtype = NULL_TREE;
3669 tree decl = NULL_TREE, base, off;
3670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3671 tree memory_type = TREE_TYPE (DR_REF (dr));
3672 machine_mode pmode;
3673 int punsignedp, reversep, pvolatilep = 0;
3674 internal_fn ifn;
3675 tree element_type;
3676 bool masked_p = false;
3678 /* See whether this is already a call to a gather/scatter internal function.
3679 If not, see whether it's a masked load or store. */
3680 gcall *call = dyn_cast <gcall *> (stmt);
3681 if (call && gimple_call_internal_p (call))
3683 ifn = gimple_call_internal_fn (stmt);
3684 if (internal_gather_scatter_fn_p (ifn))
3686 vect_describe_gather_scatter_call (call, info);
3687 return true;
3689 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3692 /* True if we should aim to use internal functions rather than
3693 built-in functions. */
3694 bool use_ifn_p = (DR_IS_READ (dr)
3695 ? supports_vec_gather_load_p ()
3696 : supports_vec_scatter_store_p ());
3698 base = DR_REF (dr);
3699 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3700 see if we can use the def stmt of the address. */
3701 if (masked_p
3702 && TREE_CODE (base) == MEM_REF
3703 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3704 && integer_zerop (TREE_OPERAND (base, 1))
3705 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3707 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3708 if (is_gimple_assign (def_stmt)
3709 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3710 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3713 /* The gather and scatter builtins need address of the form
3714 loop_invariant + vector * {1, 2, 4, 8}
3716 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3717 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3718 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3719 multiplications and additions in it. To get a vector, we need
3720 a single SSA_NAME that will be defined in the loop and will
3721 contain everything that is not loop invariant and that can be
3722 vectorized. The following code attempts to find such a preexisting
3723 SSA_NAME OFF and put the loop invariants into a tree BASE
3724 that can be gimplified before the loop. */
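   /* As a concrete (hypothetical) example of that decomposition: for a
      reference like p->a[idx[i]] with loop-invariant P, the aim is

        BASE  = (sizetype) p + <byte offset of field a>
        OFF   = idx[i]
        SCALE = sizeof (p->a[0])   (peeled off a MULT_EXPR below)

      so that the vectorized offset is simply the vector of IDX values.  */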
3725 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3726 &punsignedp, &reversep, &pvolatilep);
3727 gcc_assert (base && !reversep);
3728 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3730 if (TREE_CODE (base) == MEM_REF)
3732 if (!integer_zerop (TREE_OPERAND (base, 1)))
3734 if (off == NULL_TREE)
3735 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3736 else
3737 off = size_binop (PLUS_EXPR, off,
3738 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3740 base = TREE_OPERAND (base, 0);
3742 else
3743 base = build_fold_addr_expr (base);
3745 if (off == NULL_TREE)
3746 off = size_zero_node;
3748 /* If base is not loop invariant, then if off is 0 we start with just
3749 the constant offset in the loop invariant BASE and continue with base
3750 as OFF; otherwise give up.
3751 We could handle that case by gimplifying the addition of base + off
3752 into some SSA_NAME and use that as off, but for now punt. */
3753 if (!expr_invariant_in_loop_p (loop, base))
3755 if (!integer_zerop (off))
3756 return false;
3757 off = base;
3758 base = size_int (pbytepos);
3760 /* Otherwise put base + constant offset into the loop invariant BASE
3761 and continue with OFF. */
3762 else
3764 base = fold_convert (sizetype, base);
3765 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3768 /* OFF at this point may be either a SSA_NAME or some tree expression
3769 from get_inner_reference. Try to peel off loop invariants from it
3770 into BASE as long as possible. */
3771 STRIP_NOPS (off);
3772 while (offtype == NULL_TREE)
3774 enum tree_code code;
3775 tree op0, op1, add = NULL_TREE;
3777 if (TREE_CODE (off) == SSA_NAME)
3779 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3781 if (expr_invariant_in_loop_p (loop, off))
3782 return false;
3784 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3785 break;
3787 op0 = gimple_assign_rhs1 (def_stmt);
3788 code = gimple_assign_rhs_code (def_stmt);
3789 op1 = gimple_assign_rhs2 (def_stmt);
3791 else
3793 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3794 return false;
3795 code = TREE_CODE (off);
3796 extract_ops_from_tree (off, &code, &op0, &op1);
3798 switch (code)
3800 case POINTER_PLUS_EXPR:
3801 case PLUS_EXPR:
3802 if (expr_invariant_in_loop_p (loop, op0))
3804 add = op0;
3805 off = op1;
3806 do_add:
3807 add = fold_convert (sizetype, add);
3808 if (scale != 1)
3809 add = size_binop (MULT_EXPR, add, size_int (scale));
3810 base = size_binop (PLUS_EXPR, base, add);
3811 continue;
3813 if (expr_invariant_in_loop_p (loop, op1))
3815 add = op1;
3816 off = op0;
3817 goto do_add;
3819 break;
3820 case MINUS_EXPR:
3821 if (expr_invariant_in_loop_p (loop, op1))
3823 add = fold_convert (sizetype, op1);
3824 add = size_binop (MINUS_EXPR, size_zero_node, add);
3825 off = op0;
3826 goto do_add;
3828 break;
3829 case MULT_EXPR:
3830 if (scale == 1 && tree_fits_shwi_p (op1))
3832 int new_scale = tree_to_shwi (op1);
3833 /* Only treat this as a scaling operation if the target
3834 supports it. */
3835 if (use_ifn_p
3836 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3837 vectype, memory_type, 1,
3838 TYPE_SIGN (TREE_TYPE (op0)),
3839 new_scale, &ifn,
3840 &element_type))
3841 break;
3842 scale = new_scale;
3843 off = op0;
3844 continue;
3846 break;
3847 case SSA_NAME:
3848 off = op0;
3849 continue;
3850 CASE_CONVERT:
3851 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3852 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3853 break;
3854 if (TYPE_PRECISION (TREE_TYPE (op0))
3855 == TYPE_PRECISION (TREE_TYPE (off)))
3857 off = op0;
3858 continue;
3861 /* The internal functions need the offset to be the same width
3862 as the elements of VECTYPE. Don't include operations that
3863 cast the offset from that width to a different width. */
3864 if (use_ifn_p
3865 && (int_size_in_bytes (TREE_TYPE (vectype))
3866 == int_size_in_bytes (TREE_TYPE (off))))
3867 break;
3869 if (TYPE_PRECISION (TREE_TYPE (op0))
3870 < TYPE_PRECISION (TREE_TYPE (off)))
3872 off = op0;
3873 offtype = TREE_TYPE (off);
3874 STRIP_NOPS (off);
3875 continue;
3877 break;
3878 default:
3879 break;
3881 break;
3884 /* If at the end OFF still isn't a SSA_NAME or isn't
3885 defined in the loop, punt. */
3886 if (TREE_CODE (off) != SSA_NAME
3887 || expr_invariant_in_loop_p (loop, off))
3888 return false;
3890 if (offtype == NULL_TREE)
3891 offtype = TREE_TYPE (off);
3893 if (use_ifn_p)
3895 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3896 memory_type, TYPE_PRECISION (offtype),
3897 TYPE_SIGN (offtype), scale, &ifn,
3898 &element_type))
3899 return false;
3901 else
3903 if (DR_IS_READ (dr))
3905 if (targetm.vectorize.builtin_gather)
3906 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3908 else
3910 if (targetm.vectorize.builtin_scatter)
3911 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3914 if (!decl)
3915 return false;
3917 ifn = IFN_LAST;
3918 element_type = TREE_TYPE (vectype);
3921 info->ifn = ifn;
3922 info->decl = decl;
3923 info->base = base;
3924 info->offset = off;
3925 info->offset_dt = vect_unknown_def_type;
3926 info->offset_vectype = NULL_TREE;
3927 info->scale = scale;
3928 info->element_type = element_type;
3929 info->memory_type = memory_type;
3930 return true;
3933 /* Function vect_analyze_data_refs.
3935 Find all the data references in the loop or basic block.
3937 The general structure of the analysis of data refs in the vectorizer is as
3938 follows:
3939 1- vect_analyze_data_refs(loop/bb): call
3940 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3941 in the loop/bb and their dependences.
3942 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3943 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3944 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3948 bool
3949 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3951 struct loop *loop = NULL;
3952 unsigned int i;
3953 struct data_reference *dr;
3954 tree scalar_type;
3956 if (dump_enabled_p ())
3957 dump_printf_loc (MSG_NOTE, vect_location,
3958 "=== vect_analyze_data_refs ===\n");
3960 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3961 loop = LOOP_VINFO_LOOP (loop_vinfo);
3963 /* Go through the data-refs, check that the analysis succeeded. Update
3964 pointer from stmt_vec_info struct to DR and vectype. */
3966 vec<data_reference_p> datarefs = vinfo->datarefs;
3967 FOR_EACH_VEC_ELT (datarefs, i, dr)
3969 gimple *stmt;
3970 stmt_vec_info stmt_info;
3971 tree base, offset, init;
3972 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3973 bool simd_lane_access = false;
3974 poly_uint64 vf;
3976 again:
3977 if (!dr || !DR_REF (dr))
3979 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3981 "not vectorized: unhandled data-ref\n");
3982 return false;
3985 stmt = DR_STMT (dr);
3986 stmt_info = vinfo_for_stmt (stmt);
3988 /* Discard clobbers from the dataref vector. We will remove
3989 clobber stmts during vectorization. */
3990 if (gimple_clobber_p (stmt))
3992 free_data_ref (dr);
3993 if (i == datarefs.length () - 1)
3995 datarefs.pop ();
3996 break;
3998 datarefs.ordered_remove (i);
3999 dr = datarefs[i];
4000 goto again;
4003 /* Check that analysis of the data-ref succeeded. */
4004 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4005 || !DR_STEP (dr))
4007 bool maybe_gather
4008 = DR_IS_READ (dr)
4009 && !TREE_THIS_VOLATILE (DR_REF (dr))
4010 && (targetm.vectorize.builtin_gather != NULL
4011 || supports_vec_gather_load_p ());
4012 bool maybe_scatter
4013 = DR_IS_WRITE (dr)
4014 && !TREE_THIS_VOLATILE (DR_REF (dr))
4015 && (targetm.vectorize.builtin_scatter != NULL
4016 || supports_vec_scatter_store_p ());
4017 bool maybe_simd_lane_access
4018 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4020 /* If target supports vector gather loads or scatter stores, or if
4021 this might be a SIMD lane access, see whether they can be used. */
4022 if (is_a <loop_vec_info> (vinfo)
4023 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4024 && !nested_in_vect_loop_p (loop, stmt))
4026 struct data_reference *newdr
4027 = create_data_ref (NULL, loop_containing_stmt (stmt),
4028 DR_REF (dr), stmt, !maybe_scatter,
4029 DR_IS_CONDITIONAL_IN_STMT (dr));
4030 gcc_assert (newdr != NULL && DR_REF (newdr));
4031 if (DR_BASE_ADDRESS (newdr)
4032 && DR_OFFSET (newdr)
4033 && DR_INIT (newdr)
4034 && DR_STEP (newdr)
4035 && integer_zerop (DR_STEP (newdr)))
4037 if (maybe_simd_lane_access)
4039 tree off = DR_OFFSET (newdr);
4040 STRIP_NOPS (off);
4041 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4042 && TREE_CODE (off) == MULT_EXPR
4043 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4045 tree step = TREE_OPERAND (off, 1);
4046 off = TREE_OPERAND (off, 0);
4047 STRIP_NOPS (off);
4048 if (CONVERT_EXPR_P (off)
4049 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4050 0)))
4051 < TYPE_PRECISION (TREE_TYPE (off)))
4052 off = TREE_OPERAND (off, 0);
4053 if (TREE_CODE (off) == SSA_NAME)
4055 gimple *def = SSA_NAME_DEF_STMT (off);
4056 tree reft = TREE_TYPE (DR_REF (newdr));
4057 if (is_gimple_call (def)
4058 && gimple_call_internal_p (def)
4059 && (gimple_call_internal_fn (def)
4060 == IFN_GOMP_SIMD_LANE))
4062 tree arg = gimple_call_arg (def, 0);
4063 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4064 arg = SSA_NAME_VAR (arg);
4065 if (arg == loop->simduid
4066 /* For now. */
4067 && tree_int_cst_equal
4068 (TYPE_SIZE_UNIT (reft),
4069 step))
4071 DR_OFFSET (newdr) = ssize_int (0);
4072 DR_STEP (newdr) = step;
4073 DR_OFFSET_ALIGNMENT (newdr)
4074 = BIGGEST_ALIGNMENT;
4075 DR_STEP_ALIGNMENT (newdr)
4076 = highest_pow2_factor (step);
4077 dr = newdr;
4078 simd_lane_access = true;
4084 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4086 dr = newdr;
4087 if (maybe_gather)
4088 gatherscatter = GATHER;
4089 else
4090 gatherscatter = SCATTER;
4093 if (gatherscatter == SG_NONE && !simd_lane_access)
4094 free_data_ref (newdr);
4097 if (gatherscatter == SG_NONE && !simd_lane_access)
4099 if (dump_enabled_p ())
4101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4102 "not vectorized: data ref analysis "
4103 "failed ");
4104 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4107 if (is_a <bb_vec_info> (vinfo))
4108 break;
4110 return false;
4114 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4116 if (dump_enabled_p ())
4117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4118 "not vectorized: base addr of dr is a "
4119 "constant\n");
4121 if (is_a <bb_vec_info> (vinfo))
4122 break;
4124 if (gatherscatter != SG_NONE || simd_lane_access)
4125 free_data_ref (dr);
4126 return false;
4129 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4131 if (dump_enabled_p ())
4133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4134 "not vectorized: volatile type ");
4135 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4138 if (is_a <bb_vec_info> (vinfo))
4139 break;
4141 return false;
4144 if (stmt_can_throw_internal (stmt))
4146 if (dump_enabled_p ())
4148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4149 "not vectorized: statement can throw an "
4150 "exception ");
4151 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4154 if (is_a <bb_vec_info> (vinfo))
4155 break;
4157 if (gatherscatter != SG_NONE || simd_lane_access)
4158 free_data_ref (dr);
4159 return false;
4162 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4163 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4165 if (dump_enabled_p ())
4167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4168 "not vectorized: statement is bitfield "
4169 "access ");
4170 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4173 if (is_a <bb_vec_info> (vinfo))
4174 break;
4176 if (gatherscatter != SG_NONE || simd_lane_access)
4177 free_data_ref (dr);
4178 return false;
4181 base = unshare_expr (DR_BASE_ADDRESS (dr));
4182 offset = unshare_expr (DR_OFFSET (dr));
4183 init = unshare_expr (DR_INIT (dr));
4185 if (is_gimple_call (stmt)
4186 && (!gimple_call_internal_p (stmt)
4187 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4188 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4190 if (dump_enabled_p ())
4192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4193 "not vectorized: dr in a call ");
4194 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4197 if (is_a <bb_vec_info> (vinfo))
4198 break;
4200 if (gatherscatter != SG_NONE || simd_lane_access)
4201 free_data_ref (dr);
4202 return false;
4205 /* Update DR field in stmt_vec_info struct. */
4207 /* If the dataref is in an inner-loop of the loop that is considered for
4208 vectorization, we also want to analyze the access relative to
4209 the outer-loop (DR contains information only relative to the
4210 inner-most enclosing loop). We do that by building a reference to the
4211 first location accessed by the inner-loop, and analyze it relative to
4212 the outer-loop. */
4213 if (loop && nested_in_vect_loop_p (loop, stmt))
4215 /* Build a reference to the first location accessed by the
4216 inner loop: *(BASE + INIT + OFFSET). By construction,
4217 this address must be invariant in the inner loop, so we
4218 can consider it as being used in the outer loop. */
4219 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4220 init, offset);
4221 tree init_addr = fold_build_pointer_plus (base, init_offset);
4222 tree init_ref = build_fold_indirect_ref (init_addr);
4224 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_NOTE, vect_location,
4227 "analyze in outer loop: ");
4228 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4229 dump_printf (MSG_NOTE, "\n");
4232 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4233 init_ref, loop))
4234 /* dr_analyze_innermost already explained the failure. */
4235 return false;
4237 if (dump_enabled_p ())
4239 dump_printf_loc (MSG_NOTE, vect_location,
4240 "\touter base_address: ");
4241 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4242 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4243 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4244 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4245 STMT_VINFO_DR_OFFSET (stmt_info));
4246 dump_printf (MSG_NOTE,
4247 "\n\touter constant offset from base address: ");
4248 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4249 STMT_VINFO_DR_INIT (stmt_info));
4250 dump_printf (MSG_NOTE, "\n\touter step: ");
4251 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4252 STMT_VINFO_DR_STEP (stmt_info));
4253 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4254 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4255 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4256 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4257 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4258 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4259 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4260 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4264 if (STMT_VINFO_DATA_REF (stmt_info))
4266 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4269 "not vectorized: more than one data ref "
4270 "in stmt: ");
4271 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4274 if (is_a <bb_vec_info> (vinfo))
4275 break;
4277 if (gatherscatter != SG_NONE || simd_lane_access)
4278 free_data_ref (dr);
4279 return false;
4282 STMT_VINFO_DATA_REF (stmt_info) = dr;
4283 if (simd_lane_access)
4285 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4286 free_data_ref (datarefs[i]);
4287 datarefs[i] = dr;
4290 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4291 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4292 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4294 if (dump_enabled_p ())
4296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4297 "not vectorized: base object not addressable "
4298 "for stmt: ");
4299 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4301 if (is_a <bb_vec_info> (vinfo))
4303 /* In BB vectorization the ref can still participate
4304 in dependence analysis; we just can't vectorize it. */
4305 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4306 continue;
4308 return false;
4311 /* Set vectype for STMT. */
4312 scalar_type = TREE_TYPE (DR_REF (dr));
4313 STMT_VINFO_VECTYPE (stmt_info)
4314 = get_vectype_for_scalar_type (scalar_type);
4315 if (!STMT_VINFO_VECTYPE (stmt_info))
4317 if (dump_enabled_p ())
4319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4320 "not vectorized: no vectype for stmt: ");
4321 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4322 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4323 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4324 scalar_type);
4325 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4328 if (is_a <bb_vec_info> (vinfo))
4330 /* Not having a vector type is fine: the ref can still participate
4331 in dependence analysis; we just can't vectorize it. */
4332 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4333 continue;
4336 if (gatherscatter != SG_NONE || simd_lane_access)
4338 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4339 if (gatherscatter != SG_NONE)
4340 free_data_ref (dr);
4342 return false;
4344 else
4346 if (dump_enabled_p ())
4348 dump_printf_loc (MSG_NOTE, vect_location,
4349 "got vectype for stmt: ");
4350 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4351 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4352 STMT_VINFO_VECTYPE (stmt_info));
4353 dump_printf (MSG_NOTE, "\n");
4357 /* Adjust the minimal vectorization factor according to the
4358 vector type. */
4359 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4360 *min_vf = upper_bound (*min_vf, vf);
4362 if (gatherscatter != SG_NONE)
4364 gather_scatter_info gs_info;
4365 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4366 &gs_info)
4367 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4369 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4370 free_data_ref (dr);
4371 if (dump_enabled_p ())
4373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4374 (gatherscatter == GATHER) ?
4375 "not vectorized: not suitable for gather "
4376 "load " :
4377 "not vectorized: not suitable for scatter "
4378 "store ");
4379 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4381 return false;
4384 free_data_ref (datarefs[i]);
4385 datarefs[i] = dr;
4386 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4389 else if (is_a <loop_vec_info> (vinfo)
4390 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4392 if (nested_in_vect_loop_p (loop, stmt))
4394 if (dump_enabled_p ())
4396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4397 "not vectorized: not suitable for strided "
4398 "load ");
4399 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4401 return false;
4403 STMT_VINFO_STRIDED_P (stmt_info) = true;
4407 /* If we stopped analysis at the first dataref we could not analyze
4409 when trying to vectorize a basic-block, mark the rest of the datarefs
4409 as not vectorizable and truncate the vector of datarefs. That
4410 avoids spending useless time in analyzing their dependence. */
4411 if (i != datarefs.length ())
4413 gcc_assert (is_a <bb_vec_info> (vinfo));
4414 for (unsigned j = i; j < datarefs.length (); ++j)
4416 data_reference_p dr = datarefs[j];
4417 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4418 free_data_ref (dr);
4420 datarefs.truncate (i);
4423 return true;
4427 /* Function vect_get_new_vect_var.
4429 Returns a name for a new variable. The current naming scheme appends the
4430 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4431 the name of vectorizer generated variables, and appends that to NAME if
4432 provided. */
4434 tree
4435 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4437 const char *prefix;
4438 tree new_vect_var;
4440 switch (var_kind)
4442 case vect_simple_var:
4443 prefix = "vect";
4444 break;
4445 case vect_scalar_var:
4446 prefix = "stmp";
4447 break;
4448 case vect_mask_var:
4449 prefix = "mask";
4450 break;
4451 case vect_pointer_var:
4452 prefix = "vectp";
4453 break;
4454 default:
4455 gcc_unreachable ();
4458 if (name)
4460 char* tmp = concat (prefix, "_", name, NULL);
4461 new_vect_var = create_tmp_reg (type, tmp);
4462 free (tmp);
4464 else
4465 new_vect_var = create_tmp_reg (type, prefix);
4467 return new_vect_var;
4470 /* Like vect_get_new_vect_var but return an SSA name. */
4472 tree
4473 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4475 const char *prefix;
4476 tree new_vect_var;
4478 switch (var_kind)
4480 case vect_simple_var:
4481 prefix = "vect";
4482 break;
4483 case vect_scalar_var:
4484 prefix = "stmp";
4485 break;
4486 case vect_pointer_var:
4487 prefix = "vectp";
4488 break;
4489 default:
4490 gcc_unreachable ();
4493 if (name)
4495 char* tmp = concat (prefix, "_", name, NULL);
4496 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4497 free (tmp);
4499 else
4500 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4502 return new_vect_var;
4505 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4507 static void
4508 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4510 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4511 int misalign = DR_MISALIGNMENT (dr);
4512 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4513 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4514 else
4515 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4516 DR_TARGET_ALIGNMENT (dr), misalign);
4519 /* Function vect_create_addr_base_for_vector_ref.
4521 Create an expression that computes the address of the first memory location
4522 that will be accessed for a data reference.
4524 Input:
4525 STMT: The statement containing the data reference.
4526 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4527 OFFSET: Optional. If supplied, it is added to the initial address.
4528 LOOP: Specify relative to which loop-nest should the address be computed.
4529 For example, when the dataref is in an inner-loop nested in an
4530 outer-loop that is now being vectorized, LOOP can be either the
4531 outer-loop, or the inner-loop. The first memory location accessed
4532 by the following dataref ('in' points to short):
4534 for (i=0; i<N; i++)
4535 for (j=0; j<M; j++)
4536 s += in[i+j]
4538 is as follows:
4539 if LOOP=i_loop: &in (relative to i_loop)
4540 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4541 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4542 initial address. Unlike OFFSET, which is the number of elements to
4543 be added, BYTE_OFFSET is measured in bytes.
4545 Output:
4546 1. Return an SSA_NAME whose value is the address of the memory location of
4547 the first vector of the data reference.
4548 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4549 these statement(s) which define the returned SSA_NAME.
4551 FORNOW: We are only handling array accesses with step 1. */
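/* A worked sketch (illustrative only), continuing the 'in' example above
   with OFFSET = 4 and BYTE_OFFSET = NULL: since 'in' points to short, the
   code emitted in the preheader is roughly

     vectp_in.0 = &in + (<DR offset> + <DR init> + 4 * 2);

   i.e. OFFSET is scaled by the element size (2 bytes here), whereas a
   BYTE_OFFSET, had one been supplied, would be added unscaled.  The name
   vectp_in.0 just follows the vect_get_new_vect_var naming scheme; the
   exact folded form of the expression may differ.  */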
4553 tree
4554 vect_create_addr_base_for_vector_ref (gimple *stmt,
4555 gimple_seq *new_stmt_list,
4556 tree offset,
4557 tree byte_offset)
4559 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4560 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4561 const char *base_name;
4562 tree addr_base;
4563 tree dest;
4564 gimple_seq seq = NULL;
4565 tree vect_ptr_type;
4566 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4567 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4568 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4570 tree data_ref_base = unshare_expr (drb->base_address);
4571 tree base_offset = unshare_expr (drb->offset);
4572 tree init = unshare_expr (drb->init);
4574 if (loop_vinfo)
4575 base_name = get_name (data_ref_base);
4576 else
4578 base_offset = ssize_int (0);
4579 init = ssize_int (0);
4580 base_name = get_name (DR_REF (dr));
4583 /* Create base_offset */
4584 base_offset = size_binop (PLUS_EXPR,
4585 fold_convert (sizetype, base_offset),
4586 fold_convert (sizetype, init));
4588 if (offset)
4590 offset = fold_build2 (MULT_EXPR, sizetype,
4591 fold_convert (sizetype, offset), step);
4592 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4593 base_offset, offset);
4595 if (byte_offset)
4597 byte_offset = fold_convert (sizetype, byte_offset);
4598 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4599 base_offset, byte_offset);
4602 /* base + base_offset */
4603 if (loop_vinfo)
4604 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4605 else
4607 addr_base = build1 (ADDR_EXPR,
4608 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4609 unshare_expr (DR_REF (dr)));
4612 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4613 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4614 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4615 gimple_seq_add_seq (new_stmt_list, seq);
4617 if (DR_PTR_INFO (dr)
4618 && TREE_CODE (addr_base) == SSA_NAME
4619 && !SSA_NAME_PTR_INFO (addr_base))
4621 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4622 if (offset || byte_offset)
4623 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4626 if (dump_enabled_p ())
4628 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4629 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4630 dump_printf (MSG_NOTE, "\n");
4633 return addr_base;
4637 /* Function vect_create_data_ref_ptr.
4639 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4640 location accessed in the loop by STMT, along with the def-use update
4641 chain to appropriately advance the pointer through the loop iterations.
4642 Also set aliasing information for the pointer. This pointer is used by
4643 the callers to this function to create a memory reference expression for
4644 vector load/store access.
4646 Input:
4647 1. STMT: a stmt that references memory. Expected to be of the form
4648 GIMPLE_ASSIGN <name, data-ref> or
4649 GIMPLE_ASSIGN <data-ref, name>.
4650 2. AGGR_TYPE: the type of the reference, which should be either a vector
4651 or an array.
4652 3. AT_LOOP: the loop where the vector memref is to be created.
4653 4. OFFSET (optional): an offset to be added to the initial address accessed
4654 by the data-ref in STMT.
4655 5. BSI: location where the new stmts are to be placed if there is no loop
4656 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4657 pointing to the initial address.
4658 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4659 to the initial address accessed by the data-ref in STMT. This is
4660 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4661 in bytes.
4662 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4663 to the IV during each iteration of the loop. NULL says to move
4664 by one copy of AGGR_TYPE up or down, depending on the step of the
4665 data reference.
4667 Output:
4668 1. Declare a new ptr to vector_type, and have it point to the base of the
4669 data reference (initial address accessed by the data reference).
4670 For example, for vector of type V8HI, the following code is generated:
4672 v8hi *ap;
4673 ap = (v8hi *)initial_address;
4675 if OFFSET is not supplied:
4676 initial_address = &a[init];
4677 if OFFSET is supplied:
4678 initial_address = &a[init + OFFSET];
4679 if BYTE_OFFSET is supplied:
4680 initial_address = &a[init] + BYTE_OFFSET;
4682 Return the initial_address in INITIAL_ADDRESS.
4684 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4685 update the pointer in each iteration of the loop.
4687 Return the increment stmt that updates the pointer in PTR_INCR.
4689 3. Set INV_P to true if the access pattern of the data reference in the
4690 vectorized loop is invariant. Set it to false otherwise.
4692 4. Return the pointer. */
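/* Schematic caller-side use (simplified; the real callers are the
   load/store vectorization routines and the variable names are made up):

     dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
                                   &init_addr, gsi, &ptr_incr,
                                   false, &inv_p, NULL_TREE, NULL_TREE);
     ...
     data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
                             build_int_cst (ref_type, 0));
     ...
     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, bump);

   Each copy of the vectorized access dereferences DATAREF_PTR and then
   bumps it; PTR_INCR keeps the cross-iteration update consistent.  */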
4694 tree
4695 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4696 tree offset, tree *initial_address,
4697 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4698 bool only_init, bool *inv_p, tree byte_offset,
4699 tree iv_step)
4701 const char *base_name;
4702 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4703 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4704 struct loop *loop = NULL;
4705 bool nested_in_vect_loop = false;
4706 struct loop *containing_loop = NULL;
4707 tree aggr_ptr_type;
4708 tree aggr_ptr;
4709 tree new_temp;
4710 gimple_seq new_stmt_list = NULL;
4711 edge pe = NULL;
4712 basic_block new_bb;
4713 tree aggr_ptr_init;
4714 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4715 tree aptr;
4716 gimple_stmt_iterator incr_gsi;
4717 bool insert_after;
4718 tree indx_before_incr, indx_after_incr;
4719 gimple *incr;
4720 tree step;
4721 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4723 gcc_assert (iv_step != NULL_TREE
4724 || TREE_CODE (aggr_type) == ARRAY_TYPE
4725 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4727 if (loop_vinfo)
4729 loop = LOOP_VINFO_LOOP (loop_vinfo);
4730 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4731 containing_loop = (gimple_bb (stmt))->loop_father;
4732 pe = loop_preheader_edge (loop);
4734 else
4736 gcc_assert (bb_vinfo);
4737 only_init = true;
4738 *ptr_incr = NULL;
4741 /* Check the step (evolution) of the load in LOOP, and record
4742 whether it's invariant. */
4743 step = vect_dr_behavior (dr)->step;
4744 if (integer_zerop (step))
4745 *inv_p = true;
4746 else
4747 *inv_p = false;
4749 /* Create an expression for the first address accessed by this load
4750 in LOOP. */
4751 base_name = get_name (DR_BASE_ADDRESS (dr));
4753 if (dump_enabled_p ())
4755 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4756 dump_printf_loc (MSG_NOTE, vect_location,
4757 "create %s-pointer variable to type: ",
4758 get_tree_code_name (TREE_CODE (aggr_type)));
4759 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4760 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4761 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4762 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4763 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4764 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4765 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4766 else
4767 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4768 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4769 dump_printf (MSG_NOTE, "\n");
4772 /* (1) Create the new aggregate-pointer variable.
4773 Vector and array types inherit the alias set of their component
4774 type by default so we need to use a ref-all pointer if the data
4775 reference does not conflict with the created aggregated data
4776 reference because it is not addressable. */
4777 bool need_ref_all = false;
4778 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4779 get_alias_set (DR_REF (dr))))
4780 need_ref_all = true;
4781 /* Likewise for any of the data references in the stmt group. */
4782 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4784 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4787 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4788 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4789 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4790 get_alias_set (DR_REF (sdr))))
4792 need_ref_all = true;
4793 break;
4795 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4797 while (orig_stmt);
4799 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4800 need_ref_all);
4801 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4804 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4805 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4806 def-use update cycles for the pointer: one relative to the outer-loop
4807 (LOOP), which is what steps (3) and (4) below do. The other is relative
4808 to the inner-loop (which is the inner-most loop containing the dataref),
4809 and this is done by step (5) below.
4811 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4812 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4813 redundant. Steps (3),(4) create the following:
4815 vp0 = &base_addr;
4816 LOOP: vp1 = phi(vp0,vp2)
4819 vp2 = vp1 + step
4820 goto LOOP
4822 If there is an inner-loop nested in loop, then step (5) will also be
4823 applied, and an additional update in the inner-loop will be created:
4825 vp0 = &base_addr;
4826 LOOP: vp1 = phi(vp0,vp2)
4828 inner: vp3 = phi(vp1,vp4)
4829 vp4 = vp3 + inner_step
4830 if () goto inner
4832 vp2 = vp1 + step
4833 if () goto LOOP */
4835 /* (2) Calculate the initial address of the aggregate-pointer, and set
4836 the aggregate-pointer to point to it before the loop. */
4838 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4840 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4841 offset, byte_offset);
4842 if (new_stmt_list)
4844 if (pe)
4846 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4847 gcc_assert (!new_bb);
4849 else
4850 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4853 *initial_address = new_temp;
4854 aggr_ptr_init = new_temp;
4856 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4857 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4858 inner-loop nested in LOOP (during outer-loop vectorization). */
4860 /* No update in loop is required. */
4861 if (only_init && (!loop_vinfo || at_loop == loop))
4862 aptr = aggr_ptr_init;
4863 else
4865 if (iv_step == NULL_TREE)
4867 /* The step of the aggregate pointer is the type size. */
4868 iv_step = TYPE_SIZE_UNIT (aggr_type);
4869 /* One exception to the above is when the scalar step of the load in
4870 LOOP is zero. In this case the step here is also zero. */
4871 if (*inv_p)
4872 iv_step = size_zero_node;
4873 else if (tree_int_cst_sgn (step) == -1)
4874 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4877 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4879 create_iv (aggr_ptr_init,
4880 fold_convert (aggr_ptr_type, iv_step),
4881 aggr_ptr, loop, &incr_gsi, insert_after,
4882 &indx_before_incr, &indx_after_incr);
4883 incr = gsi_stmt (incr_gsi);
4884 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4886 /* Copy the points-to information if it exists. */
4887 if (DR_PTR_INFO (dr))
4889 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4890 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4892 if (ptr_incr)
4893 *ptr_incr = incr;
4895 aptr = indx_before_incr;
4898 if (!nested_in_vect_loop || only_init)
4899 return aptr;
4902 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4903 nested in LOOP, if exists. */
4905 gcc_assert (nested_in_vect_loop);
4906 if (!only_init)
4908 standard_iv_increment_position (containing_loop, &incr_gsi,
4909 &insert_after);
4910 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4911 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4912 &indx_after_incr);
4913 incr = gsi_stmt (incr_gsi);
4914 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4916 /* Copy the points-to information if it exists. */
4917 if (DR_PTR_INFO (dr))
4919 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4920 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4922 if (ptr_incr)
4923 *ptr_incr = incr;
4925 return indx_before_incr;
4927 else
4928 gcc_unreachable ();
4932 /* Function bump_vector_ptr
4934 Increment a pointer (to a vector type) by vector-size. If requested,
4935 i.e. if PTR-INCR is given, then also connect the new increment stmt
4936 to the existing def-use update-chain of the pointer, by modifying
4937 the PTR_INCR as illustrated below:
4939 The pointer def-use update-chain before this function:
4940 DATAREF_PTR = phi (p_0, p_2)
4941 ....
4942 PTR_INCR: p_2 = DATAREF_PTR + step
4944 The pointer def-use update-chain after this function:
4945 DATAREF_PTR = phi (p_0, p_2)
4946 ....
4947 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4948 ....
4949 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4951 Input:
4952 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4953 in the loop.
4954 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4955 the loop. The increment amount across iterations is expected
4956 to be vector_size.
4957 BSI - location where the new update stmt is to be placed.
4958 STMT - the original scalar memory-access stmt that is being vectorized.
4959 BUMP - optional. The offset by which to bump the pointer. If not given,
4960 the offset is assumed to be vector_size.
4962 Output: Return NEW_DATAREF_PTR as illustrated above.
4966 tree
4967 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4968 gimple *stmt, tree bump)
4970 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4971 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4972 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4973 tree update = TYPE_SIZE_UNIT (vectype);
4974 gassign *incr_stmt;
4975 ssa_op_iter iter;
4976 use_operand_p use_p;
4977 tree new_dataref_ptr;
4979 if (bump)
4980 update = bump;
4982 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4983 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4984 else
4985 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4986 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4987 dataref_ptr, update);
4988 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4990 /* Copy the points-to information if it exists. */
4991 if (DR_PTR_INFO (dr))
4993 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4994 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4997 if (!ptr_incr)
4998 return new_dataref_ptr;
5000 /* Update the vector-pointer's cross-iteration increment. */
5001 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5003 tree use = USE_FROM_PTR (use_p);
5005 if (use == dataref_ptr)
5006 SET_USE (use_p, new_dataref_ptr);
5007 else
5008 gcc_assert (operand_equal_p (use, update, 0));
5011 return new_dataref_ptr;
5015 /* Copy memory reference info such as base/clique from the SRC reference
5016 to the DEST MEM_REF. */
5018 void
5019 vect_copy_ref_info (tree dest, tree src)
5021 if (TREE_CODE (dest) != MEM_REF)
5022 return;
5024 tree src_base = src;
5025 while (handled_component_p (src_base))
5026 src_base = TREE_OPERAND (src_base, 0);
5027 if (TREE_CODE (src_base) != MEM_REF
5028 && TREE_CODE (src_base) != TARGET_MEM_REF)
5029 return;
5031 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5032 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5036 /* Function vect_create_destination_var.
5038 Create a new temporary of type VECTYPE. */
5040 tree
5041 vect_create_destination_var (tree scalar_dest, tree vectype)
5043 tree vec_dest;
5044 const char *name;
5045 char *new_name;
5046 tree type;
5047 enum vect_var_kind kind;
5049 kind = vectype
5050 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5051 ? vect_mask_var
5052 : vect_simple_var
5053 : vect_scalar_var;
5054 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5056 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5058 name = get_name (scalar_dest);
5059 if (name)
5060 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5061 else
5062 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5063 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5064 free (new_name);
5066 return vec_dest;
5069 /* Function vect_grouped_store_supported.
5071 Returns TRUE if interleave high and interleave low permutations
5072 are supported, and FALSE otherwise. */
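/* For instance (illustrative), for eight-element vectors the two
   permutations checked in the power-of-2 case below are

     { 0, 8, 1, 9, 2, 10, 3, 11 }    and
     { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the element selections that vect_permute_store_chain later uses
   as its "interleave high" and "interleave low" masks.  */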
5074 bool
5075 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5077 machine_mode mode = TYPE_MODE (vectype);
5079 /* vect_permute_store_chain requires the group size to be equal to 3 or
5080 be a power of two. */
5081 if (count != 3 && exact_log2 (count) == -1)
5083 if (dump_enabled_p ())
5084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5085 "the size of the group of accesses"
5086 " is not a power of 2 or not eqaul to 3\n");
5087 return false;
5090 /* Check that the permutation is supported. */
5091 if (VECTOR_MODE_P (mode))
5093 unsigned int i;
5094 if (count == 3)
5096 unsigned int j0 = 0, j1 = 0, j2 = 0;
5097 unsigned int i, j;
5099 unsigned int nelt;
5100 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5102 if (dump_enabled_p ())
5103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5104 "cannot handle groups of 3 stores for"
5105 " variable-length vectors\n");
5106 return false;
5109 vec_perm_builder sel (nelt, nelt, 1);
5110 sel.quick_grow (nelt);
5111 vec_perm_indices indices;
5112 for (j = 0; j < 3; j++)
5114 int nelt0 = ((3 - j) * nelt) % 3;
5115 int nelt1 = ((3 - j) * nelt + 1) % 3;
5116 int nelt2 = ((3 - j) * nelt + 2) % 3;
5117 for (i = 0; i < nelt; i++)
5119 if (3 * i + nelt0 < nelt)
5120 sel[3 * i + nelt0] = j0++;
5121 if (3 * i + nelt1 < nelt)
5122 sel[3 * i + nelt1] = nelt + j1++;
5123 if (3 * i + nelt2 < nelt)
5124 sel[3 * i + nelt2] = 0;
5126 indices.new_vector (sel, 2, nelt);
5127 if (!can_vec_perm_const_p (mode, indices))
5129 if (dump_enabled_p ())
5130 dump_printf (MSG_MISSED_OPTIMIZATION,
5131 "permutation op not supported by target.\n");
5132 return false;
5135 for (i = 0; i < nelt; i++)
5137 if (3 * i + nelt0 < nelt)
5138 sel[3 * i + nelt0] = 3 * i + nelt0;
5139 if (3 * i + nelt1 < nelt)
5140 sel[3 * i + nelt1] = 3 * i + nelt1;
5141 if (3 * i + nelt2 < nelt)
5142 sel[3 * i + nelt2] = nelt + j2++;
5144 indices.new_vector (sel, 2, nelt);
5145 if (!can_vec_perm_const_p (mode, indices))
5147 if (dump_enabled_p ())
5148 dump_printf (MSG_MISSED_OPTIMIZATION,
5149 "permutation op not supported by target.\n");
5150 return false;
5153 return true;
5155 else
5157 /* If length is not equal to 3 then only powers of 2 are supported. */
5158 gcc_assert (pow2p_hwi (count));
5159 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5161 /* The encoding has 2 interleaved stepped patterns. */
5162 vec_perm_builder sel (nelt, 2, 3);
5163 sel.quick_grow (6);
5164 for (i = 0; i < 3; i++)
5166 sel[i * 2] = i;
5167 sel[i * 2 + 1] = i + nelt;
5169 vec_perm_indices indices (sel, 2, nelt);
5170 if (can_vec_perm_const_p (mode, indices))
5172 for (i = 0; i < 6; i++)
5173 sel[i] += exact_div (nelt, 2);
5174 indices.new_vector (sel, 2, nelt);
5175 if (can_vec_perm_const_p (mode, indices))
5176 return true;
5181 if (dump_enabled_p ())
5182 dump_printf (MSG_MISSED_OPTIMIZATION,
5183 "permutaion op not supported by target.\n");
5184 return false;
5188 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5189 type VECTYPE. MASKED_P says whether the masked form is needed. */
5191 bool
5192 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5193 bool masked_p)
5195 if (masked_p)
5196 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5197 vec_mask_store_lanes_optab,
5198 vectype, count);
5199 else
5200 return vect_lanes_optab_supported_p ("vec_store_lanes",
5201 vec_store_lanes_optab,
5202 vectype, count);
5206 /* Function vect_permute_store_chain.
5208 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5209 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5210 the data correctly for the stores. Return the final references for stores
5211 in RESULT_CHAIN.
5213 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5214 The input is 4 vectors each containing 8 elements. We assign a number to
5215 each element, the input sequence is:
5217 1st vec: 0 1 2 3 4 5 6 7
5218 2nd vec: 8 9 10 11 12 13 14 15
5219 3rd vec: 16 17 18 19 20 21 22 23
5220 4th vec: 24 25 26 27 28 29 30 31
5222 The output sequence should be:
5224 1st vec: 0 8 16 24 1 9 17 25
5225 2nd vec: 2 10 18 26 3 11 19 27
5226 3rd vec: 4 12 20 28 5 13 21 29
5227 4th vec: 6 14 22 30 7 15 23 31
5229 i.e., we interleave the contents of the four vectors in their order.
5231 We use interleave_high/low instructions to create such output. The input of
5232 each interleave_high/low operation is two vectors:
5233 1st vec 2nd vec
5234 0 1 2 3 4 5 6 7
5235 the even elements of the result vector are obtained left-to-right from the
5236 high/low elements of the first vector. The odd elements of the result are
5237 obtained left-to-right from the high/low elements of the second vector.
5238 The output of interleave_high will be: 0 4 1 5
5239 and of interleave_low: 2 6 3 7
5242 The permutation is done in log LENGTH stages. In each stage interleave_high
5243 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5244 where the first argument is taken from the first half of DR_CHAIN and the
5245 second argument from its second half.
5246 In our example,
5248 I1: interleave_high (1st vec, 3rd vec)
5249 I2: interleave_low (1st vec, 3rd vec)
5250 I3: interleave_high (2nd vec, 4th vec)
5251 I4: interleave_low (2nd vec, 4th vec)
5253 The output for the first stage is:
5255 I1: 0 16 1 17 2 18 3 19
5256 I2: 4 20 5 21 6 22 7 23
5257 I3: 8 24 9 25 10 26 11 27
5258 I4: 12 28 13 29 14 30 15 31
5260 The output of the second stage, i.e. the final result is:
5262 I1: 0 8 16 24 1 9 17 25
5263 I2: 2 10 18 26 3 11 19 27
5264 I3: 4 12 20 28 5 13 21 29
5265 I4: 6 14 22 30 7 15 23 31. */
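/* As a minimal concrete sketch (with made-up SSA names), for LENGTH == 2
   and eight-element vectors the code below emits a single stage:

     vect_inter_high_1 = VEC_PERM_EXPR <v0, v1,
                                        { 0, 8, 1, 9, 2, 10, 3, 11 }>;
     vect_inter_low_2  = VEC_PERM_EXPR <v0, v1,
                                        { 4, 12, 5, 13, 6, 14, 7, 15 }>;

   which together hold V0 and V1 interleaved element by element.  */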
5267 void
5268 vect_permute_store_chain (vec<tree> dr_chain,
5269 unsigned int length,
5270 gimple *stmt,
5271 gimple_stmt_iterator *gsi,
5272 vec<tree> *result_chain)
5274 tree vect1, vect2, high, low;
5275 gimple *perm_stmt;
5276 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5277 tree perm_mask_low, perm_mask_high;
5278 tree data_ref;
5279 tree perm3_mask_low, perm3_mask_high;
5280 unsigned int i, j, n, log_length = exact_log2 (length);
5282 result_chain->quick_grow (length);
5283 memcpy (result_chain->address (), dr_chain.address (),
5284 length * sizeof (tree));
5286 if (length == 3)
5288 /* vect_grouped_store_supported ensures that this is constant. */
5289 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5290 unsigned int j0 = 0, j1 = 0, j2 = 0;
5292 vec_perm_builder sel (nelt, nelt, 1);
5293 sel.quick_grow (nelt);
5294 vec_perm_indices indices;
5295 for (j = 0; j < 3; j++)
5297 int nelt0 = ((3 - j) * nelt) % 3;
5298 int nelt1 = ((3 - j) * nelt + 1) % 3;
5299 int nelt2 = ((3 - j) * nelt + 2) % 3;
5301 for (i = 0; i < nelt; i++)
5303 if (3 * i + nelt0 < nelt)
5304 sel[3 * i + nelt0] = j0++;
5305 if (3 * i + nelt1 < nelt)
5306 sel[3 * i + nelt1] = nelt + j1++;
5307 if (3 * i + nelt2 < nelt)
5308 sel[3 * i + nelt2] = 0;
5310 indices.new_vector (sel, 2, nelt);
5311 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5313 for (i = 0; i < nelt; i++)
5315 if (3 * i + nelt0 < nelt)
5316 sel[3 * i + nelt0] = 3 * i + nelt0;
5317 if (3 * i + nelt1 < nelt)
5318 sel[3 * i + nelt1] = 3 * i + nelt1;
5319 if (3 * i + nelt2 < nelt)
5320 sel[3 * i + nelt2] = nelt + j2++;
5322 indices.new_vector (sel, 2, nelt);
5323 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5325 vect1 = dr_chain[0];
5326 vect2 = dr_chain[1];
5328 /* Create interleaving stmt:
5329 low = VEC_PERM_EXPR <vect1, vect2,
5330 {j, nelt, *, j + 1, nelt + j + 1, *,
5331 j + 2, nelt + j + 2, *, ...}> */
5332 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5333 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5334 vect2, perm3_mask_low);
5335 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5337 vect1 = data_ref;
5338 vect2 = dr_chain[2];
5339 /* Create interleaving stmt:
5340 low = VEC_PERM_EXPR <vect1, vect2,
5341 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5342 6, 7, nelt + j + 2, ...}> */
5343 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5344 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5345 vect2, perm3_mask_high);
5346 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5347 (*result_chain)[j] = data_ref;
5350 else
5352 /* If length is not equal to 3 then only power of 2 is supported. */
5353 gcc_assert (pow2p_hwi (length));
5355 /* The encoding has 2 interleaved stepped patterns. */
5356 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5357 vec_perm_builder sel (nelt, 2, 3);
5358 sel.quick_grow (6);
5359 for (i = 0; i < 3; i++)
5361 sel[i * 2] = i;
5362 sel[i * 2 + 1] = i + nelt;
5364 vec_perm_indices indices (sel, 2, nelt);
5365 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5367 for (i = 0; i < 6; i++)
5368 sel[i] += exact_div (nelt, 2);
5369 indices.new_vector (sel, 2, nelt);
5370 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5372 for (i = 0, n = log_length; i < n; i++)
5374 for (j = 0; j < length/2; j++)
5376 vect1 = dr_chain[j];
5377 vect2 = dr_chain[j+length/2];
5379 /* Create interleaving stmt:
5380 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5381 ...}> */
5382 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5383 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5384 vect2, perm_mask_high);
5385 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5386 (*result_chain)[2*j] = high;
5388 /* Create interleaving stmt:
5389 low = VEC_PERM_EXPR <vect1, vect2,
5390 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5391 ...}> */
5392 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5393 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5394 vect2, perm_mask_low);
5395 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5396 (*result_chain)[2*j+1] = low;
5398 memcpy (dr_chain.address (), result_chain->address (),
5399 length * sizeof (tree));
5404 /* Function vect_setup_realignment
5406 This function is called when vectorizing an unaligned load using
5407 the dr_explicit_realign[_optimized] scheme.
5408 This function generates the following code at the loop prolog:
5410 p = initial_addr;
5411 x msq_init = *(floor(p)); # prolog load
5412 realignment_token = call target_builtin;
5413 loop:
5414 x msq = phi (msq_init, ---)
5416 The stmts marked with x are generated only for the case of
5417 dr_explicit_realign_optimized.
5419 The code above sets up a new (vector) pointer, pointing to the first
5420 location accessed by STMT, and a "floor-aligned" load using that pointer.
5421 It also generates code to compute the "realignment-token" (if the relevant
5422 target hook was defined), and creates a phi-node at the loop-header bb
5423 whose arguments are the result of the prolog-load (created by this
5424 function) and the result of a load that takes place in the loop (to be
5425 created by the caller to this function).
5427 For the case of dr_explicit_realign_optimized:
5428 The caller to this function uses the phi-result (msq) to create the
5429 realignment code inside the loop, and sets up the missing phi argument,
5430 as follows:
5431 loop:
5432 msq = phi (msq_init, lsq)
5433 lsq = *(floor(p')); # load in loop
5434 result = realign_load (msq, lsq, realignment_token);
5436 For the case of dr_explicit_realign:
5437 loop:
5438 msq = *(floor(p)); # load in loop
5439 p' = p + (VS-1);
5440 lsq = *(floor(p')); # load in loop
5441 result = realign_load (msq, lsq, realignment_token);
5443 Input:
5444 STMT - (scalar) load stmt to be vectorized. This load accesses
5445 a memory location that may be unaligned.
5446 BSI - place where new code is to be inserted.
5447 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5448 is used.
5450 Output:
5451 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5452 target hook, if defined.
5453 Return value - the result of the loop-header phi node. */
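/* As an illustration only (not part of the vectorizer; the exact meaning of
   the realignment token is target specific, and realign_load/off/vs below
   are expository names): conceptually, REALIGN_LOAD selects VS consecutive
   elements from the concatenation of the two floor-aligned loads, starting
   at the element offset carried by the token:

     static void
     realign_load (const int *msq, const int *lsq, unsigned off,
                   unsigned vs, int *result)
     {
       for (unsigned i = 0; i < vs; i++)
         result[i] = off + i < vs ? msq[off + i] : lsq[off + i - vs];
     }

   so the unaligned vector at p is assembled from the aligned loads at
   floor(p) and floor(p + VS - 1).  */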
5455 tree
5456 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5457 tree *realignment_token,
5458 enum dr_alignment_support alignment_support_scheme,
5459 tree init_addr,
5460 struct loop **at_loop)
5462 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5463 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5465 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5466 struct loop *loop = NULL;
5467 edge pe = NULL;
5468 tree scalar_dest = gimple_assign_lhs (stmt);
5469 tree vec_dest;
5470 gimple *inc;
5471 tree ptr;
5472 tree data_ref;
5473 basic_block new_bb;
5474 tree msq_init = NULL_TREE;
5475 tree new_temp;
5476 gphi *phi_stmt;
5477 tree msq = NULL_TREE;
5478 gimple_seq stmts = NULL;
5479 bool inv_p;
5480 bool compute_in_loop = false;
5481 bool nested_in_vect_loop = false;
5482 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5483 struct loop *loop_for_initial_load = NULL;
5485 if (loop_vinfo)
5487 loop = LOOP_VINFO_LOOP (loop_vinfo);
5488 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5491 gcc_assert (alignment_support_scheme == dr_explicit_realign
5492 || alignment_support_scheme == dr_explicit_realign_optimized);
5494 /* We need to generate three things:
5495 1. the misalignment computation
5496 2. the extra vector load (for the optimized realignment scheme).
5497 3. the phi node for the two vectors from which the realignment is
5498 done (for the optimized realignment scheme). */
5500 /* 1. Determine where to generate the misalignment computation.
5502 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5503 calculation will be generated by this function, outside the loop (in the
5504 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5505 caller, inside the loop.
5507 Background: If the misalignment remains fixed throughout the iterations of
5508 the loop, then both realignment schemes are applicable, and also the
5509 misalignment computation can be done outside LOOP. This is because we are
5510 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5511 are a multiple of VS (the Vector Size), and therefore the misalignment in
5512 different vectorized LOOP iterations is always the same.
5513 The problem arises only if the memory access is in an inner-loop nested
5514 inside LOOP, which is now being vectorized using outer-loop vectorization.
5515 This is the only case when the misalignment of the memory access may not
5516 remain fixed throughout the iterations of the inner-loop (as explained in
5517 detail in vect_supportable_dr_alignment). In this case, not only is the
5518 optimized realignment scheme not applicable, but also the misalignment
5519 computation (and generation of the realignment token that is passed to
5520 REALIGN_LOAD) have to be done inside the loop.
5522 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5523 or not, which in turn determines if the misalignment is computed inside
5524 the inner-loop, or outside LOOP. */
5526 if (init_addr != NULL_TREE || !loop_vinfo)
5528 compute_in_loop = true;
5529 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5533 /* 2. Determine where to generate the extra vector load.
5535 For the optimized realignment scheme, instead of generating two vector
5536 loads in each iteration, we generate a single extra vector load in the
5537 preheader of the loop, and in each iteration reuse the result of the
5538 vector load from the previous iteration. In case the memory access is in
5539 an inner-loop nested inside LOOP, which is now being vectorized using
5540 outer-loop vectorization, we need to determine whether this initial vector
5541 load should be generated at the preheader of the inner-loop, or can be
5542 generated at the preheader of LOOP. If the memory access has no evolution
5543 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5544 to be generated inside LOOP (in the preheader of the inner-loop). */
5546 if (nested_in_vect_loop)
5548 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5549 bool invariant_in_outerloop =
5550 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5551 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5553 else
5554 loop_for_initial_load = loop;
5555 if (at_loop)
5556 *at_loop = loop_for_initial_load;
5558 if (loop_for_initial_load)
5559 pe = loop_preheader_edge (loop_for_initial_load);
5561 /* 3. For the case of the optimized realignment, create the first vector
5562 load at the loop preheader. */
5564 if (alignment_support_scheme == dr_explicit_realign_optimized)
5566 /* Create msq_init = *(floor(p1)) in the loop preheader */
5567 gassign *new_stmt;
5569 gcc_assert (!compute_in_loop);
5570 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5571 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5572 NULL_TREE, &init_addr, NULL, &inc,
5573 true, &inv_p);
5574 if (TREE_CODE (ptr) == SSA_NAME)
5575 new_temp = copy_ssa_name (ptr);
5576 else
5577 new_temp = make_ssa_name (TREE_TYPE (ptr));
5578 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5579 new_stmt = gimple_build_assign
5580 (new_temp, BIT_AND_EXPR, ptr,
5581 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5582 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5583 gcc_assert (!new_bb);
5584 data_ref
5585 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5586 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5587 vect_copy_ref_info (data_ref, DR_REF (dr));
5588 new_stmt = gimple_build_assign (vec_dest, data_ref);
5589 new_temp = make_ssa_name (vec_dest, new_stmt);
5590 gimple_assign_set_lhs (new_stmt, new_temp);
5591 if (pe)
5593 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5594 gcc_assert (!new_bb);
5596 else
5597 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5599 msq_init = gimple_assign_lhs (new_stmt);
5602 /* 4. Create realignment token using a target builtin, if available.
5603 It is done either inside the containing loop, or before LOOP (as
5604 determined above). */
5606 if (targetm.vectorize.builtin_mask_for_load)
5608 gcall *new_stmt;
5609 tree builtin_decl;
5611 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5612 if (!init_addr)
5614 /* Generate the INIT_ADDR computation outside LOOP. */
5615 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5616 NULL_TREE);
5617 if (loop)
5619 pe = loop_preheader_edge (loop);
5620 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5621 gcc_assert (!new_bb);
5623 else
5624 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5627 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5628 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5629 vec_dest =
5630 vect_create_destination_var (scalar_dest,
5631 gimple_call_return_type (new_stmt));
5632 new_temp = make_ssa_name (vec_dest, new_stmt);
5633 gimple_call_set_lhs (new_stmt, new_temp);
5635 if (compute_in_loop)
5636 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5637 else
5639 /* Generate the misalignment computation outside LOOP. */
5640 pe = loop_preheader_edge (loop);
5641 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5642 gcc_assert (!new_bb);
5645 *realignment_token = gimple_call_lhs (new_stmt);
5647 /* The result of the CALL_EXPR to this builtin is determined from
5648 the value of the parameter and no global variables are touched
5649 which makes the builtin a "const" function. Requiring the
5650 builtin to have the "const" attribute makes it unnecessary
5651 to call mark_call_clobbered. */
5652 gcc_assert (TREE_READONLY (builtin_decl));
5655 if (alignment_support_scheme == dr_explicit_realign)
5656 return msq;
5658 gcc_assert (!compute_in_loop);
5659 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5662 /* 5. Create msq = phi <msq_init, lsq> in loop */
5664 pe = loop_preheader_edge (containing_loop);
5665 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5666 msq = make_ssa_name (vec_dest);
5667 phi_stmt = create_phi_node (msq, containing_loop->header);
5668 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5670 return msq;
5674 /* Function vect_grouped_load_supported.
5676 COUNT is the size of the load group (the number of statements plus the
5677 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5678 only one statement, with a gap of COUNT - 1.
5680 Returns true if a suitable permute exists. */
5682 bool
5683 vect_grouped_load_supported (tree vectype, bool single_element_p,
5684 unsigned HOST_WIDE_INT count)
5686 machine_mode mode = TYPE_MODE (vectype);
5688 /* If this is single-element interleaving with an element distance
5689 that leaves unused vector loads around, punt: we at least create
5690 very sub-optimal code in that case (and blow up memory,
5691 see PR65518). */
5692 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5694 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696 "single-element interleaving not supported "
5697 "for not adjacent vector loads\n");
5698 return false;
5701 /* vect_permute_load_chain requires the group size to be equal to 3 or
5702 be a power of two. */
5703 if (count != 3 && exact_log2 (count) == -1)
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5707 "the size of the group of accesses"
5708 " is not a power of 2 or not equal to 3\n");
5709 return false;
5712 /* Check that the permutation is supported. */
5713 if (VECTOR_MODE_P (mode))
5715 unsigned int i, j;
5716 if (count == 3)
5718 unsigned int nelt;
5719 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5723 "cannot handle groups of 3 loads for"
5724 " variable-length vectors\n");
5725 return false;
5728 vec_perm_builder sel (nelt, nelt, 1);
5729 sel.quick_grow (nelt);
5730 vec_perm_indices indices;
5731 unsigned int k;
5732 for (k = 0; k < 3; k++)
5734 for (i = 0; i < nelt; i++)
5735 if (3 * i + k < 2 * nelt)
5736 sel[i] = 3 * i + k;
5737 else
5738 sel[i] = 0;
5739 indices.new_vector (sel, 2, nelt);
5740 if (!can_vec_perm_const_p (mode, indices))
5742 if (dump_enabled_p ())
5743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5744 "shuffle of 3 loads is not supported by"
5745 " target\n");
5746 return false;
5748 for (i = 0, j = 0; i < nelt; i++)
5749 if (3 * i + k < 2 * nelt)
5750 sel[i] = i;
5751 else
5752 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5753 indices.new_vector (sel, 2, nelt);
5754 if (!can_vec_perm_const_p (mode, indices))
5756 if (dump_enabled_p ())
5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 "shuffle of 3 loads is not supported by"
5759 " target\n");
5760 return false;
5763 return true;
5765 else
5767 /* If length is not equal to 3 then only power of 2 is supported. */
5768 gcc_assert (pow2p_hwi (count));
5769 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5771 /* The encoding has a single stepped pattern. */
5772 vec_perm_builder sel (nelt, 1, 3);
5773 sel.quick_grow (3);
5774 for (i = 0; i < 3; i++)
5775 sel[i] = i * 2;
5776 vec_perm_indices indices (sel, 2, nelt);
5777 if (can_vec_perm_const_p (mode, indices))
5779 for (i = 0; i < 3; i++)
5780 sel[i] = i * 2 + 1;
5781 indices.new_vector (sel, 2, nelt);
5782 if (can_vec_perm_const_p (mode, indices))
5783 return true;
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5790 "extract even/odd not supported by target\n");
5791 return false;
5794 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5795 type VECTYPE. MASKED_P says whether the masked form is needed. */
5797 bool
5798 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5799 bool masked_p)
5801 if (masked_p)
5802 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5803 vec_mask_load_lanes_optab,
5804 vectype, count);
5805 else
5806 return vect_lanes_optab_supported_p ("vec_load_lanes",
5807 vec_load_lanes_optab,
5808 vectype, count);
5811 /* Function vect_permute_load_chain.
5813 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5814 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5815 the input data correctly. Return the final references for loads in
5816 RESULT_CHAIN.
5818 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5819 The input is 4 vectors each containing 8 elements. We assign a number to each
5820 element, the input sequence is:
5822 1st vec: 0 1 2 3 4 5 6 7
5823 2nd vec: 8 9 10 11 12 13 14 15
5824 3rd vec: 16 17 18 19 20 21 22 23
5825 4th vec: 24 25 26 27 28 29 30 31
5827 The output sequence should be:
5829 1st vec: 0 4 8 12 16 20 24 28
5830 2nd vec: 1 5 9 13 17 21 25 29
5831 3rd vec: 2 6 10 14 18 22 26 30
5832 4th vec: 3 7 11 15 19 23 27 31
5834 i.e., the first output vector should contain the first elements of each
5835 interleaving group, etc.
5837 We use extract_even/odd instructions to create such output. The input of
5838 each extract_even/odd operation is two vectors
5839 1st vec 2nd vec
5840 0 1 2 3 4 5 6 7
5842 and the output is the vector of extracted even/odd elements. The output of
5843 extract_even will be: 0 2 4 6
5844 and of extract_odd: 1 3 5 7
5847 The permutation is done in log LENGTH stages. In each stage extract_even
5848 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5849 their order. In our example,
5851 E1: extract_even (1st vec, 2nd vec)
5852 E2: extract_odd (1st vec, 2nd vec)
5853 E3: extract_even (3rd vec, 4th vec)
5854 E4: extract_odd (3rd vec, 4th vec)
5856 The output for the first stage will be:
5858 E1: 0 2 4 6 8 10 12 14
5859 E2: 1 3 5 7 9 11 13 15
5860 E3: 16 18 20 22 24 26 28 30
5861 E4: 17 19 21 23 25 27 29 31
5863 In order to proceed and create the correct sequence for the next stage (or
5864 for the correct output, if the second stage is the last one, as in our
5865 example), we first put the output of extract_even operation and then the
5866 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5867 The input for the second stage is:
5869 1st vec (E1): 0 2 4 6 8 10 12 14
5870 2nd vec (E3): 16 18 20 22 24 26 28 30
5871 3rd vec (E2): 1 3 5 7 9 11 13 15
5872 4th vec (E4): 17 19 21 23 25 27 29 31
5874 The output of the second stage:
5876 E1: 0 4 8 12 16 20 24 28
5877 E2: 2 6 10 14 18 22 26 30
5878 E3: 1 5 9 13 17 21 25 29
5879 E4: 3 7 11 15 19 23 27 31
5881 And RESULT_CHAIN after reordering:
5883 1st vec (E1): 0 4 8 12 16 20 24 28
5884 2nd vec (E3): 1 5 9 13 17 21 25 29
5885 3rd vec (E2): 2 6 10 14 18 22 26 30
5886 4th vec (E4): 3 7 11 15 19 23 27 31. */
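/* As an illustration only (this scalar sketch is not part of the vectorizer;
   extract_even_odd is just an expository name), one extract_even/extract_odd
   stage on two vectors of NELT elements behaves like:

     static void
     extract_even_odd (const int *v1, const int *v2,
                       int *even, int *odd, unsigned nelt)
     {
       for (unsigned i = 0; i < nelt / 2; i++)
         {
           even[i]            = v1[2 * i];
           even[nelt / 2 + i] = v2[2 * i];
           odd[i]             = v1[2 * i + 1];
           odd[nelt / 2 + i]  = v2[2 * i + 1];
         }
     }

   Feeding the 1st/2nd and 3rd/4th input vectors of the example through this
   step gives the first-stage outputs E1..E4 shown above.  */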
5888 static void
5889 vect_permute_load_chain (vec<tree> dr_chain,
5890 unsigned int length,
5891 gimple *stmt,
5892 gimple_stmt_iterator *gsi,
5893 vec<tree> *result_chain)
5895 tree data_ref, first_vect, second_vect;
5896 tree perm_mask_even, perm_mask_odd;
5897 tree perm3_mask_low, perm3_mask_high;
5898 gimple *perm_stmt;
5899 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5900 unsigned int i, j, log_length = exact_log2 (length);
5902 result_chain->quick_grow (length);
5903 memcpy (result_chain->address (), dr_chain.address (),
5904 length * sizeof (tree));
5906 if (length == 3)
5908 /* vect_grouped_load_supported ensures that this is constant. */
5909 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5910 unsigned int k;
5912 vec_perm_builder sel (nelt, nelt, 1);
5913 sel.quick_grow (nelt);
5914 vec_perm_indices indices;
5915 for (k = 0; k < 3; k++)
5917 for (i = 0; i < nelt; i++)
5918 if (3 * i + k < 2 * nelt)
5919 sel[i] = 3 * i + k;
5920 else
5921 sel[i] = 0;
5922 indices.new_vector (sel, 2, nelt);
5923 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5925 for (i = 0, j = 0; i < nelt; i++)
5926 if (3 * i + k < 2 * nelt)
5927 sel[i] = i;
5928 else
5929 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5930 indices.new_vector (sel, 2, nelt);
5931 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5933 first_vect = dr_chain[0];
5934 second_vect = dr_chain[1];
5936 /* Create interleaving stmt (low part of):
5937 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5938 ...}> */
5939 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5940 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5941 second_vect, perm3_mask_low);
5942 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5944 /* Create interleaving stmt (high part of):
5945 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5946 ...}> */
5947 first_vect = data_ref;
5948 second_vect = dr_chain[2];
5949 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5950 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5951 second_vect, perm3_mask_high);
5952 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5953 (*result_chain)[k] = data_ref;
5956 else
5958 /* If length is not equal to 3 then only power of 2 is supported. */
5959 gcc_assert (pow2p_hwi (length));
5961 /* The encoding has a single stepped pattern. */
5962 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5963 vec_perm_builder sel (nelt, 1, 3);
5964 sel.quick_grow (3);
5965 for (i = 0; i < 3; ++i)
5966 sel[i] = i * 2;
5967 vec_perm_indices indices (sel, 2, nelt);
5968 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5970 for (i = 0; i < 3; ++i)
5971 sel[i] = i * 2 + 1;
5972 indices.new_vector (sel, 2, nelt);
5973 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5975 for (i = 0; i < log_length; i++)
5977 for (j = 0; j < length; j += 2)
5979 first_vect = dr_chain[j];
5980 second_vect = dr_chain[j+1];
5982 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5983 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5984 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5985 first_vect, second_vect,
5986 perm_mask_even);
5987 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5988 (*result_chain)[j/2] = data_ref;
5990 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5991 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5992 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5993 first_vect, second_vect,
5994 perm_mask_odd);
5995 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5996 (*result_chain)[j/2+length/2] = data_ref;
5998 memcpy (dr_chain.address (), result_chain->address (),
5999 length * sizeof (tree));
6004 /* Function vect_shift_permute_load_chain.
6006 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6007 sequence of stmts to reorder the input data accordingly.
6008 Return the final references for loads in RESULT_CHAIN.
6009 Return true if successful, false otherwise.
6011 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6012 The input is 3 vectors each containing 8 elements. We assign a
6013 number to each element, the input sequence is:
6015 1st vec: 0 1 2 3 4 5 6 7
6016 2nd vec: 8 9 10 11 12 13 14 15
6017 3rd vec: 16 17 18 19 20 21 22 23
6019 The output sequence should be:
6021 1st vec: 0 3 6 9 12 15 18 21
6022 2nd vec: 1 4 7 10 13 16 19 22
6023 3rd vec: 2 5 8 11 14 17 20 23
6025 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6027 First we shuffle all 3 vectors to get correct elements order:
6029 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6030 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6031 3rd vec: (16 19 22) (17 20 23) (18 21)
6033 Next we unite and shift vector 3 times:
6035 1st step:
6036 shift right by 6 the concatenation of:
6037 "1st vec" and "2nd vec"
6038 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6039 "2nd vec" and "3rd vec"
6040 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6041 "3rd vec" and "1st vec"
6042 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6043 | New vectors |
6045 So that now new vectors are:
6047 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6048 2nd vec: (10 13) (16 19 22) (17 20 23)
6049 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6051 2nd step:
6052 shift right by 5 the concatenation of:
6053 "1st vec" and "3rd vec"
6054 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6055 "2nd vec" and "1st vec"
6056 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6057 "3rd vec" and "2nd vec"
6058 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6059 | New vectors |
6061 So that now new vectors are:
6063 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6064 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6065 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6067 3rd step:
6068 shift right by 5 the concatenation of:
6069 "1st vec" and "1st vec"
6070 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6071 shift right by 3 the concatenation of:
6072 "2nd vec" and "2nd vec"
6073 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6074 | New vectors |
6076 So that now all vectors are READY:
6077 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6078 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6079 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6081 This algorithm is faster than the one in vect_permute_load_chain if:
6082 1. "shift of a concatenation" is faster than general permutation.
6083 This is usually so.
6084 2. The TARGET machine can't execute vector instructions in parallel.
6085 This matters because each step of this algorithm depends on the previous one.
6086 The algorithm in vect_permute_load_chain is much more parallel.
6088 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
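/* As an illustration only (not part of the vectorizer; shift_concat is just
   an expository name), the shift masks built below ({4 5 6 7 8 9 10 11},
   {6 7 8 9 10 11 12 13}, ...) all encode the same primitive: select NELT
   consecutive elements of the concatenation V1 | V2 starting at some
   offset OFF.  In scalar C:

     static void
     shift_concat (const int *v1, const int *v2, unsigned off,
                   unsigned nelt, int *result)
     {
       for (unsigned i = 0; i < nelt; i++)
         result[i] = off + i < nelt ? v1[off + i] : v2[off + i - nelt];
     }

   Each "shift right by N the concatenation" step in the description above is
   one such VEC_PERM_EXPR.  */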
6091 static bool
6092 vect_shift_permute_load_chain (vec<tree> dr_chain,
6093 unsigned int length,
6094 gimple *stmt,
6095 gimple_stmt_iterator *gsi,
6096 vec<tree> *result_chain)
6098 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6099 tree perm2_mask1, perm2_mask2, perm3_mask;
6100 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6101 gimple *perm_stmt;
6103 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6104 unsigned int i;
6105 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6106 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6108 unsigned HOST_WIDE_INT nelt, vf;
6109 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6110 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6111 /* Not supported for variable-length vectors. */
6112 return false;
6114 vec_perm_builder sel (nelt, nelt, 1);
6115 sel.quick_grow (nelt);
6117 result_chain->quick_grow (length);
6118 memcpy (result_chain->address (), dr_chain.address (),
6119 length * sizeof (tree));
6121 if (pow2p_hwi (length) && vf > 4)
6123 unsigned int j, log_length = exact_log2 (length);
6124 for (i = 0; i < nelt / 2; ++i)
6125 sel[i] = i * 2;
6126 for (i = 0; i < nelt / 2; ++i)
6127 sel[nelt / 2 + i] = i * 2 + 1;
6128 vec_perm_indices indices (sel, 2, nelt);
6129 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6131 if (dump_enabled_p ())
6132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6133 "shuffle of 2 fields structure is not \
6134 supported by target\n");
6135 return false;
6137 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6139 for (i = 0; i < nelt / 2; ++i)
6140 sel[i] = i * 2 + 1;
6141 for (i = 0; i < nelt / 2; ++i)
6142 sel[nelt / 2 + i] = i * 2;
6143 indices.new_vector (sel, 2, nelt);
6144 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6146 if (dump_enabled_p ())
6147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6148 "shuffle of 2 fields structure is not \
6149 supported by target\n");
6150 return false;
6152 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6154 /* Generating permutation constant to shift all elements.
6155 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6156 for (i = 0; i < nelt; i++)
6157 sel[i] = nelt / 2 + i;
6158 indices.new_vector (sel, 2, nelt);
6159 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6161 if (dump_enabled_p ())
6162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163 "shift permutation is not supported by target\n");
6164 return false;
6166 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6168 /* Generating permutation constant to select vector from 2.
6169 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6170 for (i = 0; i < nelt / 2; i++)
6171 sel[i] = i;
6172 for (i = nelt / 2; i < nelt; i++)
6173 sel[i] = nelt + i;
6174 indices.new_vector (sel, 2, nelt);
6175 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6177 if (dump_enabled_p ())
6178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6179 "select is not supported by target\n");
6180 return false;
6182 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6184 for (i = 0; i < log_length; i++)
6186 for (j = 0; j < length; j += 2)
6188 first_vect = dr_chain[j];
6189 second_vect = dr_chain[j + 1];
6191 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6192 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6193 first_vect, first_vect,
6194 perm2_mask1);
6195 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6196 vect[0] = data_ref;
6198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6200 second_vect, second_vect,
6201 perm2_mask2);
6202 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6203 vect[1] = data_ref;
6205 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6206 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6207 vect[0], vect[1], shift1_mask);
6208 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6209 (*result_chain)[j/2 + length/2] = data_ref;
6211 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6212 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6213 vect[0], vect[1], select_mask);
6214 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6215 (*result_chain)[j/2] = data_ref;
6217 memcpy (dr_chain.address (), result_chain->address (),
6218 length * sizeof (tree));
6220 return true;
6222 if (length == 3 && vf > 2)
6224 unsigned int k = 0, l = 0;
6226 /* Generating permutation constant to get all elements in the right order.
6227 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6228 for (i = 0; i < nelt; i++)
6230 if (3 * k + (l % 3) >= nelt)
6232 k = 0;
6233 l += (3 - (nelt % 3));
6235 sel[i] = 3 * k + (l % 3);
6236 k++;
6238 vec_perm_indices indices (sel, 2, nelt);
6239 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6241 if (dump_enabled_p ())
6242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6243 "shuffle of 3 fields structure is not \
6244 supported by target\n");
6245 return false;
6247 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6249 /* Generating permutation constant to shift all elements.
6250 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6251 for (i = 0; i < nelt; i++)
6252 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6253 indices.new_vector (sel, 2, nelt);
6254 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6256 if (dump_enabled_p ())
6257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 "shift permutation is not supported by target\n");
6259 return false;
6261 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6263 /* Generating permutation constant to shift all elements.
6264 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6265 for (i = 0; i < nelt; i++)
6266 sel[i] = 2 * (nelt / 3) + 1 + i;
6267 indices.new_vector (sel, 2, nelt);
6268 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6270 if (dump_enabled_p ())
6271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6272 "shift permutation is not supported by target\n");
6273 return false;
6275 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6277 /* Generating permutation constant to shift all elements.
6278 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6279 for (i = 0; i < nelt; i++)
6280 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6281 indices.new_vector (sel, 2, nelt);
6282 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6284 if (dump_enabled_p ())
6285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6286 "shift permutation is not supported by target\n");
6287 return false;
6289 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6291 /* Generating permutation constant to shift all elements.
6292 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6293 for (i = 0; i < nelt; i++)
6294 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6295 indices.new_vector (sel, 2, nelt);
6296 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6298 if (dump_enabled_p ())
6299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6300 "shift permutation is not supported by target\n");
6301 return false;
6303 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6305 for (k = 0; k < 3; k++)
6307 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6308 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6309 dr_chain[k], dr_chain[k],
6310 perm3_mask);
6311 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6312 vect[k] = data_ref;
6315 for (k = 0; k < 3; k++)
6317 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6318 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6319 vect[k % 3], vect[(k + 1) % 3],
6320 shift1_mask);
6321 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6322 vect_shift[k] = data_ref;
6325 for (k = 0; k < 3; k++)
6327 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6328 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6329 vect_shift[(4 - k) % 3],
6330 vect_shift[(3 - k) % 3],
6331 shift2_mask);
6332 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6333 vect[k] = data_ref;
6336 (*result_chain)[3 - (nelt % 3)] = vect[2];
6338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6340 vect[0], shift3_mask);
6341 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6342 (*result_chain)[nelt % 3] = data_ref;
6344 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6345 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6346 vect[1], shift4_mask);
6347 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6348 (*result_chain)[0] = data_ref;
6349 return true;
6351 return false;
6354 /* Function vect_transform_grouped_load.
6356 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6357 to perform their permutation and ascribe the result vectorized statements to
6358 the scalar statements.
6361 void
6362 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6363 gimple_stmt_iterator *gsi)
6365 machine_mode mode;
6366 vec<tree> result_chain = vNULL;
6368 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6369 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6370 vectors, that are ready for vector computation. */
6371 result_chain.create (size);
6373 /* If the reassociation width for the vector type is 2 or greater, the target
6374 machine can execute 2 or more vector instructions in parallel. Otherwise
6375 try to get the chain for the load group using vect_shift_permute_load_chain. */
6376 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6377 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6378 || pow2p_hwi (size)
6379 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6380 gsi, &result_chain))
6381 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6382 vect_record_grouped_load_vectors (stmt, result_chain);
6383 result_chain.release ();
6386 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6387 generated as part of the vectorization of STMT. Assign the statement
6388 for each vector to the associated scalar statement. */
6390 void
6391 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6393 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6394 gimple *next_stmt, *new_stmt;
6395 unsigned int i, gap_count;
6396 tree tmp_data_ref;
6398 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6399 Since we scan the chain starting from its first node, their order
6400 corresponds to the order of data-refs in RESULT_CHAIN. */
6401 next_stmt = first_stmt;
6402 gap_count = 1;
6403 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6405 if (!next_stmt)
6406 break;
6408 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6409 code elimination pass later. No need to check for the first stmt in
6410 the group, since it always exists.
6411 GROUP_GAP is the number of steps in elements from the previous
6412 access (if there is no gap GROUP_GAP is 1). We skip loads that
6413 correspond to the gaps. */
6414 if (next_stmt != first_stmt
6415 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6417 gap_count++;
6418 continue;
6421 while (next_stmt)
6423 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6424 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6425 copies, and we put the new vector statement in the first available
6426 RELATED_STMT. */
6427 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6428 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6429 else
6431 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6433 gimple *prev_stmt =
6434 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6435 gimple *rel_stmt =
6436 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6437 while (rel_stmt)
6439 prev_stmt = rel_stmt;
6440 rel_stmt =
6441 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6444 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6445 new_stmt;
6449 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6450 gap_count = 1;
6451 /* If NEXT_STMT accesses the same DR as the previous statement,
6452 put the same TMP_DATA_REF as its vectorized statement; otherwise
6453 get the next data-ref from RESULT_CHAIN. */
6454 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6455 break;
6460 /* Function vect_can_force_dr_alignment_p.
6462 Returns whether the alignment of a DECL can be forced to be aligned
6463 on an ALIGNMENT-bit boundary. */
6465 bool
6466 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6468 if (!VAR_P (decl))
6469 return false;
6471 if (decl_in_symtab_p (decl)
6472 && !symtab_node::get (decl)->can_increase_alignment_p ())
6473 return false;
6475 if (TREE_STATIC (decl))
6476 return (alignment <= MAX_OFILE_ALIGNMENT);
6477 else
6478 return (alignment <= MAX_STACK_ALIGNMENT);
6482 /* Return whether the data reference DR is supported with respect to its
6483 alignment.
6484 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6485 if it is aligned, i.e., check if it is possible to vectorize it with different
6486 alignment. */
6488 enum dr_alignment_support
6489 vect_supportable_dr_alignment (struct data_reference *dr,
6490 bool check_aligned_accesses)
6492 gimple *stmt = DR_STMT (dr);
6493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6494 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6495 machine_mode mode = TYPE_MODE (vectype);
6496 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6497 struct loop *vect_loop = NULL;
6498 bool nested_in_vect_loop = false;
6500 if (aligned_access_p (dr) && !check_aligned_accesses)
6501 return dr_aligned;
6503 /* For now assume all conditional loads/stores support unaligned
6504 access without any special code. */
6505 if (is_gimple_call (stmt)
6506 && gimple_call_internal_p (stmt)
6507 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6508 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6509 return dr_unaligned_supported;
6511 if (loop_vinfo)
6513 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6514 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6517 /* Possibly unaligned access. */
6519 /* We can choose between using the implicit realignment scheme (generating
6520 a misaligned_move stmt) and the explicit realignment scheme (generating
6521 aligned loads with a REALIGN_LOAD). There are two variants to the
6522 explicit realignment scheme: optimized, and unoptimized.
6523 We can optimize the realignment only if the step between consecutive
6524 vector loads is equal to the vector size. Since the vector memory
6525 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6526 is guaranteed that the misalignment amount remains the same throughout the
6527 execution of the vectorized loop. Therefore, we can create the
6528 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6529 at the loop preheader.
6531 However, in the case of outer-loop vectorization, when vectorizing a
6532 memory access in the inner-loop nested within the LOOP that is now being
6533 vectorized, while it is guaranteed that the misalignment of the
6534 vectorized memory access will remain the same in different outer-loop
6535 iterations, it is *not* guaranteed that it will remain the same throughout
6536 the execution of the inner-loop. This is because the inner-loop advances
6537 with the original scalar step (and not in steps of VS). If the inner-loop
6538 step happens to be a multiple of VS, then the misalignment remains fixed
6539 and we can use the optimized realignment scheme. For example:
6541 for (i=0; i<N; i++)
6542 for (j=0; j<M; j++)
6543 s += a[i+j];
6545 When vectorizing the i-loop in the above example, the step between
6546 consecutive vector loads is 1, and so the misalignment does not remain
6547 fixed across the execution of the inner-loop, and the realignment cannot
6548 be optimized (as illustrated in the following pseudo vectorized loop):
6550 for (i=0; i<N; i+=4)
6551 for (j=0; j<M; j++){
6552 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6553 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6554 // (assuming that we start from an aligned address).
6557 We therefore have to use the unoptimized realignment scheme:
6559 for (i=0; i<N; i+=4)
6560 for (j=k; j<M; j+=4)
6561 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6562 // that the misalignment of the initial address is
6563 // 0).
6565 The loop can then be vectorized as follows:
6567 for (k=0; k<4; k++){
6568 rt = get_realignment_token (&vp[k]);
6569 for (i=0; i<N; i+=4){
6570 v1 = vp[i+k];
6571 for (j=k; j<M; j+=4){
6572 v2 = vp[i+j+VS-1];
6573 va = REALIGN_LOAD <v1,v2,rt>;
6574 vs += va;
6575 v1 = v2;
6578 } */
6580 if (DR_IS_READ (dr))
6582 bool is_packed = false;
6583 tree type = (TREE_TYPE (DR_REF (dr)));
6585 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6586 && (!targetm.vectorize.builtin_mask_for_load
6587 || targetm.vectorize.builtin_mask_for_load ()))
6589 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6591 /* If we are doing SLP then the accesses need not have the
6592 same alignment, instead it depends on the SLP group size. */
6593 if (loop_vinfo
6594 && STMT_SLP_TYPE (stmt_info)
6595 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6596 * GROUP_SIZE (vinfo_for_stmt
6597 (GROUP_FIRST_ELEMENT (stmt_info))),
6598 TYPE_VECTOR_SUBPARTS (vectype)))
6600 else if (!loop_vinfo
6601 || (nested_in_vect_loop
6602 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6603 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6604 return dr_explicit_realign;
6605 else
6606 return dr_explicit_realign_optimized;
6608 if (!known_alignment_for_access_p (dr))
6609 is_packed = not_size_aligned (DR_REF (dr));
6611 if (targetm.vectorize.support_vector_misalignment
6612 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6613 /* Can't software pipeline the loads, but can at least do them. */
6614 return dr_unaligned_supported;
6616 else
6618 bool is_packed = false;
6619 tree type = (TREE_TYPE (DR_REF (dr)));
6621 if (!known_alignment_for_access_p (dr))
6622 is_packed = not_size_aligned (DR_REF (dr));
6624 if (targetm.vectorize.support_vector_misalignment
6625 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6626 return dr_unaligned_supported;
6629 /* Unsupported. */
6630 return dr_unaligned_unsupported;